示例#1
0
    def getarticle(self, headline, lines):
        article = Article(headline=headline)
        text = ""
        for line in lines[2:]:
            if len(line) > 2:
                text += "\n" + line

        text = text.replace("-\n", "")
        text = text.replace("  ", " ")
        text = text.replace("\n", " ")

        article.text = text
        date_pattern = re.compile("([0-9]{2,2})\-([0-9]{2,2})\-([0-9]{4,4})")
        result = date_pattern.search(lines[1])
        article.date = date(int(result.group(3)), int(result.group(2)),
                            int(result.group(1)))
        pagenum_pattern = re.compile("\(p.([0-9]+)([0-9\-]+)?\)")
        result = pagenum_pattern.search(lines[1])
        if result:

            article.pagenr = int(result.group(1))

        for h, medium in self.index:
            if article.headline.lower().strip() in h.lower().strip():
                article.set_property("medium", self.get_medium(medium))

        return article
示例#2
0
文件: bzk_pdf.py 项目: amcat/amcat
    def getarticle(self, headline, lines):
        article = Article(headline = headline)
        text = ""
        for line in lines[2:]:
            if len(line) > 2:
                text += "\n" + line

        text = text.replace("-\n","")
        text = text.replace("  "," ")
        text = text.replace("\n"," ")

        article.text = text
        date_pattern = re.compile("([0-9]{2,2})\-([0-9]{2,2})\-([0-9]{4,4})")
        result = date_pattern.search(lines[1])
        article.date = date(
            int(result.group(3)),
            int(result.group(2)),
            int(result.group(1)))
        pagenum_pattern = re.compile("\(p.([0-9]+)([0-9\-]+)?\)")
        result = pagenum_pattern.search(lines[1])
        if result:
            
            article.pagenr = int(result.group(1))

        for h, medium in self.index:
            if article.headline.lower().strip() in h.lower().strip():
                article.set_property("medium", self.get_medium(medium))

        return article
示例#3
0
    def scrape_1(self, _html, t):
        """format of mostly 2013"""
        if "werkmap" in t:
            divs = _html.cssselect("#articleTable div")
        elif "intranet/rss" in t:
            divs = [
                div for div in _html.cssselect("#sort div")
                if "sort_" in div.get('id')
            ]

        for div in divs:
            article = Article(metastring={})
            article.metastring['html'] = div
            article.headline = div.cssselect("#articleTitle")[0].text_content()
            article.text = div.cssselect("#articleIntro")[0]
            articlepage = div.cssselect("#articlePage")
            if articlepage:
                article.pagenr, article.section = self.get_pagenum(
                    articlepage[0].text)

            article.medium = self.get_medium(
                div.cssselect("#sourceTitle")[0].text)
            date_str = div.cssselect("#articleDate")[0].text
            try:
                article.date = readDate(date_str)
            except ValueError:
                log.error(
                    "parsing date \"{date_str}\" failed".format(**locals()))
            else:
                yield article
示例#4
0
文件: bzk_html.py 项目: BBie/amcat
    def scrape_1(self, _html, t):
        """format of mostly 2013"""
        if "werkmap" in t:
            divs = _html.cssselect("#articleTable div")
        elif "intranet/rss" in t:
            divs = [div for div in _html.cssselect("#sort div") if "sort_" in div.get('id')]
        else:
            raise ValueError("Neither 'werkmap' nor 'intranet/rss' in html.")

        for div in divs:
            article = Article(metastring=div.text_content())
            article.headline = div.cssselect("#articleTitle")[0].text_content()
            article.text = div.cssselect("#articleIntro")[0].text_content()
            articlepage = div.cssselect("#articlePage")

            if articlepage:
                article.pagenr, article.section = self.get_pagenum(articlepage[0].text_content())

            article.medium = self.get_medium(div.cssselect("#sourceTitle")[0].text_content())
            date_str = div.cssselect("#articleDate")[0].text_content()

            try:
                article.date = readDate(date_str)
            except ValueError:
                log.error("parsing date \"{date_str}\" failed".format(**locals()))
            else:
                yield article