Пример #1
0
    def get_article_text(self, link):
        """Fetch *link* and extract article fields from its HTML.

        Returns a dict with keys ``raw_html``, ``title``, ``author``
        (may be None), optionally ``subtitles``, and ``text`` — or
        ``None`` when the page has no body, no title, or no
        ``art-content`` container.
        """
        logger.debug("Grabbing article %s", link)
        article_html = get_article(link)
        result = {}
        result["raw_html"] = article_html
        article = bs4.BeautifulSoup(article_html)
        if article.body is None:
            return None

        title = article.body.find(class_="article-title")
        # Fix: find() returns None when the node is missing; the original
        # crashed with AttributeError on title.text here.
        if title is None:
            return None
        result["title"] = title.text.strip()

        author = article.body.find(class_="author")
        if author is not None:
            result["author"] = author.text.strip()
        else:
            result["author"] = None

        subtitle = article.body.find(class_="article-flash")
        if subtitle is not None:
            result["subtitles"] = [subtitle.string.strip()]

        content = article.body.find(class_="art-content")
        if content is None:
            return None
        else:
            result["text"] = u" ".join(content.stripped_strings)
            return result
Пример #2
0
    def get_article(self, link):
        """Fetch *link* and extract article fields from its HTML.

        Returns a dict with keys ``raw_html``, optionally ``subtitles``,
        ``author`` (may be None), and ``text`` — or ``None`` when the
        text container is missing.
        """
        logger.debug("Grabbing article %s", link)

        article_html = get_article(link)
        result = {}
        result["raw_html"] = article_html
        article = bs4.BeautifulSoup(article_html)

        # Try to find the subtitle
        subtitle = article.find('font', size=3, color="#ff8000")
        if subtitle is not None and subtitle.find('b') is not None:
            result["subtitles"] = [subtitle.b.text.strip()]

        author = article.find('div', class_="clanekAVTOR")
        if author is not None:
            result["author"] = author.text.strip()
        else:
            result["author"] = None

        text_container = article.find(id="_xclaimwords_wrapper")
        if text_container is None:
            return None

        # Remove all script tags from the text container.
        # Fix: use a plain loop — the original built a throwaway list via a
        # comprehension executed only for its side effects.
        for script in text_container.findAll('script'):
            script.extract()
        result["text"] = u" ".join(text_container.stripped_strings)
        return result
Пример #3
0
    def get_article(self, link):
        """Fetch a 24ur.com article and extract its fields via XPath.

        Returns a dict with keys ``raw_html``, ``subtitles``, ``author``
        (may be None), and ``text`` — or ``None`` when the content div is
        missing or the page is the site's "not found" placeholder.
        """
        logger.debug("Grabbing article %s", link)

        article_html = get_article(link.replace("24ur.com", "www.24ur.com"))
        result = {}
        result["raw_html"] = article_html
        tree = etree.fromstring(article_html, etree.HTMLParser())
        summary = tree.xpath('//div[@class="summary"]/p/text()')
        # NOTE(review): unlike the other scrapers, this stores the unicode()
        # repr of the whole list rather than a list of strings — kept as-is
        # to avoid changing what downstream consumers receive.
        result["subtitles"] = unicode(summary)

        author_texts = tree.xpath("//div[@class='containerLeftSide']/text()")
        author_text = u" ".join(text.strip() for text in author_texts)
        if u"|" in author_text:
            # Fix: slice off the '|' delimiter itself (the original slice
            # started AT rfind('|'), so authors came back as "| Name").
            author = author_text[author_text.rfind('|') + 1:].strip()
        else:
            author = None

        result["author"] = author

        # Elaborate way of getting rid of all script tags and other garbage in this HTML. Looking for
        # a better way.
        content = tree.xpath("//div[@id='content']")
        if len(content) == 0:
            return None

        # Raw string for the regex pattern (fix: avoid the ambiguous "\s" escape).
        text = re.sub(r"\s\s+", " ", bs4.BeautifulSoup(lxml.html.tostring(content[0], encoding="utf-8").decode("utf-8")).get_text())
        result["text"] = text
        if u"Preverite vpisani naslov ali uporabite možnost iskanja po naših straneh." in result["text"]:
            return None
        return result
Пример #4
0
    def get_article(self, link):
        """Fetch *link* and extract article fields via XPath.

        Returns a dict with keys ``raw_html``, ``subtitles``, ``text``,
        and ``author`` (may be None).
        """
        logger.debug("Grabbing article %s", link)
        article_html = get_article(link)
        result = {"raw_html": article_html}

        tree = etree.fromstring(article_html, etree.HTMLParser())

        intro_fragments = tree.xpath('//article/p[@class="uvod"]/text()')
        result["subtitles"] = [fragment.strip() for fragment in intro_fragments]

        # Sometimes they use bodytext for this
        paragraphs = tree.xpath('//article/p[@class="tekst"]//text()')
        if not paragraphs:
            paragraphs = tree.xpath('//article/p[@class="bodytext"]/text()')
        result["text"] = '\n'.join(paragraphs)

        author_parts = tree.xpath('//article/p[@class="bodyslika"]/span/text()')
        if author_parts:
            result["author"] = ' '.join(author_parts).strip()
        else:
            result["author"] = None

        return result
Пример #5
0
    def get_article_text(self, article_id):
        """Fetch an RTVSlo article by numeric ID and extract its fields.

        Returns a dict with keys ``raw_html``, ``title``, ``subtitles``,
        and ``text``.
        """
        logger.debug("[RTVSlo] Grabbing article ID %s", article_id)
        article_html = get_article(self.RTV_ARTICLE_URL + str(article_id))
        result = {"raw_html": article_html}
        soup = bs4.BeautifulSoup(article_html)
        result["title"] = soup.title.text.strip()

        subtitle_divs = soup.find_all("div", class_="subtitle")
        result["subtitles"] = [div.text for div in subtitle_divs]

        # Collapse every <p> into one whitespace-normalised line each.
        paragraphs = soup.find_all("p")
        result["text"] = u"\n".join(
            u" ".join(p.stripped_strings) for p in paragraphs)
        return result
Пример #6
0
    def get_article_text(self, link):
        """Fetch *link* and extract article fields from its HTML.

        Handles both the regular and the opinion ("mnenja") page layouts.
        Returns a dict with keys ``raw_html``, ``title``, ``author``
        (may be None), optionally ``subtitles``, and ``text`` — or
        ``None`` when the page has no title or no known content container.
        """
        logger.debug("Grabbing article %s", link)
        article_html = get_article(link)
        result = {}
        result["raw_html"] = article_html
        article = bs4.BeautifulSoup(article_html)

        title = article.title
        if title is None:
            return None

        result["title"] = title.text.strip()

        # Opinion pieces use a differently-suffixed excerpt id.
        subtitle = article.find(id="EXCERPT", text=True)
        if subtitle is None:
            subtitle = article.find(id="EXCERPT_mnenja", text=True)

        if subtitle is not None:
            result["subtitles"] = [subtitle.text.strip()]

        content_item = article.find(id="D_NEWS")
        if content_item is None:
            content_item = article.find(id="D_NEWS_MNENJA")

        author = article.find(class_="d_author")
        if author is not None:
            result["author"] = author.text.strip()
        else:
            result["author"] = None

        if content_item is not None:
            # Fix: dropped the redundant "if p_item is not None" guard —
            # find_all() never yields None elements.
            text_content = u" ".join(
                p_item.text.strip()
                for p_item in content_item.find_all('p', text=True)
            )
            text_content = text_content.replace("  ", " ")
            result["text"] = text_content
            return result
        else:
            # Fix: logger.warn is a deprecated alias of logger.warning.
            logger.warning("Unknown article content for %s", link)
            return None
Пример #7
0
    def get_article_text(self, article_id):
        """Fetch a Zurnal print-view article by ID and extract its fields.

        Returns a dict with keys ``raw_html``, ``title``, ``author``
        (may be None), and ``text``.
        """
        logger.debug("Grabbing article ID %s", article_id)
        article_html = get_article(self.ZURNAL_PRINT_URL + str(article_id))
        result = {"raw_html": article_html}
        soup = bs4.BeautifulSoup(article_html)
        node = soup.body.find("article")

        result["title"] = node.hgroup.h1.text

        # The byline looks like "Author / extra"; keep only the author part.
        byline = node.find(id="meta_el").find(class_="left").text
        try:
            result["author"] = byline[:byline.index('/')].strip()
        except ValueError:
            # No '/' separator present — author unknown.
            result["author"] = None

        entries = node.find_all("div", class_="entry")
        result["text"] = u" ".join(entries[0].stripped_strings)
        return result
Пример #8
0
    def get_article_text(self, link):
        """Fetch *link* and extract article fields via XPath.

        Tries two page layouts for the author (editorials first, then the
        header byline). Returns a dict with keys ``raw_html``, ``author``
        (may be None), ``subtitles`` (may be None), and ``text``.
        """
        logger.debug("Grabbing article %s", link)
        article_html = get_article(link)
        result = {}
        result["raw_html"] = article_html
        tree = etree.fromstring(article_html, etree.HTMLParser())

        # This is a structure for editorials
        author = None

        # Fix: the original bare "except:" clauses swallowed everything
        # (including KeyboardInterrupt/SystemExit); the lookups below fail
        # with IndexError on missing nodes, AttributeError on None.
        try:
            author = tree.xpath('//article[@id="article"]/div')[1].xpath(
                "./text()")[2].strip()
        except (IndexError, AttributeError):
            try:
                a = tree.xpath('//article[@id="article"]/header/p')[0].xpath(
                    './i/text()')[0].strip()
                if "Avtor:" in a:
                    author = a.replace("Avtor:", "").strip()
            except (IndexError, AttributeError):
                author = None

        result["author"] = author

        try:
            result["subtitles"] = [
                ' '.join(
                    tree.xpath('//article[@id="article"]/header')[0].xpath(
                        './p')[-1].xpath('./text()')).strip()
            ]
        except IndexError:
            result["subtitles"] = None

        # One blank-line-separated paragraph per <p> under the article node.
        result["text"] = '\n\n'.join([
            ' '.join(x).strip()
            for x in map(lambda x: x.xpath('.//text()'),
                         tree.xpath('//article[@id="article"]/p'))
        ]).strip()

        return result
Пример #9
0
    def parse_article(self, article_url):
        """Build an article dict from a (link, feed-data) pair.

        *article_url* is a ``(link, data)`` tuple where *data* already
        carries the parsed feed fields. Returns the article dict, or
        ``None`` when fetching the page fails.
        """
        link, data = article_url
        article = {}

        try:
            article_html = get_article(link)
            article["raw_html"] = article_html
        except Exception:
            # Fix: logger.warn is a deprecated alias of logger.warning;
            # bare "return" made explicit for consistency.
            logger.warning("Failed to parse article %s", link, exc_info=True)
            return None

        article["text"] = data["text"]
        article["title"] = data["title"]
        article["published"] = data["published"]
        article["source"] = "Val202"
        article["source_url"] = link
        article["language"] = "si"
        article["author"] = data["author"]

        # Generate ID from link
        article["id"] = get_sha_hash(data["guid"])
        return article
Пример #10
0
    def get_article_text(self, link):
        """Fetch *link* and extract article fields from its HTML.

        Returns a dict with keys ``raw_html``, ``author`` (may be None),
        optionally ``subtitles``, and ``text`` — or ``None`` when the page
        has no <article> element in its body.
        """
        logger.debug("Grabbing article %s", link)
        article_html = get_article(link)
        result = {"raw_html": article_html}
        soup = bs4.BeautifulSoup(article_html)

        # Author sits inside a <strong> within the source box, when present.
        source_box = soup.body.find(class_="article-source")
        if source_box is not None and source_box.strong is not None:
            result["author"] = source_box.strong.text.strip()
        else:
            result["author"] = None

        intro = soup.body.find('p', class_="intro-box", text=True)
        if intro is not None:
            result["subtitles"] = [intro.text.strip()]

        body = soup.body.article
        if body is None:
            return None
        result["text"] = u" ".join(body.stripped_strings)
        return result