Example #1
# Assumed imports (not shown in the original snippet); the Newspaper* names
# appear to be local aliases for newspaper3k internals. _BLOCKLEVEL_TAGS and
# _MEANINGLESS_TAGS are tag lists defined elsewhere in the source project.
from html import unescape

from lxml import etree
from newspaper.cleaners import DocumentCleaner as _NewspaperCleaner
from newspaper.configuration import Configuration as NewspaperConfig
from newspaper.extractors import ContentExtractor as NewspaperExtractor
from newspaper.outputformatters import OutputFormatter as NewspaperFormatter


def html_to_article(content, language):
    content = content.strip()
    if not content:
        return ''

    config = NewspaperConfig()
    config.language = language

    doc = config.get_parser().fromstring(content)  # content already stripped above
    if doc is None:
        return ''

    # Append <br><br> after each block-level element so block boundaries
    # become newlines in the extracted text
    for tag in _BLOCKLEVEL_TAGS:
        if tag in _MEANINGLESS_TAGS:
            continue
        for node in doc.xpath('//{}'.format(tag)):
            node.append(etree.Element('br'))
            node.append(etree.Element('br'))

    # Initial cleanup
    cleaner = _NewspaperCleaner(config)
    doc = cleaner.clean(doc)

    # Best node estimation
    extractor = NewspaperExtractor(config)
    top = extractor.calculate_best_node(doc)
    if top is None:
        del doc, cleaner, extractor
        etree.clear_error_log()

        return ''

    top = extractor.post_cleanup(top)

    # Cleanup dummy nodes used for estimation
    for dummy in top.xpath("//p[@newspaper='dummy']"):
        dummy.getparent().remove(dummy)

    # Custom formatting to avoid unnecessary computations
    formatter = NewspaperFormatter(config)
    formatter.top_node = top
    formatter.remove_negativescores_nodes()
    content = formatter.convert_to_html()
    content = str(content).strip()
    content = unescape(content)

    del doc, top, cleaner, extractor, formatter
    etree.clear_error_log()

    return content
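
A minimal usage sketch for the helper above (the HTML sample and the 'en' language code are illustrative):

sample = "<html><body><article><p>First paragraph.</p><p>Second paragraph.</p></article></body></html>"
print(html_to_article(sample, language='en'))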
Example #2
File: testy.py  Project: Xirider/META
# Assumed imports (not shown in the original snippet); WithTagOutputFormatter
# is a project-specific variant of newspaper's OutputFormatter:
from newspaper.cleaners import DocumentCleaner
from newspaper.configuration import Configuration
from newspaper.extractors import ContentExtractor


def fulltext(html, language='en'):
    """Take an article HTML string and return (text, article_html).

    The input string is decoded via UnicodeDammit if needed.
    """

    config = Configuration()
    config.language = language

    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = WithTagOutputFormatter(config)

    doc = config.get_parser().fromstring(html)
    doc = document_cleaner.clean(doc)

    top_node = extractor.calculate_best_node(doc)
    if top_node is None:  # guard: no article body found
        return '', ''

    top_node = extractor.post_cleanup(top_node)
    text, article_html = output_formatter.get_formatted(top_node)
    return text, article_html
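
A minimal usage sketch (fetching a page with requests; the URL is illustrative):

import requests

html = requests.get("https://example.com/some-article").text
text, article_html = fulltext(html, language='en')
print(text[:200])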
Example #3
# Assumed imports (not shown in the original snippet); OnArticleProcessError
# and StoreToDatabase are project-specific helpers (see the stubs below):
from newspaper import fulltext, nlp
from newspaper.configuration import Configuration
from newspaper.extractors import ContentExtractor


def ProcessArticle(urlStr, domain, htmlStr, cursor):
    config = Configuration()
    extractor = ContentExtractor(config)
    clean_doc = config.get_parser().fromstring(htmlStr)
    title = extractor.get_title(clean_doc)
    authors = extractor.get_authors(clean_doc)
    text = fulltext(htmlStr)

    text_keyws = list(nlp.keywords(text).keys())
    title_keyws = list(nlp.keywords(title).keys())

    keyws = list(set(title_keyws + text_keyws))
    summary_sents = nlp.summarize(title=title,
                                  text=text,
                                  max_sents=config.MAX_SUMMARY_SENT)
    summary = '\n'.join(summary_sents)

    if not text:
        OnArticleProcessError(urlStr)
    else:
        StoreToDatabase(urlStr, domain, title, authors, text, keyws, summary,
                        cursor)
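
The two helpers are project-specific and not shown; minimal hypothetical stubs that make the snippet self-contained:

def OnArticleProcessError(urlStr):
    # Hypothetical stub: the source project presumably logs or retries failures.
    print("extraction failed for", urlStr)

def StoreToDatabase(urlStr, domain, title, authors, text, keyws, summary, cursor):
    # Hypothetical stub: the real schema is not shown in the snippet.
    cursor.execute(
        "INSERT INTO articles (url, domain, title, summary) VALUES (%s, %s, %s, %s)",
        (urlStr, domain, title, summary))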
Example #4
# Assumed imports (not shown in the original snippet):
import datetime
import logging

from dateutil import parser as date_parser
from newspaper.cleaners import DocumentCleaner
from newspaper.configuration import Configuration
from newspaper.extractors import ContentExtractor
from newspaper.outputformatters import OutputFormatter
from scrapy.exceptions import DropItem


class ArticleExtractionPipeline(object):
    def __init__(self):
        self.config = Configuration()  # sets meta config for article and parser
        self.parser = self.config.get_parser()
        # extracts info (author, tags, text, etc.) from the parsed article
        self.extractor = ContentExtractor(self.config)
        # cleans unwanted tags and nodes from the DOM
        self.doc_cleaner = DocumentCleaner(self.config)
        # outputs formatted text from parsed xpath nodes
        self.formatter = OutputFormatter(self.config)

    # right now basically only works for RT
    # params: doc is parsed html from self.parser
    def find_date_from_html(self, doc):
        # https://github.com/Webhose/article-date-extractor/blob/master/articleDateExtractor/__init__.py
        candidates = self.parser.getElementsByTag(doc, tag="time")  # add more
        times = []
        for candidate in candidates:
            time_string = candidate.text
            for indicator in ["Edited", "Updated", "Published"]:
                if indicator in time_string:
                    # indicator probably followed by "at" or ":", actual time is after that
                    if "at" in time_string:
                        time_string = time_string.split("at", 1)[1]
                    elif ":" in time_string:
                        time_string = time_string.split(":", 1)[1]
                    break
            time = self.datetime_from_str(time_string)
            if time:
                times.append(time)
        if times:
            return min(times)
        else:
            return None

    def datetime_from_str(self, datetime_string):
        try:
            # Drop tzinfo; naive and offset-aware datetimes can't be compared.
            return date_parser.parse(datetime_string).replace(tzinfo=None)
        except (ValueError, OverflowError, AttributeError, TypeError):
            return None

    # params: doc is parsed html from self.parser
    # TODO: generalize
    def get_date(self, url, doc):
        raw_date = (
            self.extractor.get_publishing_date(url, doc)  # telesur, africanews
            or self.extractor.get_meta_content(
                doc, "meta[name='LastModifiedDate']"
            )  # aljazeera, "Sun, 07 January 2018 18:36:49 GMT"
            or self.extractor.get_meta_content(
                doc, "meta[name='Last-Modified']"
            )  # times of india, "Jan 9, 2018, 05:18 IST"
            or self.extractor.get_meta_content(
                doc, "meta[property='og:updated_time']"
            )  # diplomat, "2018-01-05 23:22:46"
        )
        if raw_date:
            return self.datetime_from_str(raw_date)
        return self.find_date_from_html(doc)

    # params: date is datetime object
    def recent_article(self, date, max_days_elapsed=3):
        return datetime.datetime.now() - date < datetime.timedelta(
            days=max_days_elapsed)

    def process_item(self, item, spider):
        doc = self.parser.fromstring(item["content"])

        item["title"] = self.extractor.get_title(doc)
        item["description"] = self.extractor.get_meta_description(doc)
        item["keywords"] = (
            self.extractor.get_meta_content(doc, "meta[name='news_keywords']")
            or self.extractor.get_meta_keywords(doc))
        item["date"] = self.get_date(item["url"], doc)

        # drop item if the date is missing or older than a week
        if not item["date"] or not self.recent_article(item["date"],
                                                       max_days_elapsed=7):
            raise DropItem("Missing or invalid date for: {}".format(
                item["title"]))

        # clean the DOM and extract the article body
        clean_doc = self.doc_cleaner.clean(doc)
        top_node = self.extractor.calculate_best_node(clean_doc)
        if top_node is None:  # guard: no article body found
            raise DropItem("No article body for: {}".format(item["title"]))
        top_node = self.extractor.post_cleanup(top_node)
        # get_formatted returns (text, html); index 1 would give the article HTML
        item["content"] = self.formatter.get_formatted(top_node)[0]

        # drop item if article too short
        if len(item["content"]) < 600:
            raise DropItem("Not enough text: {}".format(item["title"]))

        logging.info("ARTICLE TITLE: {}".format(item["title"]))
        logging.info("\t time: {}".format(item["date"]))
        return item
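
To activate the pipeline, register it in the Scrapy project settings (the module path is illustrative):

# settings.py
ITEM_PIPELINES = {
    "myproject.pipelines.ArticleExtractionPipeline": 300,
}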