예제 #1
0
def newspaper_fulltext2(parser, language, url):
    '''
    This is a faster version of the function that uses some internal newspaper3k functions
    so that the lxml parse tree doesn't need to be recreated.
    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    config = Configuration()
    config.language = language
    config.keep_article_html = True
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)
    doc = parser
    doc = document_cleaner.clean(doc)
    doc = extractor.calculate_best_node(doc)
    if doc is not None:
        doc = extractor.post_cleanup(doc)
        text, html = output_formatter.get_formatted(doc)
    else:
        text = ''
        html = ''

    return {
        'value': {
            'text': text,
            'html': html,
        },
        'pattern': 'newspaper3k',
    }
예제 #2
0
def newspaper_fulltext(parser, language):
    '''
    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    config = Configuration()
    config.language = language
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)
    doc = parser
    doc = document_cleaner.clean(doc)
    top_node = extractor.calculate_best_node(doc)
    if top_node is not None:
        top_node = extractor.post_cleanup(top_node)
        text, html = output_formatter.get_formatted(top_node)
    else:
        text = None
        html = None

    return {
        'value' : {
            'text' : text,
            'html' : html,
            },
        'pattern' : 'newspaper3k',
        }
예제 #3
0
def html_to_article(content, language):
    content = content.strip()
    if not len(content):
        return ''

    config = NewspaperConfig()
    config.language = language

    doc = config.get_parser().fromstring(content.strip())
    if doc is None:
        return ''

    # Split block-level elements with newlines
    for tag in _BLOCKLEVEL_TAGS:
        if tag in _MEANINGLESS_TAGS:
            continue
        for node in doc.xpath('//{}'.format(tag)):
            node.append(etree.Element('br'))
            node.append(etree.Element('br'))

    # Initial cleanup
    cleaner = _NewspaperCleaner(config)
    doc = cleaner.clean(doc)

    # Best node estimation
    extractor = NewspaperExtractor(config)
    top = extractor.calculate_best_node(doc)
    if top is None:
        del doc, cleaner, extractor
        etree.clear_error_log()

        return ''

    top = extractor.post_cleanup(top)

    # Cleanup dummy nodes used for estimation
    for dummy in top.xpath("//p[@newspaper='dummy']"):
        dummy.getparent().remove(dummy)

    # Custom formatting to avoid unnecessary computations
    formatter = NewspaperFormatter(config)
    formatter.top_node = top
    formatter.remove_negativescores_nodes()
    content = formatter.convert_to_html()
    content = str(content).strip()
    content = unescape(content)

    del doc, top, cleaner, extractor, formatter
    etree.clear_error_log()

    return content
def modified_fulltext(parser, language, url):
    '''
    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser
    '''
    url_parsed = urlparse(url)

    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    config = Configuration()
    config.language = language
    config.keep_article_html = True
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)

    doc = parser
    doc = rm_ads(doc,url_parsed.hostname)
    doc = clean(document_cleaner,doc)
    #doc = document_cleaner.clean(doc)
    doc = calculate_best_node(extractor,doc)
    #doc = extractor.calculate_best_node(doc)
    if doc is not None:
        #doc = extractor.add_siblings(doc)
        doc = post_cleanup(doc)
        #doc = extractor.post_cleanup(doc)
        text, html = get_formatted(doc)
        #text, html = output_formatter.get_formatted(doc)
    else:
        text = ''
        html = ''

    return {
        'value' : {
            'text' : text,
            'html' : html,
            },
        'pattern' : 'modified',
        }
예제 #5
0
파일: testy.py 프로젝트: Xirider/META
def fulltext(html, language='en'):
    """Takes article HTML string input and outputs the fulltext
    Input string is decoded via UnicodeDammit if needed
    """

    config = Configuration()
    config.language = language

    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = WithTagOutputFormatter(config)

    doc = config.get_parser().fromstring(html)
    doc = document_cleaner.clean(doc)

    top_node = extractor.calculate_best_node(doc)

    top_node = extractor.post_cleanup(top_node)
    text, article_html = output_formatter.get_formatted(top_node)
    return text, article_html
예제 #6
0
def modified_fulltext(parser, language):
    '''
    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    def calculate_best_node(self, doc):
        top_node = None
        cxpath_body_nodes = lxml.etree.XPath('(//pre)|(//p)|(//td)')
        #nodes_to_check = self.nodes_to_check(doc)
        starting_boost = float(1.0)
        #cnt = 0
        #i = 0
        parent_nodes = []
        nodes_with_text = []

        #for node in nodes_to_check:
        for node in cxpath_body_nodes(doc):
            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(language=self.language). \
                get_stopword_count(text_node)
            high_link_density = self.is_highlink_density(node)
            if word_stats.get_stopword_count() > 2 and not high_link_density:
                nodes_with_text.append(node)

        nodes_number = len(nodes_with_text)
        negative_scoring = 0
        bottom_negativescore_nodes = float(nodes_number) * 0.25

        #for node in nodes_with_text:
        for i,node in enumerate(nodes_with_text):
            boost_score = float(0)
            # boost
            if self.is_boostable(node):
                #if cnt >= 0:
                if i >= 0:
                    boost_score = float((1.0 / starting_boost) * 50)
                    starting_boost += 1
            # nodes_number
            if nodes_number > 15:
                if (nodes_number - i) <= bottom_negativescore_nodes:
                    booster = float(
                        bottom_negativescore_nodes - (nodes_number - i))
                    boost_score = float(-pow(booster, float(2)))
                    negscore = abs(boost_score) + negative_scoring
                    if negscore > 40:
                        boost_score = float(5)

            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(language=self.language). \
                get_stopword_count(text_node)
            upscore = int(word_stats.get_stopword_count() + boost_score)

            parent_node = self.parser.getParent(node)
            self.update_score(parent_node, upscore)
            self.update_node_count(parent_node, 1)

            if parent_node not in parent_nodes:
                parent_nodes.append(parent_node)

            # Parent of parent node
            parent_parent_node = self.parser.getParent(parent_node)
            if parent_parent_node is not None:
                self.update_node_count(parent_parent_node, 1)
                self.update_score(parent_parent_node, upscore / 2)
                if parent_parent_node not in parent_nodes:
                    parent_nodes.append(parent_parent_node)
            #cnt += 1
            #i += 1

        top_node_score = 0
        for e in parent_nodes:
            score = self.get_score(e)

            if score > top_node_score:
                top_node = e
                top_node_score = score

            if top_node is None:
                top_node = e
        return top_node

    config = Configuration()
    config.language = language
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)
    doc = parser
    #doc = document_cleaner.clean(doc)
    top_node = calculate_best_node(extractor,doc)
    if top_node is not None:
        top_node = extractor.post_cleanup(top_node)
        text, html = output_formatter.get_formatted(top_node)
    else:
        text = None
        html = None

    return {
        'value' : {
            'text' : text,
            'html' : html,
            },
        'pattern' : 'modified',
        }