def newspaper_fulltext2(parser, language, url):
    '''
    Faster variant of newspaper3k's fulltext extraction: it reuses an
    already-built lxml parse tree so the HTML does not have to be parsed
    a second time.

    Adapted from
    https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser.

    Returns a dict with extracted ``text``/``html`` under ``'value'`` and
    the extractor name under ``'pattern'``.
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    config = Configuration()
    config.language = language
    config.keep_article_html = True

    extractor = ContentExtractor(config)
    cleaner = DocumentCleaner(config)
    formatter = OutputFormatter(config)

    cleaned = cleaner.clean(parser)
    best_node = extractor.calculate_best_node(cleaned)

    if best_node is None:
        # No candidate article body was found on the page.
        text, html = '', ''
    else:
        best_node = extractor.post_cleanup(best_node)
        text, html = formatter.get_formatted(best_node)

    return {
        'value': {
            'text': text,
            'html': html,
        },
        'pattern': 'newspaper3k',
    }
def newspaper_fulltext(parser, language):
    '''
    Run newspaper3k's fulltext pipeline on an existing lxml parse tree.

    Adapted from
    https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser.

    Returns a dict with ``text``/``html`` under ``'value'`` (both ``None``
    when no article body is found) and ``'pattern': 'newspaper3k'``.
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    config = Configuration()
    config.language = language

    extractor = ContentExtractor(config)
    cleaner = DocumentCleaner(config)
    formatter = OutputFormatter(config)

    cleaned = cleaner.clean(parser)
    best_node = extractor.calculate_best_node(cleaned)

    if best_node is None:
        # No candidate article body was found on the page.
        text, html = None, None
    else:
        best_node = extractor.post_cleanup(best_node)
        text, html = formatter.get_formatted(best_node)

    return {
        'value' : {
            'text' : text,
            'html' : html,
        },
        'pattern' : 'newspaper3k',
    }
def __init__(self):
    """Wire up the newspaper3k components used by the article pipeline."""
    # Meta configuration shared by every component below.
    self.config = Configuration()
    # Parser that turns raw HTML into an lxml tree.
    self.parser = self.config.get_parser()
    # Extracts info (author, tags, text, etc.) from a parsed article.
    self.extractor = ContentExtractor(self.config)
    # Cleans unwanted tags and nodes from the DOM.
    self.doc_cleaner = DocumentCleaner(self.config)
    # Outputs formatted text from parsed xpath nodes.
    self.formatter = OutputFormatter(self.config)
def modified_fulltext(parser, language, url):
    '''
    Extract article fulltext using this module's modified helpers
    (``rm_ads``, ``clean``, ``calculate_best_node``, ``post_cleanup``,
    ``get_formatted``) instead of the stock newspaper3k methods.

    Adapted from
    https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser.

    Returns a dict with ``text``/``html`` under ``'value'`` and
    ``'pattern': 'modified'``.
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    url_parsed = urlparse(url)

    config = Configuration()
    config.language = language
    config.keep_article_html = True

    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)

    # Ad removal is keyed on the page's hostname.
    doc = rm_ads(parser, url_parsed.hostname)
    doc = clean(document_cleaner, doc)
    doc = calculate_best_node(extractor, doc)

    if doc is None:
        # No candidate article body was found on the page.
        text, html = '', ''
    else:
        doc = post_cleanup(doc)
        text, html = get_formatted(doc)

    return {
        'value' : {
            'text' : text,
            'html' : html,
        },
        'pattern' : 'modified',
    }
def fulltext(html, language='en'):
    """Takes article HTML string input and outputs the fulltext.

    Input string is decoded via UnicodeDammit if needed.

    :param html: raw article HTML string.
    :param language: two-letter language code for stopword selection.
    :returns: ``(text, article_html)`` tuple; both empty strings when no
        article body can be detected.
    """
    config = Configuration()
    config.language = language

    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = WithTagOutputFormatter(config)

    doc = config.get_parser().fromstring(html)
    doc = document_cleaner.clean(doc)
    top_node = extractor.calculate_best_node(doc)

    # Fix: calculate_best_node returns None when no body candidate exists
    # (e.g. a landing page with no prose). Previously this crashed inside
    # post_cleanup/get_formatted; return empty results instead, consistent
    # with the other extraction functions in this module.
    if top_node is None:
        return '', ''

    top_node = extractor.post_cleanup(top_node)
    text, article_html = output_formatter.get_formatted(top_node)
    return text, article_html
def ProcessArticle(urlStr, domain, htmlStr, cursor):
    """Extract title, authors, text, keywords and a summary from raw
    article HTML, then persist the record (or report an error when no
    text could be extracted)."""
    config = Configuration()
    extractor = ContentExtractor(config)

    dom = config.get_parser().fromstring(htmlStr)
    title = extractor.get_title(dom)
    authors = extractor.get_authors(dom)
    text = fulltext(htmlStr)

    # Merge keywords drawn from the body text and the title.
    text_keyws = list(nlp.keywords(text).keys())
    title_keyws = list(nlp.keywords(title).keys())
    keyws = list(set(title_keyws + text_keyws))

    sentences = nlp.summarize(title=title, text=text,
                              max_sents=config.MAX_SUMMARY_SENT)
    summary = '\n'.join(sentences)

    if text:
        StoreToDatabase(urlStr, domain, title, authors, text, keyws,
                        summary, cursor)
    else:
        # Empty extraction — record the failure instead of storing it.
        OnArticleProcessError(urlStr)
def get_data_from_html(html):
    """Extract title, publication date and article content from raw HTML.

    :param html: raw article HTML string.
    :returns: dict with ``'title'``, ``'published_at'`` (ISO-8601 string,
        falling back to the current time when no date is detected) and
        ``'content'`` (empty string when no article body is found).
    """
    result = {}
    parsed_html = Parser.fromstring(html)

    config = Configuration()
    extractor = ContentExtractor(config)
    formatter = OutputFormatter(config)
    cleaner = DocumentCleaner(config)

    result['title'] = extractor.get_title(parsed_html)

    publishing_date = extractor.get_publishing_date('', parsed_html)
    if publishing_date is None:
        # No date found in the markup; fall back to "now".
        publishing_date = datetime.datetime.now()
    result['published_at'] = publishing_date.isoformat()

    cleaned_html = cleaner.clean(parsed_html)
    top_node = extractor.calculate_best_node(cleaned_html)

    # Fix: calculate_best_node returns None for pages without a detectable
    # article body. Previously that crashed in post_cleanup/get_formatted;
    # return empty content instead.
    if top_node is None:
        result['content'] = ''
    else:
        top_node = extractor.post_cleanup(top_node)
        result['content'], _ = formatter.get_formatted(top_node)

    return result
def modified_fulltext(parser, language):
    '''
    Extract article fulltext from an existing lxml parse tree using a
    locally modified best-node scorer.

    Adapted from
    https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser. Note: unlike the
    stock pipeline, the DocumentCleaner step is intentionally skipped.

    Returns a dict with ``text``/``html`` under ``'value'`` (both ``None``
    when no article body is found) and ``'pattern': 'modified'``.
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    def calculate_best_node(self, doc):
        '''Modified copy of ContentExtractor.calculate_best_node: candidate
        nodes come from a precompiled XPath over <pre>/<p>/<td> instead of
        self.nodes_to_check(doc).'''
        top_node = None
        cxpath_body_nodes = lxml.etree.XPath('(//pre)|(//p)|(//td)')
        starting_boost = float(1.0)
        parent_nodes = []
        nodes_with_text = []

        # Keep only nodes that carry real prose: more than two stopwords
        # and a low density of link text.
        for node in cxpath_body_nodes(doc):
            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(
                language=self.language).get_stopword_count(text_node)
            high_link_density = self.is_highlink_density(node)
            if word_stats.get_stopword_count() > 2 and not high_link_density:
                nodes_with_text.append(node)

        nodes_number = len(nodes_with_text)
        negative_scoring = 0
        bottom_negativescore_nodes = float(nodes_number) * 0.25

        for i, node in enumerate(nodes_with_text):
            boost_score = float(0)

            # Boost early boostable nodes; the boost decays as
            # starting_boost grows.
            if self.is_boostable(node):
                if i >= 0:
                    boost_score = float((1.0 / starting_boost) * 50)
                    starting_boost += 1

            # On long pages, penalize the trailing ~25% of nodes
            # (boilerplate such as comments and footers).
            if nodes_number > 15:
                if (nodes_number - i) <= bottom_negativescore_nodes:
                    booster = float(
                        bottom_negativescore_nodes - (nodes_number - i))
                    boost_score = float(-pow(booster, float(2)))
                    negscore = abs(boost_score) + negative_scoring
                    if negscore > 40:
                        boost_score = float(5)

            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(
                language=self.language).get_stopword_count(text_node)
            upscore = int(word_stats.get_stopword_count() + boost_score)

            # Credit the node's parent ...
            parent_node = self.parser.getParent(node)
            self.update_score(parent_node, upscore)
            self.update_node_count(parent_node, 1)
            if parent_node not in parent_nodes:
                parent_nodes.append(parent_node)

            # ... and its grandparent at half weight.
            parent_parent_node = self.parser.getParent(parent_node)
            if parent_parent_node is not None:
                self.update_node_count(parent_parent_node, 1)
                self.update_score(parent_parent_node, upscore / 2)
                if parent_parent_node not in parent_nodes:
                    parent_nodes.append(parent_parent_node)

        # Pick the highest-scoring parent node.
        top_node_score = 0
        for e in parent_nodes:
            score = self.get_score(e)
            if score > top_node_score:
                top_node = e
                top_node_score = score
            if top_node is None:
                top_node = e
        return top_node

    config = Configuration()
    config.language = language

    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)

    # NOTE: document cleaning is deliberately disabled here.
    top_node = calculate_best_node(extractor, parser)

    if top_node is None:
        text, html = None, None
    else:
        top_node = extractor.post_cleanup(top_node)
        text, html = output_formatter.get_formatted(top_node)

    return {
        'value' : {
            'text' : text,
            'html' : html,
        },
        'pattern' : 'modified',
    }