def html_to_article(content, language):
    """Extract the main article body from raw HTML and return it as HTML.

    Parameters:
        content: raw HTML string (may be empty or whitespace-only).
        language: language code assigned to the newspaper configuration.

    Returns:
        The cleaned, unescaped article markup as a string, or '' when the
        input is empty, cannot be parsed, or no best node is found.
    """
    content = content.strip()
    if not content:  # idiomatic emptiness test instead of len()
        return ''
    config = NewspaperConfig()
    config.language = language
    # `content` was already stripped above; no need to strip again.
    doc = config.get_parser().fromstring(content)
    if doc is None:
        return ''
    # Split block-level elements with <br> pairs so that text from adjacent
    # blocks does not run together once tags are stripped.
    for tag in _BLOCKLEVEL_TAGS:
        if tag in _MEANINGLESS_TAGS:
            continue
        for node in doc.xpath('//{}'.format(tag)):
            node.append(etree.Element('br'))
            node.append(etree.Element('br'))
    # Initial cleanup
    cleaner = _NewspaperCleaner(config)
    doc = cleaner.clean(doc)
    # Best node estimation
    extractor = NewspaperExtractor(config)
    top = extractor.calculate_best_node(doc)
    if top is None:
        # Drop the lxml objects and clear the global error log before
        # bailing out, to keep memory bounded across many documents.
        del doc, cleaner, extractor
        etree.clear_error_log()
        return ''
    top = extractor.post_cleanup(top)
    # Cleanup dummy nodes used for estimation
    for dummy in top.xpath("//p[@newspaper='dummy']"):
        dummy.getparent().remove(dummy)
    # Custom formatting to avoid unnecessary computations
    formatter = NewspaperFormatter(config)
    formatter.top_node = top
    formatter.remove_negativescores_nodes()
    content = formatter.convert_to_html()
    content = unescape(str(content).strip())
    # Same explicit teardown as the early-exit path above.
    del doc, top, cleaner, extractor, formatter
    etree.clear_error_log()
    return content
def fulltext(html, language='en'):
    """Return ``(text, article_html)`` extracted from an article HTML string.

    The input markup is parsed, cleaned of boilerplate, reduced to the best
    candidate node, and then rendered both as plain text and as HTML.
    """
    cfg = Configuration()
    cfg.language = language

    parsed = cfg.get_parser().fromstring(html)
    cleaned = DocumentCleaner(cfg).clean(parsed)

    content_extractor = ContentExtractor(cfg)
    best_node = content_extractor.post_cleanup(
        content_extractor.calculate_best_node(cleaned))

    text, article_html = WithTagOutputFormatter(cfg).get_formatted(best_node)
    return text, article_html
def ProcessArticle(urlStr, domain, htmlStr, cursor):
    """Extract title/authors/text from an article, summarize it, and store it.

    Parameters:
        urlStr: URL of the article (used for storage and error reporting).
        domain: source domain passed through to storage.
        htmlStr: raw article HTML.
        cursor: database cursor handed to StoreToDatabase.

    Side effects: writes to the database on success, or calls
    OnArticleProcessError when no text could be extracted.
    """
    config = Configuration()
    extractor = ContentExtractor(config)
    clean_doc = config.get_parser().fromstring(htmlStr)
    title = extractor.get_title(clean_doc)
    authors = extractor.get_authors(clean_doc)
    # fulltext() in this module returns (text, article_html); the previous
    # code bound the whole tuple to `text`, so keyword extraction and the
    # emptiness check below operated on a 2-tuple instead of the text.
    text, _article_html = fulltext(htmlStr)
    text_keyws = list(nlp.keywords(text).keys())
    title_keyws = list(nlp.keywords(title).keys())
    keyws = list(set(title_keyws + text_keyws))
    summary_sents = nlp.summarize(title=title, text=text,
                                  max_sents=config.MAX_SUMMARY_SENT)
    summary = '\n'.join(summary_sents)
    if not text:  # idiomatic emptiness test instead of len() == 0
        OnArticleProcessError(urlStr)
    else:
        StoreToDatabase(urlStr, domain, title, authors, text, keyws,
                        summary, cursor)
class ArticleExtractionPipeline(object):
    """Item pipeline that parses scraped article HTML, fills in metadata
    (title, description, keywords, date) and the extracted body text, and
    drops items that are undated, stale, or too short."""

    def __init__(self):
        # One shared newspaper configuration drives every helper below.
        self.config = Configuration()
        self.parser = self.config.get_parser()
        self.extractor = ContentExtractor(self.config)   # author, tags, text, etc.
        self.doc_cleaner = DocumentCleaner(self.config)  # strips unwanted tags/nodes
        self.formatter = OutputFormatter(self.config)    # xpath nodes -> formatted text

    # Right now basically only works for RT.
    # `doc` is HTML already parsed by self.parser.
    def find_date_from_html(self, doc):
        # https://github.com/Webhose/article-date-extractor/blob/master/articleDateExtractor/__init__.py
        parsed_dates = []
        for element in self.parser.getElementsByTag(doc, tag="time"):  # add more
            label = element.text
            for indicator in ["Edited", "Updated", "Published"]:
                if indicator in label:
                    # The indicator is probably followed by "at" or ":";
                    # the actual timestamp comes after that.
                    if "at" in label:
                        label = label.split("at", 1)[1]
                    elif ":" in label:
                        label = label.split(":", 1)[1]
                    break
            when = self.datetime_from_str(label)
            if when:
                parsed_dates.append(when)
        # Earliest parsed time wins; None when nothing parsed.
        return min(parsed_dates) if parsed_dates else None

    def datetime_from_str(self, datetime_string):
        """Parse a date string into a naive datetime, or None on failure."""
        try:
            # Strip tzinfo: naive and offset-aware datetimes can't be compared.
            return date_parser.parse(datetime_string).replace(tzinfo=None)
        except (ValueError, OverflowError, AttributeError, TypeError):
            return None

    # `doc` is parsed HTML from self.parser.
    # TODO: generalize
    def get_date(self, url, doc):
        """Best-effort publication date: extractor first, then known meta
        tags, finally a scan of <time> elements in the markup."""
        meta_queries = (
            "meta[name='LastModifiedDate']",     # aljazeera, Sun, 07 January 2018 18:36:49 GMT
            "meta[name='Last-Modified']",        # times of india, Jan 9, 2018, 05:18 IST
            "meta[property='og:updated_time']",  # diplomat, "2018-01-05 23:22:46"
        )
        raw_date = self.extractor.get_publishing_date(url, doc)  # telesur, africanews
        for query in meta_queries:
            if raw_date:
                break
            raw_date = self.extractor.get_meta_content(doc, query)
        if raw_date:
            return self.datetime_from_str(raw_date)
        return self.find_date_from_html(doc)

    # `date` is a datetime object.
    def recent_article(self, date, max_days_elapsed=3):
        """True when `date` is within `max_days_elapsed` days of now."""
        elapsed = datetime.datetime.now() - date
        return elapsed < datetime.timedelta(days=max_days_elapsed)

    def process_item(self, item, spider):
        tree = self.parser.fromstring(item["content"])
        item["title"] = self.extractor.get_title(tree)
        item["description"] = self.extractor.get_meta_description(tree)
        item["keywords"] = (
            self.extractor.get_meta_content(tree, "meta[name='news_keywords']")
            or self.extractor.get_meta_keywords(tree))
        item["date"] = self.get_date(item["url"], tree)

        # Drop items with no date or one older than a week.
        if not item["date"] or not self.recent_article(item["date"],
                                                       max_days_elapsed=7):
            raise DropItem("Missing or invalid date for: {}".format(
                item["title"]))

        # Clean the DOM, pick the best candidate node, keep the plain text
        # (index [1] of get_formatted would be the article HTML).
        cleaned = self.doc_cleaner.clean(tree)
        best_node = self.extractor.post_cleanup(
            self.extractor.calculate_best_node(cleaned))
        item["content"] = self.formatter.get_formatted(best_node)[0]

        # Drop articles whose extracted body is too short to be useful.
        if len(item["content"]) < 600:
            raise DropItem("Not enough text: {}".format(item["title"]))

        logging.info("ARTICLE TITLE: {}".format(item["title"]))
        logging.info("\t time: {}".format(item["date"]))
        return item