def process_article_summaries(db, override=False):
    """Generate and store a summary for each document in ``db.Article``.

    An article is skipped when it already has a non-empty ``summary``
    (unless *override* is true) or when its ``body`` is missing or empty.
    Otherwise the body is passed through ``sanitize``, summarized with
    ``summarize`` and the result is written to the article's ``summary``
    field.

    Args:
        db: Database handle exposing an ``Article`` collection with
            PyMongo-style ``find`` and ``update_one`` methods.
        override: When true, re-summarize articles that already have a
            summary.

    Returns:
        A dict with the integer counts ``'skipped'`` and ``'summarized'``.
    """
    col = db.Article
    skipped = 0
    summarized = 0

    for article in col.find():
        headline = article['headline']

        # Truthiness (rather than len()) also treats a stored None as
        # "no summary yet" instead of raising TypeError.
        if not override and article.get('summary'):
            print("Already found summary for {}, skipping ...".format(headline), file=sys.stderr)
            skipped += 1
            continue

        print("Processing {} ...".format(headline))

        # Covers a missing key, an empty string and a stored None alike.
        body = article.get('body')
        if not body:
            print("Body not found for {}, skipping ...".format(headline), file=sys.stderr)
            skipped += 1
            continue

        summary = summarize(headline, sanitize(body))
        # Collection.update() was removed in PyMongo 4; update_one is the
        # supported equivalent for a single-document $set.
        col.update_one({'_id': article['_id']}, {'$set': {'summary': summary}})
        summarized += 1

    return {'skipped': skipped, 'summarized': summarized}
def process(articles, query_db=True, update_all=False):
    """Create or refresh ``SummaryReview`` documents for *articles*.

    For every article the body is sanitized, split into sentences with a
    ``SpacyParser`` and summarized (``count=3``); the result of
    ``summary_indices(sentences, summary)`` is stored under the ``"Bot"``
    key of the review's ``summary``. A review is inserted when none
    exists. When the stored sentences differ from the freshly parsed ones
    (or *update_all* is true) the review's ``invalid`` list and
    ``summary`` are reset, ``tokens_valid`` is cleared and ``updated_at``
    is stamped. Otherwise only the ``"Bot"`` entry is refreshed.

    Args:
        articles: Iterable of dict-like articles with ``article_id``,
            ``headline`` and ``url`` keys and an optional ``body``.
        query_db: When true, look up the existing review by
            ``article_id``. When false, the article itself is treated as
            the review document, so it must then carry an ``_id``.
        update_all: When true, reset every existing review even if its
            sentences are unchanged.

    Returns:
        None. The added / updated / invalid-body counts are printed.
    """
    # Kept as a function-scope import, as in the original.
    from spacyparser import SpacyParser

    parser = SpacyParser()
    summar = Summarizer(parser)

    num_added = 0
    num_updated = 0
    num_invalid_body = 0

    for article in articles:
        article_id = article["article_id"]
        article_headline = article["headline"]
        article_url = article["url"]
        body = article.get("body")

        review = article
        if query_db:
            review = mongo.db.SummaryReview.find_one({"article_id": article_id})

        if not body:
            # Fall back to the stored article; .get() tolerates documents
            # that have no "body" field at all.
            art = mongo.db.Article.find_one({"article_id": article_id})
            body = art.get("body") if art else None
            if not body:
                print("Article {} does not have a body, skipping".format(article_id))
                num_invalid_body += 1
                continue

        # sanitize step
        body = sanitize(body)
        sentences = parser.sentences(body)
        summary = summarize(article_headline, body, count=3, summarizer=summar)
        bot_indices = summary_indices(sentences, summary)

        if review is None:
            # Collection.insert() was removed in PyMongo 4.
            mongo.db.SummaryReview.insert_one(
                {
                    "article_id": article_id,
                    "headline": article_headline,
                    "url": article_url,
                    "sentences": sentences,
                    "summary": {"Bot": bot_indices},
                }
            )
            num_added += 1
            continue

        # remove all votes and flags if new sentences dont match old ones
        sentences_changed = (
            "sentences" not in review
            or list(review["sentences"]) != list(sentences)
        )

        if sentences_changed or update_all:
            review["invalid"] = []
            review["summary"] = {"Bot": bot_indices}
            review["sentences"] = sentences
            review["updated_at"] = datetime.utcnow()
            review["tokens_valid"] = False
            num_updated += 1
        else:
            # Only the bot entry is refreshed; other entries are kept.
            review.setdefault("summary", {})["Bot"] = bot_indices

        # Whole-document write-back; replace_one is the PyMongo 4
        # equivalent of the removed update(filter, document) form.
        mongo.db.SummaryReview.replace_one({"_id": review["_id"]}, review)

    print("-" * 80)
    print("Articles fetched:\n")
    print("\tNumber added: {}".format(num_added))
    print("\tNumber updated: {}".format(num_updated))
    print("\tNumber invalid body: {}".format(num_invalid_body))