示例#1
0
def process_article_summaries(db, override=False):
    """Generate and store a summary for each document in ``db.Article``.

    An article that already has a non-empty ``summary`` is left untouched
    unless *override* is true. An article whose ``body`` is missing, ``None``
    or empty is skipped. Progress is printed to stdout; skip notices go to
    stderr.

    Args:
        db: Database handle exposing an ``Article`` collection; ``find()`` and
            ``update_one()`` are called on it.
        override: When true, re-summarize articles that already have a summary.

    Returns:
        dict: ``{'skipped': <int>, 'summarized': <int>}``.

    Raises:
        KeyError: If an article document has no ``headline`` field.
    """
    col = db.Article
    skipped = 0
    summarized = 0
    for article in col.find():
        headline = article['headline']

        # Truthiness treats a missing key, None and "" alike; calling len() on
        # a stored null summary would raise TypeError.
        if not override and article.get('summary'):
            print("Already found summary for {}, skipping ...".format(headline), file=sys.stderr)
            skipped += 1
            continue

        print("Processing {} ...".format(headline))

        # A null body is as unusable as an absent or empty one.
        body = article.get('body')
        if not body:
            print("Body not found for {}, skipping ...".format(headline), file=sys.stderr)
            skipped += 1
            continue

        summary = summarize(headline, sanitize(body))
        # update_one replaces Collection.update, which was removed in PyMongo 4.
        col.update_one({'_id': article['_id']}, {'$set': {'summary': summary}})
        summarized += 1

    return {'skipped': skipped, 'summarized': summarized}
示例#2
0
def process(articles, query_db=True, update_all=False):
    """Create or refresh ``SummaryReview`` documents for the given articles.

    Each article body is sanitized, split into sentences and summarized, and
    the resulting "Bot" summary indices are written to
    ``mongo.db.SummaryReview``. If an existing review's stored sentences
    differ from the new ones (or *update_all* is true), its ``invalid`` list
    and ``summary`` are reset and ``tokens_valid`` is cleared. Otherwise only
    the "Bot" entry of the summary is refreshed. Tallies are printed at the
    end.

    Args:
        articles: Iterable of dict-like records with ``article_id``,
            ``headline`` and ``url`` keys, and optionally ``body``. When
            ``body`` is absent or empty it is looked up in
            ``mongo.db.Article`` by ``article_id``.
        query_db: When true, the existing review is fetched from
            ``SummaryReview`` by ``article_id``. When false, the article
            record itself is treated as the review document, so it must
            carry an ``_id``.
        update_all: When true, reset every existing review even if its
            sentences are unchanged.

    Returns:
        None.
    """
    from spacyparser import SpacyParser

    parser = SpacyParser()
    summar = Summarizer(parser)
    reviews = mongo.db.SummaryReview

    num_added = 0
    num_updated = 0
    num_invalid_body = 0
    for article in articles:
        article_id = article["article_id"]
        article_headline = article["headline"]
        article_url = article["url"]
        body = article.get("body")

        review = article
        if query_db:
            review = reviews.find_one({"article_id": article_id})

        if not body:
            art = mongo.db.Article.find_one({"article_id": article_id})
            # .get() so an Article document without a "body" field is skipped
            # instead of raising KeyError.
            body = art.get("body") if art else None
            if not body:
                print("Article {} does not have a body, skipping".format(article_id))
                num_invalid_body += 1
                continue

        # sanitize step
        body = sanitize(body)
        sentences = parser.sentences(body)
        summary = summarize(article_headline, body, count=3, summarizer=summar)
        bot_indices = summary_indices(sentences, summary)

        if review is None:
            # insert_one replaces Collection.insert, which was removed in PyMongo 4.
            reviews.insert_one(
                {
                    "article_id": article_id,
                    "headline": article_headline,
                    "url": article_url,
                    "sentences": sentences,
                    "summary": {"Bot": bot_indices},
                }
            )
            num_added += 1
            continue

        # Remove all votes and flags if the new sentences don't match the old
        # ones. A missing or null "sentences" field counts as a mismatch.
        old_sentences = review.get("sentences")
        sentences_changed = (
            old_sentences is None
            or len(old_sentences) != len(sentences)
            or any(old != new for old, new in zip(old_sentences, sentences))
        )

        if sentences_changed or update_all:
            review["invalid"] = []
            review["summary"] = {"Bot": bot_indices}
            review["sentences"] = sentences
            review["updated_at"] = datetime.utcnow()
            review["tokens_valid"] = False
            num_updated += 1
        elif "summary" in review:
            review["summary"]["Bot"] = bot_indices
        else:
            review["summary"] = {"Bot": bot_indices}

        # Whole-document write: replace_one is the equivalent of the removed
        # Collection.update(filter, document) form.
        reviews.replace_one({"_id": review["_id"]}, review)

    print("-" * 80)
    print("Articles fetched:\n")
    print("\tNumber added: {}".format(num_added))
    print("\tNumber updated: {}".format(num_updated))
    print("\tNumber invalid body: {}".format(num_invalid_body))