def content_categories(content):
    """Return the categories of ``content``, computing them on first use.

    When ``content`` has no ``'categories'`` entry yet, the categories are
    derived from ``content['text']`` with ``classify_text``, stored on
    ``content`` and written back through ``_content.update`` for the
    document whose ``_id`` is ``content['id']``.

    :param content: mapping holding at least ``'text'`` and ``'id'``
        (only read when the categories are not cached yet)
    :return: the value stored under ``content['categories']``
    """
    # Already classified: reuse the cached value, no classification or write.
    if 'categories' in content:
        return content['categories']

    categories = classify_text(content['text'])
    content['categories'] = categories
    _content.update(
        {'_id': bson.ObjectId(content['id'])},
        {'$set': {'categories': categories}},
    )
    return categories
def content_entities(content):
    """Return the entities of ``content``, extracting them on first use.

    When ``content`` has no ``'entities'`` entry yet, the entities are
    extracted from ``content['text']`` with ``get_entities``, stored on
    ``content`` and written back through ``_content.update`` for the
    document whose ``_id`` is ``content['id']``.

    :param content: mapping holding at least ``'text'`` and ``'id'``
        (only read when the entities are not cached yet)
    :return: the value stored under ``content['entities']``
    """
    # Already extracted: reuse the cached value, no extraction or write.
    if 'entities' in content:
        return content['entities']

    entities = get_entities(content['text'])
    content['entities'] = entities
    _content.update(
        {'_id': bson.ObjectId(content['id'])},
        {'$set': {'entities': entities}},
    )
    return entities
def content_keywords(content):
    """Return the keywords of ``content``, computing them on first use.

    When ``content`` has no ``'keywords'`` entry yet, ``get_keywords`` is
    run on ``content['text']`` and only the results whose ``'count'`` is
    greater than 2 are kept. The filtered list is stored on ``content`` and
    written back through ``_content.update`` for the document whose ``_id``
    is ``content['id']``.

    :param content: mapping holding at least ``'text'`` and ``'id'``
        (only read when the keywords are not cached yet)
    :return: the value stored under ``content['keywords']``
    """
    # Already computed: reuse the cached value, no extraction or write.
    if 'keywords' in content:
        return content['keywords']

    # Keep only keywords that occur more than twice.
    frequent_keywords = [
        keyword
        for keyword in get_keywords(content['text'])
        if keyword['count'] > 2
    ]
    content['keywords'] = frequent_keywords
    _content.update(
        {'_id': bson.ObjectId(content['id'])},
        {'$set': {'keywords': frequent_keywords}},
    )
    return frequent_keywords
def content_stakeholders(content):
    """Return the stakeholders of ``content``, looking them up on first use.

    When ``content`` has no ``'stakeholders'`` entry yet, the entities of the
    content (via ``content_entities``) are passed to
    ``find_stakeholder_twitter_users`` together with the credentials returned
    by ``get_twitter_credentials``. The result is stored on ``content`` and
    written back through ``_content.update`` for the document whose ``_id``
    is ``content['id']``.

    :param content: mapping holding at least ``'id'`` plus whatever
        ``content_entities`` needs (only read on a cache miss)
    :return: the value stored under ``content['stakeholders']``
    """
    # Already resolved: reuse the cached value, no lookup or write.
    if 'stakeholders' in content:
        return content['stakeholders']

    # Entities are resolved before the credentials are fetched.
    stakeholders = find_stakeholder_twitter_users(
        content_entities(content),
        credentials=get_twitter_credentials(),
    )
    content['stakeholders'] = stakeholders
    _content.update(
        {'_id': bson.ObjectId(content['id'])},
        {'$set': {'stakeholders': stakeholders}},
    )
    return stakeholders
def all_the_content(content, article_database_ref, reload_pundits=False):
    """Collect keywords, entities and pundit snippets for an article.

    Keyword and entity extraction is currently disabled: both values are
    reset to empty strings on ``content`` and returned as such.

    Pundit snippets are refreshed when ``content`` has no ``'newpundits'``
    entry yet or when ``reload_pundits`` is true. A refresh downloads and
    NLP-processes the article at ``content['url']``, matches its keywords
    against ``article_database_ref`` with ``pundits.keyword_match`` and
    writes the snippets back through ``_content.update``. Otherwise the
    cached snippets are reused and the article is not downloaded at all.

    When no snippets are available, a single placeholder snippet is put on
    ``content`` and returned instead; the placeholder is not written back
    through ``_content.update``.

    :param content: this is the mongo object containing our content up to now
    :param article_database_ref: article database handle, passed unchanged
        to ``pundits.keyword_match``
    :param reload_pundits: if true, pundits are re-scraped every time
    :return: returns keywords, entities, and newpundits, as well as storing
        them in the mongo object for the article
    """
    # Extraction of keywords/entities is switched off; callers get "" for both.
    content['keywords'] = ""
    content['entities'] = ""

    if reload_pundits or 'newpundits' not in content:
        # The article is only needed to obtain keywords for matching, so it
        # is fetched solely when a refresh is actually going to happen.
        article = newspaper.Article(content['url'])
        article.download()
        article.parse()
        article.nlp()
        print("HERE ARE THE NEWSPAPER KEYWORDS %s" % (article.keywords,))

        # keyword_match also returns match ratios, which are not used here.
        snippets, _ = pundits.keyword_match(article_database_ref,
                                            article.keywords)
        content['newpundits'] = snippets
        _content.update(
            {'_id': bson.ObjectId(content['id'])},
            {'$set': {'newpundits': snippets}},
        )

    if not content['newpundits']:
        print("nothing to see here!")
        # Placeholder shown in place of real snippets (list of lists, like
        # the value the caller receives otherwise).
        failed_snippet = {
            'name': "#shambles",
            'text': "we can't seem to find anything.",
        }
        content['newpundits'] = [[failed_snippet]]
    else:
        print("HERE ARE NEW PUNDITS: %s" % (content['newpundits'],))

    return content['keywords'], content['entities'], content['newpundits']