def entities_from_mc_or_cliff(stories_id):
    entities = []
    # get entities from MediaCloud, or from CLIFF if not in MC
    cliff_results = cached_story_raw_cliff_results(stories_id)[0]['cliff']
    if (cliff_results == 'story is not annotated') or (cliff_results == "story does not exist"):
        story = mc.story(stories_id, text=True)
        cliff_results = cliff.parse_text(story['story_text'])
    # clean up for reporting
    for org in cliff_results['results']['organizations']:
        entities.append({
            'type': 'ORGANIZATION',
            'name': org['name'],
            'frequency': org['count']
        })
    for person in cliff_results['results']['people']:
        entities.append({
            'type': 'PERSON',
            'name': person['name'],
            'frequency': person['count']
        })
    # places don't have frequency set correctly, so we need to sum them
    locations = []
    place_names = set([place['name'] for place in cliff_results['results']['places']['mentions']])
    for place in place_names:
        loc = {
            'type': 'LOCATION',
            'name': place,
            'frequency': len([p for p in cliff_results['results']['places']['mentions'] if p['name'] == place])
        }
        locations.append(loc)
    entities += locations
    # sort smartly
    unique_entities = sorted(entities, key=itemgetter('frequency'), reverse=True)
    return unique_entities

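# Illustrative sketch (not part of the original module): the per-place frequency
# summation above can also be written with collections.Counter over the same
# 'mentions' list. The commented call below uses made-up sample data so the helper
# can be exercised without a real CLIFF response.
from collections import Counter

def location_entities_from_mentions(mentions):
    # count how many times each place name appears, then emit LOCATION entities
    # in descending-frequency order (what Counter.most_common() returns)
    counts = Counter(m['name'] for m in mentions)
    return [{'type': 'LOCATION', 'name': name, 'frequency': freq}
            for name, freq in counts.most_common()]

# location_entities_from_mentions([{'name': 'Boston'}, {'name': 'Boston'}, {'name': 'Cairo'}])
# -> [{'type': 'LOCATION', 'name': 'Boston', 'frequency': 2},
#     {'type': 'LOCATION', 'name': 'Cairo', 'frequency': 1}]
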
def nyt_themes_from_mc_or_labeller(stories_id):
    # get themes from MediaCloud, or from the NYT theme labeller if not annotated yet
    results = cached_story_raw_theme_results(stories_id)
    if results['nytlabels'] == 'story is not annotated':
        story = mc.story(stories_id, text=True)
        results = predict_news_labels(story['story_text'])
    else:
        results = results['nytlabels']
    return results

def story_subreddit_shares_csv(stories_id):
    story = mc.story(stories_id)
    submissions_by_sub = pushshift.reddit_url_submissions_by_subreddit(story['url'])
    props = ['name', 'value']
    column_names = ['subreddit', 'submissions']
    return csv.stream_response(submissions_by_sub, props, 'story-' + str(stories_id) + '-subreddit',
                               column_names=column_names)

def story_subreddit_shares(stories_id):
    story = mc.story(stories_id)
    submissions_by_sub = pushshift.reddit_url_submissions_by_subreddit(story['url'])
    return jsonify({
        'total': sum([r['value'] for r in submissions_by_sub]) if submissions_by_sub is not None else 0,
        'subreddits': submissions_by_sub
    })

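# Response shape note (inferred from the CSV variant above, which maps 'name'/'value'
# to 'subreddit'/'submissions' columns): each item in submissions_by_sub carries a
# subreddit 'name' and a submission count 'value', so the JSON looks roughly like
#   {'total': 5, 'subreddits': [{'name': 'news', 'value': 3}, {'name': 'politics', 'value': 2}]}
# with 'total' falling back to 0 when Pushshift returns nothing.
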
def story_top_image(stories_id):
    story = mc.story(stories_id)
    # use the tool key so anyone can see these images
    story_html = apicache.story_raw_1st_download(TOOL_API_KEY, stories_id)
    article = newspaper.Article(url=story['url'])
    article.set_html(story_html)
    article.parse()
    return jsonify({
        'top': article.top_image,
        'all': list(article.images),
    })

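# Illustrative sketch (not part of the original module): newspaper's set_html()/parse()
# pattern used above works on any HTML string, so the image extraction can be exercised
# without going through Media Cloud. The URL and HTML in the commented call are made-up
# placeholders.
def top_image_from_html(url, html):
    article = newspaper.Article(url=url)
    article.set_html(html)   # supply raw HTML instead of downloading the page
    article.parse()          # extract images (and other fields) from that HTML
    return article.top_image, list(article.images)

# top_image_from_html('http://example.com/story',
#                     '<html><head><meta property="og:image" '
#                     'content="http://example.com/lead.jpg"/></head><body></body></html>')
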
def _cached_story_raw_1st_download(api_key, stories_id):
    # api_key is unused in the body; it is presumably part of the cache key for the
    # caching layer implied by the function name (not shown here)
    story = mc.story(stories_id, raw_1st_download=True)
    return story['raw_first_download_file']