def extract_meta(article):
    """Return basic publication metadata for *article* under a 'meta' key."""
    pc_article = pubcrawler.Article(article)
    meta = {
        'article-ids': pc_article.pub_ids(),
        'article-type': pc_article.article_type(),
        # 'pub-dates': pc_article.pub_dates() # Need to fix stuff with dates in Mongo
        'keywords': pc_article.keywords(),
    }
    return {'meta': meta}
def extract_geonames(article, store_all=False):
    """Annotate *article* with geonames and return the de-duplicated results.

    When ``store_all`` is true each geoname record carries every attribute in
    GEONAME_ATTRS plus its score; otherwise only the 'geonameid' is kept.
    Records are keyed (and therefore de-duplicated) by geonameid.
    """
    pc_article = pubcrawler.Article(article)
    anno_doc = AnnoDoc(pc_article.body)
    tier = anno_doc.require_tiers('geonames', via=geoname_annotator)
    records_by_id = {}
    for geo_span in tier:
        geoname = geo_span.metadata['geoname']
        if store_all:
            entry = {attr: geoname[attr] for attr in GEONAME_ATTRS + ['score']}
        else:
            entry = {'geonameid': geoname['geonameid']}
        records_by_id[entry['geonameid']] = entry
    return {
        'index.geonames': 1,
        'geonames': {'culled': list(records_by_id.values())},
    }
def extract_disease_ontology_keywords(article):
    """Extract disease-ontology keywords resolved from *article*'s body.

    Only resolutions whose entity type is 'disease' are kept; entries are
    de-duplicated by entity id, and each result maps the entity label to its
    ontology URI.
    """
    pc_article = pubcrawler.Article(article)
    anno_doc = AnnoDoc(pc_article.body)
    tier = anno_doc.require_tiers('resolved_keywords', via=keyword_annotator)
    diseases_by_id = {}
    for span in tier:
        for resolution in span.metadata['resolutions']:
            entity = resolution['entity']
            if entity['type'] != 'disease':
                continue
            diseases_by_id[entity['id']] = {
                "keyword": entity['label'],
                "uri": entity['id'],
            }
    return {
        'index.keywords': 1,
        'keywords': {'disease-ontology': list(diseases_by_id.values())},
    }
def extract_disease_ontology_keywords(article):
    """Extract de-duplicated disease-ontology keywords from *article*.

    Runs the keyword annotator over the article body, resolves each keyword
    span via ``resolve_keyword``, and returns the first match for each unique
    keyword text as ``{"keyword": ..., "uri": ...}`` dicts. Returns
    ``{'keywords': {'disease-ontology': None}}`` when no keywords are found.

    NOTE(review): this file defines another function with the same name (the
    resolved_keywords-tier variant); whichever is defined last shadows the
    other at import time — confirm which one is intended to be live.
    """
    pc_article = pubcrawler.Article(article)
    anno_doc = AnnoDoc(pc_article.body)
    anno_doc.add_tier(keyword_annotator)
    infectious_diseases = [
        (disease.text, resolve_keyword(disease.text))
        for disease in anno_doc.tiers['keywords'].spans
    ]
    if not infectious_diseases:
        disease_ontology_keywords = None
    else:
        seen_keys = set()  # set: O(1) membership vs. the original list scan
        disease_ontology_keywords = []
        for keyword, uri in infectious_diseases:
            # Keep only the first resolution for each distinct keyword text.
            if keyword in seen_keys:
                continue
            seen_keys.add(keyword)
            # BUG FIX: the original appended to an undefined name
            # ``keyword_list``, raising NameError whenever any keyword was
            # found; append to the list that is actually returned.
            disease_ontology_keywords.append({
                "keyword": keyword,
                # assumes resolve_keyword returns a non-empty sequence of
                # rdflib-style resolutions — TODO confirm; uri[0] would raise
                # IndexError on an empty resolution list.
                "uri": uri[0].entity.toPython()
            })
    return {'keywords': {'disease-ontology': disease_ontology_keywords}}
def extract_geonames(article):
    """Annotate *article* with geonames and return both the full candidate
    list and the culled (high-confidence) geospans.

    Returns ``{'geonames': {'all': [...], 'culled': [...]}}`` where 'all'
    holds every candidate geoname (with its feature vector attached under
    'annie_features') and 'culled' holds the dicts of the geospans that
    survived scoring and span-level culling.

    NOTE(review): this relies on geoname_annotator internals
    (get_candidate_geonames / extract_features / cull_geospans) rather than
    the tier API used by the other extract_geonames in this file — presumably
    an older implementation; confirm which is current.
    """
    pc_article = pubcrawler.Article(article)
    anno_doc = AnnoDoc(pc_article.body)
    candidate_locations = geoname_annotator.get_candidate_geonames(anno_doc)
    # Generate and score features
    features = geoname_annotator.extract_features(candidate_locations)
    # Hand-tuned linear weights for the feature scorer; a candidate is kept
    # only if its weighted score exceeds 50 (threshold below).
    feature_weights = dict(
        population_score=2.0,
        synonymity=1.0,
        num_spans_score=0.4,
        short_span_score=(-5),
        NEs_contained=1.2,
        # Distinctness is probably more effective when combined
        # with other features
        distinctness=1.0,
        max_span_score=1.0,
        # close_locations=0.8,
        # closest_location=0.8,
        # containment_level=0.8,
        cannonical_name_used=0.5,
        feature_code_score=0.6,
    )
    # Scores are written onto the candidate dicts in place; later loops read
    # and further mutate these same dicts, so statement order matters here.
    for location, feature in zip(candidate_locations, features):
        location['score'] = feature.score(feature_weights)
    culled_locations = [
        location for location in candidate_locations
        if location['score'] > 50
    ]
    geo_spans = []
    for location in culled_locations:
        # Copy the dict so we don't need to return a custom class.
        location = dict(location)
        for span in location['spans']:
            # TODO: Adjust scores to give geospans that exactly match
            # a corresponding geoname a bonus.
            geo_span = GeoSpan(span.start, span.end, anno_doc, location)
            geo_spans.append(geo_span)
    culled_geospans = geoname_annotator.cull_geospans(geo_spans)
    # props_to_omit = ['spans', 'alternatenames', 'alternateLocations']
    # for geospan in culled_geospans:
    #     # The while loop removes the properties from the parentLocations.
    #     # There will probably only be one parent location.
    #     cur_location = geospan.geoname
    #     while True:
    #         if all([
    #             prop not in cur_location
    #             for prop in props_to_omit
    #         ]):
    #             break
    #         for prop in props_to_omit:
    #             cur_location.pop(prop)
    #         if 'parentLocation' in cur_location:
    #             cur_location = cur_location['parentLocation']
    #         else:
    #             break
    # Properties stripped before serializing results. NOTE(review): unlike
    # the commented-out version above, 'alternatenames' is no longer omitted
    # — confirm that is intentional.
    props_to_omit = ['spans', 'alternateLocations']
    # Get candidate geonameids and feature vectors
    all_geonames = []
    for location, feature in zip(candidate_locations, features):
        # geoname_dict aliases the candidate dict, so pop() mutates the
        # candidate in place (the culled geospans hold copies made above).
        geoname_dict = location
        for prop in props_to_omit:
            geoname_dict.pop(prop, None)
        # geoname_dict['geonameid'] = location['geonameid']
        geoname_dict['annie_features'] = feature.to_dict()
        all_geonames.append(geoname_dict)
    culled_geonames = []
    for geospan in culled_geospans:
        # Strip the same properties from each surviving geospan's geoname
        # before serializing it with to_dict().
        geoname = geospan.geoname
        for prop in props_to_omit:
            geoname.pop(prop, None)
        culled_geonames.append(geospan.to_dict())
    return ({'geonames': {'all': all_geonames, 'culled': culled_geonames}})