async def get_fact(query, args, tokenizer, trex_set, common_vocab, f_out):
    """
    Collect more facts for the TREx-train set from LPAQA.

    Parses one tab-separated line (subject URL, subject label, object URL,
    object label), runs it through a series of filters and, when the fact
    passes all of them, writes it to ``f_out`` as a single JSON line and then
    awaits ``increment_count()``.

    Args:
        query: Raw input line holding four tab-separated fields.
        args: Not used by this function; kept so the signature is unchanged.
        tokenizer: Object exposing ``tokenize(text)``; the object label must
            tokenize to exactly one token for the fact to be kept.
        trex_set: Container of ``(sub_id, obj_id)`` pairs to exclude (the
            TREx test set).
        common_vocab: Container of object labels that are allowed.
        f_out: Writable text stream that receives the JSON lines.

    Returns:
        None. Facts that fail any filter are skipped silently.

    Raises:
        ValueError: If ``query`` does not split into exactly four
            tab-separated fields.
    """
    sub_url, sub, obj_url, obj = query.strip().split('\t')
    sub_id = get_id_from_url(sub_url)
    obj_id = get_id_from_url(obj_url)

    # First, make sure fact is not in TREx test set
    if (sub_id, obj_id) in trex_set:
        return

    # Make sure object is a single token
    if len(tokenizer.tokenize(obj)) != 1:
        return

    # Make sure object is in common vocab subset
    if obj not in common_vocab:
        return

    # Some entities don't have labels so the subject label is the URI.
    # This is a purely local comparison, so it runs before the API lookup
    # below to avoid a request for a fact that would be dropped anyway.
    if sub_id == sub:
        return

    # Make sure subject is prominent (has a Wikipedia page)
    # NOTE(review): get_entity_dict_from_api looks like a synchronous call
    # made inside a coroutine -- confirm whether blocking here is intended.
    try:
        subject = WikidataItem(get_entity_dict_from_api(sub_id))
        if not subject.get_sitelinks():
            return
    except ValueError:
        return

    fact = {
        'sub_uri': sub_id,
        'obj_uri': obj_id,
        'sub_label': sub,
        'obj_label': obj
    }
    f_out.write(json.dumps(fact) + '\n')

    # Increment global count
    await increment_count()
def get_place_from_wikidata(entity_id):
    """Build a ``Place`` object from the Wikidata entity ``entity_id``.

    The entity is fetched through ``get_entity_dict_from_api`` and wrapped in
    a ``WikidataItem``. Its Swedish label becomes the place name and title;
    labels, aliases and sitelinks for a fixed list of languages are attached
    as alternative names and URLs; "instance of" claims select the place
    type; coordinate claims set latitude and longitude.

    Args:
        entity_id: Wikidata entity identifier; also stored as the Gramps id.

    Returns:
        A ``(place, parents)`` tuple, where ``parents`` is the set that was
        handed to ``extract_located_in`` for each "located in" property
        (presumably filled in by that helper -- verify against its
        definition).
    """
    located_in = set()
    item = WikidataItem(get_entity_dict_from_api(entity_id))
    groups = item.get_truthy_claim_groups()

    def make_name(language, value):
        # Small factory for a PlaceName carrying one language/value pair.
        result = PlaceName()
        result.set_language(language)
        result.set_value(value)
        return result

    place = Place()
    place.set_gramps_id(entity_id)

    # Primary name and title both come from the Swedish label.
    primary = PlaceName()
    primary.set_language('sv')
    primary.set_value(item.get_label('sv'))
    place.set_name(name=primary)
    place.set_title(item.get_label('sv'))

    # NOTE(review): the source this was reconstructed from had lost its
    # indentation; the alias and sitelink loops are assumed to run once per
    # language regardless of whether a label exists -- confirm.
    for language in ['sv', 'en', 'de', 'fi', 'no', 'nn', 'da', 'se']:
        label = item.get_label(language)
        if len(label) > 0:
            place.add_alternative_name(name=make_name(language, label))

        for alias in item.get_aliases(language):
            place.add_alternative_name(name=make_name(language, alias))

        for sitelink in item.get_sitelinks(language).values():
            url = Url()
            url.set_path(sitelink['url'])
            url.set_type('Wikipedia entry')
            url.set_description(
                'Wikipedia %s:%s' % (sitelink["title"], sitelink["site"]))
            place.add_url(url)

    # Instance of -> PlaceType
    if PROPERTY_INSTANCE_OF in groups:
        # Ordered (item, type) pairs; the first matching item wins for each
        # claim, and a later claim overrides the type set by an earlier one.
        type_table = (
            (ITEM_PARISH, PlaceType.PARISH),
            (ITEM_SOCKEN, PlaceType.PARISH),
            (ITEM_ISLAND, PlaceType.UNKNOWN),  # No islands in Gramps
            (ITEM_MUNICIPALITY_OF_SWEDEN, PlaceType.MUNICIPALITY),
            (ITEM_MUNICIPALITY, PlaceType.MUNICIPALITY),
            (ITEM_COUNTRY, PlaceType.COUNTRY),
            (ITEM_SOVEREIGN_STATE, PlaceType.COUNTRY),
            (ITEM_STATE_OF_US, PlaceType.STATE),
            (ITEM_FEDERAL_STATE, PlaceType.STATE),
            (ITEM_COUNTY, PlaceType.COUNTY),
            (ITEM_COUNTY_OF_SWEDEN, PlaceType.COUNTY),
            (ITEM_FORMER_COUNTY_OF_SWEDEN, PlaceType.COUNTY),
            (ITEM_PROVINCE_OF_SWEDEN, PlaceType.PROVINCE),
            (ITEM_PROVINCE, PlaceType.PROVINCE),
            (ITEM_ADM_REGION, PlaceType.REGION),
            (ITEM_NEIGHBORHOOD, PlaceType.NEIGHBORHOOD),
            (ITEM_DISTRICT, PlaceType.DISTRICT),
            (ITEM_BOROUGH, PlaceType.BOROUGH),
            (ITEM_TOWN, PlaceType.TOWN),
            (ITEM_LARGE_VILLAGE, PlaceType.VILLAGE),
            (ITEM_VILLAGE, PlaceType.VILLAGE),
            (ITEM_URBAN_AREA_IN_SWEDEN, PlaceType.VILLAGE),
            (ITEM_HAMLET, PlaceType.HAMLET),
            (ITEM_FARM, PlaceType.FARM),
            (ITEM_BUILDING, PlaceType.BUILDING),
        )
        for claim in groups[PROPERTY_INSTANCE_OF]:
            instance_id = claim.mainsnak.datavalue.value['id']
            for item_id, place_type in type_table:
                if item_id == instance_id:
                    place.set_type(place_type)
                    break

    # Coordinates are stored on the place as strings.
    if PROPERTY_COORDINATE_LOCATION in groups:
        for claim in groups[PROPERTY_COORDINATE_LOCATION]:
            coordinates = claim.mainsnak.datavalue.value
            place.set_latitude(str(coordinates['latitude']))
            place.set_longitude(str(coordinates['longitude']))

    # Collect parent places from each "located in" style property, in order.
    for located_property in (PROPERTY_LOCATED_IN_PRESENT,
                             PROPERTY_LOCATED_IN_ADM,
                             PROPERTY_LOCATED):
        extract_located_in(groups, located_property, located_in)

    return place, located_in