def test_get_claim_1(self) -> None:
    """Assert correct behavior."""
    q42_dict = _load_item_dict(typedefs.ItemId("Q42"))
    given_name_douglas = "Q463035"
    given_name_noel = "Q19688263"
    item = WikidataItem(q42_dict)
    claim_group = item.get_claim_group(typedefs.PropertyId("P735"))
    assert len(claim_group) == 2
    given_names = {cl.mainsnak.datavalue.value["id"] for cl in claim_group}
    assert given_names == {given_name_douglas, given_name_noel}
def has_occupation_politician(item: WikidataItem, truthy: bool = True) -> bool:
    """Return True if the Wikidata item has occupation politician."""
    if truthy:
        claim_group = item.get_truthy_claim_group(P_OCCUPATION)
    else:
        claim_group = item.get_claim_group(P_OCCUPATION)
    occupation_qids = [
        claim.mainsnak.datavalue.value["id"]
        for claim in claim_group
        if claim.mainsnak.snaktype == "value"
    ]
    return Q_POLITICIAN in occupation_qids
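A minimal usage sketch for the function above, fetching one item from the live API instead of iterating a dump; the P_OCCUPATION and Q_POLITICIAN constants are the ones the function assumes but does not define.

# Usage sketch (assumed constants; not part of the original snippet)
from qwikidata.entity import WikidataItem
from qwikidata.linked_data_interface import get_entity_dict_from_api

P_OCCUPATION = "P106"    # occupation
Q_POLITICIAN = "Q82955"  # politician

item = WikidataItem(get_entity_dict_from_api("Q42"))  # Douglas Adams
print(has_occupation_politician(item))  # False: a writer, not a politician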
def test_get_truthy_claim_1(self) -> None:
    """Assert correct behavior with one preferred and one normal claim."""
    q42_dict = _load_item_dict(typedefs.ItemId("Q42"))
    given_name_douglas = "Q463035"
    item = WikidataItem(q42_dict)
    truthy_claim_group = item.get_truthy_claim_group(typedefs.PropertyId("P735"))
    assert len(truthy_claim_group) == 1
    claim = truthy_claim_group[0]
    mainsnak = claim.mainsnak
    datavalue = mainsnak.datavalue
    assert isinstance(datavalue, WikibaseEntityId)
    qid = datavalue.value["id"]
    assert qid == given_name_douglas
def get_matched_wkd_entities(wktitles: List[str]) -> Dict[str, WikidataItem]:
    def is_matched(q: WikidataItem) -> bool:
        # Must have a Chinese label
        if q.get_label("zh") == "":
            print(f'Skip, no zh label: {q.get_enwiki_title()}')
            return False
        # Entity must not be a person
        cg = q.get_claim_group("P31")  # P31: instance of
        instanceof = [
            c.mainsnak.datavalue.value['id']
            for c in cg
            if c.mainsnak.snaktype == "value"
        ]
        if "Q5" in instanceof:  # Q5: human
            print(f'Skip, is a person: {q.get_enwiki_title()}')
            return False
        # Entity must not have a location claim
        cg = q.get_claim_group("P625")  # P625: coordinate location
        if cg.property_id is not None:
            print(f'Skip, has coordinate location: {q.get_enwiki_title()}')
            return False
        return True

    matched = {}
    for wktitle in wktitles:
        try:
            doc = es.WikiData.get(id=wktitle)
            q = WikidataItem(json.loads(doc.json))
            if is_matched(q):
                matched[wktitle] = q
        except elasticsearch.NotFoundError:
            print(f"Not found wikidata: {wktitle}")
    return matched
async def get_fact(query, args, tokenizer, trex_set, common_vocab, f_out):
    """Collect more facts for the TREx-train set from LPAQA."""
    line = query.strip().split('\t')
    sub_url, sub, obj_url, obj = line
    sub_id = get_id_from_url(sub_url)
    obj_id = get_id_from_url(obj_url)

    # First, make sure the fact is not in the TREx test set
    if (sub_id, obj_id) in trex_set:
        return

    # Make sure the object is a single token
    if len(tokenizer.tokenize(obj)) != 1:
        return

    # Make sure the object is in the common vocab subset
    if obj not in common_vocab:
        return

    # Make sure the subject is prominent (has a Wikipedia page)
    try:
        q_dict = get_entity_dict_from_api(sub_id)
        q = WikidataItem(q_dict)
        if not q.get_sitelinks():
            return
    except ValueError:
        return

    # Some entities don't have labels, so the subject label is the URI
    if sub_id == sub:
        return

    # print('Writing fact: {} - {}', sub, obj)
    f_out.write(
        json.dumps({
            'sub_uri': sub_id,
            'obj_uri': obj_id,
            'sub_label': sub,
            'obj_label': obj,
        }) + '\n')

    # Increment global count
    await increment_count()
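The get_id_from_url helper is not shown in this snippet; a plausible one-line sketch, assuming TREx-style entity URLs that end in the QID (the helper body is an assumption, not from the source).

# Hypothetical helper, assuming URLs like "http://www.wikidata.org/entity/Q42"
def get_id_from_url(url: str) -> str:
    return url.rstrip("/").rsplit("/", 1)[-1]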
def process(ii, entity_dict):
    if args.start > ii:
        return []
    if entity_dict["type"] == "item":
        return self.handle_entity(WikidataItem(entity_dict))
    elif entity_dict["type"] == "property":
        return self.handle_property(WikidataProperty(entity_dict))
    else:
        return []
def import_from_wkddump(dump_path: str, skip: int = 0, first_n: Optional[int] = None) -> None:
    for i, entity_dict in enumerate(WikidataJsonDump(dump_path)):
        if first_n is not None and i > first_n:
            print(f"Early stop at {first_n}")
            break
        if i < skip:
            continue
        if i % 10000 == 0:
            print(i)
        if entity_dict["type"] == "item":
            e = WikidataItem(entity_dict)
            doc = WikiData(
                en_title=e.get_enwiki_title(),
                wkd_id=e.entity_id,
                json=json.dumps(e._entity_dict),
            )
            doc.save()
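The WikiData document class saved above (and read back with es.WikiData.get in the earlier get_matched_wkd_entities snippet) is not shown; a minimal elasticsearch-dsl sketch, where the field types and index name are assumptions.

# Hypothetical elasticsearch-dsl mapping for the WikiData document used above;
# field types and the index name are assumptions, not taken from the source.
from elasticsearch_dsl import Document, Keyword, Text

class WikiData(Document):
    en_title = Text(fields={'keyword': Keyword()})  # English Wikipedia title
    wkd_id = Keyword()                              # Wikidata QID, e.g. "Q42"
    json = Text(index=False)                        # raw entity JSON blob

    class Index:
        name = 'wikidata'  # assumed index name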
def test_get_aliases_1(self) -> None:
    """Assert correct behavior in get_aliases method."""
    q42_dict = _load_item_dict(typedefs.ItemId("Q42"))
    en_aliases = [el["value"] for el in q42_dict["aliases"][EN]]
    de_aliases = [el["value"] for el in q42_dict["aliases"][DE]]
    item = WikidataItem(q42_dict)
    assert item.get_aliases() == en_aliases
    assert item.get_aliases(lang=EN) == en_aliases
    assert item.get_aliases(lang=DE) == de_aliases
    assert item.get_aliases(lang=NO) == []

    p279_dict = _load_property_dict(typedefs.PropertyId("P279"))
    en_aliases = [el["value"] for el in p279_dict["aliases"][EN]]
    de_aliases = [el["value"] for el in p279_dict["aliases"][DE]]
    prop = WikidataProperty(p279_dict)
    assert prop.get_aliases() == en_aliases
    assert prop.get_aliases(lang=EN) == en_aliases
    assert prop.get_aliases(lang=DE) == de_aliases
    assert prop.get_aliases(lang=NO) == []
def test_get_description_1(self) -> None:
    """Assert correct behavior in get_description method."""
    q42_dict = _load_item_dict(typedefs.ItemId("Q42"))
    en_description = q42_dict["descriptions"][EN]["value"]
    de_description = q42_dict["descriptions"][DE]["value"]
    item = WikidataItem(q42_dict)
    assert item.get_description() == en_description
    assert item.get_description(lang=EN) == en_description
    assert item.get_description(lang=DE) == de_description
    assert item.get_description(lang=NO) == ""

    p279_dict = _load_property_dict(typedefs.PropertyId("P279"))
    en_description = p279_dict["descriptions"][EN]["value"]
    de_description = p279_dict["descriptions"][DE]["value"]
    prop = WikidataProperty(p279_dict)
    assert prop.get_description() == en_description
    assert prop.get_description(lang=EN) == en_description
    assert prop.get_description(lang=DE) == de_description
    assert prop.get_description(lang=NO) == ""
def test_get_label_1(self) -> None:
    """Assert correct behavior in get_label method."""
    q42_dict = _load_item_dict(typedefs.ItemId("Q42"))
    en_label = q42_dict["labels"][EN]["value"]
    de_label = q42_dict["labels"][DE]["value"]
    item = WikidataItem(q42_dict)
    assert item.get_label() == en_label
    assert item.get_label(lang=EN) == en_label
    assert item.get_label(lang=DE) == de_label
    assert item.get_label(lang=NO) == ""

    p279_dict = _load_property_dict(typedefs.PropertyId("P279"))
    en_label = p279_dict["labels"][EN]["value"]
    de_label = p279_dict["labels"][DE]["value"]
    prop = WikidataProperty(p279_dict)
    assert prop.get_label() == en_label
    assert prop.get_label(lang=EN) == en_label
    assert prop.get_label(lang=DE) == de_label
    assert prop.get_label(lang=NO) == ""
def main(search_term):
    wikipedia = MediaWiki(lang='pap', user_agent='code-for-nl-pap-parser')
    wikidata = MediaWiki(url='https://www.wikidata.org/w/api.php',
                         user_agent='code-for-nl-pap-parser')

    search_result = wikipedia.search(search_term, results=4)

    for result_item in search_result:
        page = wikipedia.page(result_item)
        print(
            "I found page '%s' for term '%s'" % (result_item, search_term),
            'with categories', '/'.join(page.categories),
            'https://pap.wikipedia.org/wiki/' + urllib.parse.quote(result_item))
        # print(page.images)

        # Search for this page on wikidata; this returns a code like Q215887
        search_data = wikidata.search(result_item, results=1)

        for data_item in search_data:
            Q_CODE = data_item
            print(result_item, 'is known on wikidata with the code', Q_CODE,
                  'https://www.wikidata.org/wiki/' + Q_CODE)
            # Now try the qwikidata interface
            entity = get_entity_dict_from_api(Q_CODE)
            q = WikidataItem(entity)
            pap_data_label = q.get_label(lang='pap')
            nl_data_label = q.get_label(lang='nl')
            if pap_data_label and nl_data_label:
                # First get the page, then read the images found
                data_page = wikidata.page(result_item)
                # print(data_page.images)
                print(pap_data_label, 'is called', nl_data_label, 'in Dutch')
            elif pap_data_label and not nl_data_label:
                print(pap_data_label, 'has no entry for Dutch!')
            elif not pap_data_label and nl_data_label:
                print(Q_CODE, 'does not match the Papiamentu entry')
            else:
                # The original printed the (empty) pap label here; the Q-code
                # is the only useful identifier at this point
                print(Q_CODE, 'has no entry for Dutch or Papiamentu!')
def fill(X):
    name = X['name']
    # `name is NaN` only matches the exact NaN singleton;
    # pd.isna handles all missing values
    if pd.isna(name):
        if 'official_name' in X['tags']:
            name = X['tags']['official_name']
        elif 'operator' in X['tags']:
            name = X['tags']['operator']
        elif 'brand:wikidata' in X['tags']:
            wikidata = X['tags']['brand:wikidata']
            q_dict = get_entity_dict_from_api(wikidata)
            name = WikidataItem(q_dict).get_label()
        elif 'brand:wikipedia' in X['tags']:
            wikipedia = X['tags']['brand:wikipedia']
            name = wikipedia[3:]  # strip the "xx:" language prefix
    return name
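A hypothetical driver for fill, assuming it is applied row-wise to a pandas DataFrame of OSM elements; the sample rows (and the Starbucks QID Q37158) are for illustration only, and the brand:wikidata branch performs a live API call.

# Hypothetical usage; sample rows are invented for illustration
import numpy as np
import pandas as pd

df = pd.DataFrame([
    {'name': np.nan, 'tags': {'brand:wikidata': 'Q37158'}},  # label fetched from Wikidata
    {'name': 'Corner Cafe', 'tags': {}},                     # existing name kept
])
df['name'] = df.apply(fill, axis=1)
print(df['name'].tolist())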
# create an instance of WikidataJsonDump
wjd_dump_path = "wikidata-20190401-all.json.bz2"
wjd = WikidataJsonDump(wjd_dump_path)

# create an iterable of WikidataItem representing politicians
politicians = []
t1 = time.time()
for ii, entity_dict in enumerate(wjd):
    if entity_dict["type"] == "item":
        entity = WikidataItem(entity_dict)
        if has_occupation_politician(entity):
            politicians.append(entity)

    if ii % 1000 == 0:
        t2 = time.time()
        dt = t2 - t1
        print(
            "found {} politicians among {} entities [entities/s: {:.2f}]".format(
                len(politicians), ii, ii / dt
            )
        )

    if ii > 10000:
        break
for monument in monument_list:
    count_mon += 1
    print(count_mon)
    if count_mon % 4 == 0:
        print("HELLO")
        partition_num = str(count_mon // 4)
        with open('../../Downloads/hi_monument_english_labels' +
                  partition_num + '.json', 'w') as fout:
            json.dump(complete_final_monument_list, fout)
        complete_final_monument_list = []
        print("Checkpoint %d reached, JSON dumps saved |" % (count_mon // 4))
    for key, val in monument.items():
        if key == "title":
            monument['title'] = WikidataItem(
                get_entity_dict_from_api(str(val))).get_label()
        elif key == "id":
            monument['id'] = WikidataItem(
                get_entity_dict_from_api(str(val))).get_label()
    for key in monument['claims'].keys():
        if key == "P727":
            continue
        property_list.append(
            WikidataProperty(get_entity_dict_from_api(key)).get_label())
    final_monument_list = dict(
        zip(property_list, list(monument['claims'].values())))
    monument['claims'].clear()
    monument['claims'].update(final_monument_list)
    property_list = []
def get_place_from_wikidata(entity_id):
    parents = set()
    entity = WikidataItem(get_entity_dict_from_api(entity_id))
    claims_groups = entity.get_truthy_claim_groups()

    place = Place()
    place.set_gramps_id(entity_id)

    name = PlaceName()
    name.set_language('sv')
    name.set_value(entity.get_label('sv'))
    place.set_name(name=name)
    place.set_title(entity.get_label('sv'))

    for lang in ['sv', 'en', 'de', 'fi', 'no', 'nn', 'da', 'se']:
        wiki_name = entity.get_label(lang)
        if len(wiki_name):
            place_name = PlaceName()
            place_name.set_language(lang)
            place_name.set_value(wiki_name)
            place.add_alternative_name(name=place_name)
        for alias in entity.get_aliases(lang):
            alt_name = PlaceName()
            alt_name.set_language(lang)
            alt_name.set_value(alias)
            place.add_alternative_name(name=alt_name)
        for link in entity.get_sitelinks(lang).values():
            wikipedia_url = Url()
            wikipedia_url.set_path(link['url'])
            wikipedia_url.set_type('Wikipedia entry')
            wikipedia_url.set_description('Wikipedia %s:%s' %
                                          (link["title"], link["site"]))
            place.add_url(wikipedia_url)

    # Instance of (P31) -> Gramps PlaceType
    INSTANCE_OF_TO_PLACE_TYPE = {
        ITEM_PARISH: PlaceType.PARISH,
        ITEM_SOCKEN: PlaceType.PARISH,
        ITEM_ISLAND: PlaceType.UNKNOWN,  # no islands in Gramps
        ITEM_MUNICIPALITY_OF_SWEDEN: PlaceType.MUNICIPALITY,
        ITEM_MUNICIPALITY: PlaceType.MUNICIPALITY,
        ITEM_COUNTRY: PlaceType.COUNTRY,
        ITEM_SOVEREIGN_STATE: PlaceType.COUNTRY,
        ITEM_STATE_OF_US: PlaceType.STATE,
        ITEM_FEDERAL_STATE: PlaceType.STATE,
        ITEM_COUNTY: PlaceType.COUNTY,
        ITEM_COUNTY_OF_SWEDEN: PlaceType.COUNTY,
        ITEM_FORMER_COUNTY_OF_SWEDEN: PlaceType.COUNTY,
        ITEM_PROVINCE_OF_SWEDEN: PlaceType.PROVINCE,
        ITEM_PROVINCE: PlaceType.PROVINCE,
        ITEM_ADM_REGION: PlaceType.REGION,
        ITEM_NEIGHBORHOOD: PlaceType.NEIGHBORHOOD,
        ITEM_DISTRICT: PlaceType.DISTRICT,
        ITEM_BOROUGH: PlaceType.BOROUGH,
        ITEM_TOWN: PlaceType.TOWN,
        ITEM_LARGE_VILLAGE: PlaceType.VILLAGE,
        ITEM_VILLAGE: PlaceType.VILLAGE,
        ITEM_URBAN_AREA_IN_SWEDEN: PlaceType.VILLAGE,
        ITEM_HAMLET: PlaceType.HAMLET,
        ITEM_FARM: PlaceType.FARM,
        ITEM_BUILDING: PlaceType.BUILDING,
    }
    if PROPERTY_INSTANCE_OF in claims_groups:
        for claim in claims_groups[PROPERTY_INSTANCE_OF]:
            instance_of = claim.mainsnak.datavalue.value['id']
            if instance_of in INSTANCE_OF_TO_PLACE_TYPE:
                place.set_type(INSTANCE_OF_TO_PLACE_TYPE[instance_of])

    if PROPERTY_COORDINATE_LOCATION in claims_groups:
        for claim in claims_groups[PROPERTY_COORDINATE_LOCATION]:
            datavalue = claim.mainsnak.datavalue
            place.set_latitude(str(datavalue.value['latitude']))
            place.set_longitude(str(datavalue.value['longitude']))

    extract_located_in(claims_groups, PROPERTY_LOCATED_IN_PRESENT, parents)
    extract_located_in(claims_groups, PROPERTY_LOCATED_IN_ADM, parents)
    extract_located_in(claims_groups, PROPERTY_LOCATED, parents)

    return place, parents
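The extract_located_in helper called above is not defined in this snippet; a minimal sketch under the assumption that it collects parent-place QIDs from one claim group into the parents set.

# Hypothetical sketch of the extract_located_in helper used above;
# assumed behavior: collect parent-place QIDs from one claim group.
def extract_located_in(claims_groups, property_id, parents):
    if property_id in claims_groups:
        for claim in claims_groups[property_id]:
            if claim.mainsnak.snaktype == "value":
                parents.add(claim.mainsnak.datavalue.value["id"])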
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api

# create an item representing "Douglas Adams"
Q_DOUGLAS_ADAMS = "Q42"
q42_dict = get_entity_dict_from_api(Q_DOUGLAS_ADAMS)
q42 = WikidataItem(q42_dict)

# create a property representing "subclass of"
P_SUBCLASS_OF = "P279"
p279_dict = get_entity_dict_from_api(P_SUBCLASS_OF)
p279 = WikidataProperty(p279_dict)

# create a lexeme representing "bank"
L_BANK = "L3354"
l3354_dict = get_entity_dict_from_api(L_BANK)
l3354 = WikidataLexeme(l3354_dict)
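As a brief follow-up, the accessors exercised in the test snippets earlier can read the fetched entities back; a minimal sketch, where the printed values reflect the live Wikidata records at the time of the API call.

# Read back labels/descriptions from the objects created above
print(q42.entity_id, '->', q42.get_label())    # Q42 -> Douglas Adams
print(p279.entity_id, '->', p279.get_label())  # P279 -> subclass of
print(q42.get_description(lang='de'))          # German description, '' if absent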
codeCount += 1
print('Processed', codeCount, 'Q-codes.')

timeStart = perf_counter()
cnt = 0
for name in codeList:
    cnt += 1
    try:
        personDict = get_entity_dict_from_api(name)  # fetch this Q-code
    except Exception:
        # Bare except in the original; Exception keeps the intent without
        # swallowing KeyboardInterrupt/SystemExit
        with open("missingCodes.txt", "a") as missingCodes:
            missingCodes.write(name + '\n')
        continue
    person = WikidataItem(personDict)
    # Get the person's different Wikidata attributes
    claim_groups = person.get_truthy_claim_groups()
    try:
        # Grab the person's education (P69: educated at)
        eduGroups = claim_groups["P69"]
        foundCount += 1
    except KeyError:
        print(str(cnt) + ".", "Education not there for", person.get_label())
        missingCount += 1
        if cnt % 10 == 0:
            readyToAppend = True
        continue
    eduEntries = len(eduGroups)  # How many different entries there are