예제 #1
0
 def test_get_claim_1(self) -> None:
     """Check that P735 (given name) on Q42 yields both expected claims."""
     item = WikidataItem(_load_item_dict(typedefs.ItemId("Q42")))
     claim_group = item.get_claim_group(typedefs.PropertyId("P735"))
     assert len(claim_group) == 2
     # Douglas (Q463035) and Noel (Q19688263) are Adams' two given names.
     expected = {"Q463035", "Q19688263"}
     actual = {claim.mainsnak.datavalue.value["id"] for claim in claim_group}
     assert actual == expected
예제 #2
0
def has_occupation_politician(item: WikidataItem, truthy: bool = True) -> bool:
    """Return True if the Wikidata Item has occupation politician."""
    # Select the claim-group accessor: truthy claims filter out
    # deprecated/non-preferred ranks.
    get_group = item.get_truthy_claim_group if truthy else item.get_claim_group
    return any(
        claim.mainsnak.datavalue.value["id"] == Q_POLITICIAN
        for claim in get_group(P_OCCUPATION)
        if claim.mainsnak.snaktype == "value"
    )
예제 #3
0
 def test_get_truthy_claim_1(self) -> None:
     """Check truthy filtering keeps only the preferred-rank claim."""
     item = WikidataItem(_load_item_dict(typedefs.ItemId("Q42")))
     truthy_group = item.get_truthy_claim_group(typedefs.PropertyId("P735"))
     assert len(truthy_group) == 1
     datavalue = truthy_group[0].mainsnak.datavalue
     assert isinstance(datavalue, WikibaseEntityId)
     # Only "Douglas" (Q463035) carries preferred rank.
     assert datavalue.value["id"] == "Q463035"
예제 #4
0
def get_matched_wkd_entities(wktitles: List[str]) -> Dict[str, WikidataItem]:
    """Look up each wiki title in Elasticsearch and return the items that match.

    An item matches when it has a Chinese label, is not a human, and
    carries no coordinate-location claim.
    """

    def is_matched(q: WikidataItem) -> bool:
        # Reject entities without a Chinese label.
        if q.get_label("zh") == "":
            print(f'Skip, no zh label: {q.get_enwiki_title()}')
            return False

        # Reject entities that are instances of human.
        instance_ids = {c.mainsnak.datavalue.value['id']
                        for c in q.get_claim_group("P31")}  # P31:instance_of
        if "Q5" in instance_ids:  # Q5:human
            print(f'Skip, is a person: {q.get_enwiki_title()}')
            return False

        # Reject entities carrying a coordinate-location claim.
        location_group = q.get_claim_group("P625")  # P625:coordinate_location
        if location_group.property_id is not None:
            print(f'Skip, has coordinate location: {q.get_enwiki_title()}')
            return False

        return True

    matched: Dict[str, WikidataItem] = {}
    for title in wktitles:
        try:
            doc = es.WikiData.get(id=title)
            item = WikidataItem(json.loads(doc.json))
            if is_matched(item):
                matched[title] = item
        except elasticsearch.NotFoundError:
            print(f"Not found wikidata: {title}")

    return matched
예제 #5
0
async def get_fact(query, args, tokenizer, trex_set, common_vocab, f_out):
    """
    Collect more facts for the TREx-train set from LPAQA.

    Parses one tab-separated line ``<sub_url>\\t<sub>\\t<obj_url>\\t<obj>``,
    applies a series of filters, and appends the surviving fact to *f_out*
    as a single JSON object per line.

    NOTE(review): ``args`` is unused in this body — confirm whether callers
    rely on the parameter being accepted.
    """
    # Fields: subject URL, subject label, object URL, object label.
    line = query.strip().split('\t')
    sub_url, sub, obj_url, obj = line
    sub_id = get_id_from_url(sub_url)
    obj_id = get_id_from_url(obj_url)

    # First, make sure fact is not in TREx test set
    if (sub_id, obj_id) in trex_set:
        return

    # Make sure object is a single token
    if len(tokenizer.tokenize(obj)) != 1:
        return

    # Make sure object is in common vocab subset
    if obj not in common_vocab:
        return

    # Make sure subject is prominent (has a Wikipedia page).
    # The API lookup is deliberately last among the filters: it is the only
    # expensive (network) check.
    try:
        q_dict = get_entity_dict_from_api(sub_id)
        q = WikidataItem(q_dict)
        if not q.get_sitelinks():
            return
    except ValueError:
        return

    # Some entities don't have labels so the subject label is the URI
    if sub_id == sub:
        return

    # print('Writing fact: {} - {}', sub, obj)
    f_out.write(
        json.dumps({
            'sub_uri': sub_id,
            'obj_uri': obj_id,
            'sub_label': sub,
            'obj_label': obj
        }) + '\n')

    # Increment global count
    await increment_count()
예제 #6
0
        def process(ii, entity_dict):
            """Dispatch one dump entry to the matching handler.

            Entries before the configured start index, and entity types
            other than item/property, yield an empty result.
            """
            if ii < args.start:
                return []

            kind = entity_dict["type"]
            if kind == "item":
                return self.handle_entity(WikidataItem(entity_dict))
            if kind == "property":
                return self.handle_property(WikidataProperty(entity_dict))
            return []
예제 #7
0
    def is_matched(q: WikidataItem) -> bool:
        """Return True when the entity is a usable (non-human, non-place) match."""
        # Reject entities without a Chinese label.
        if q.get_label("zh") == "":
            print(f'Skip, no zh label: {q.get_enwiki_title()}')
            return False

        # Reject entities that are instances of human.
        instance_ids = {c.mainsnak.datavalue.value['id']
                        for c in q.get_claim_group("P31")}  # P31:instance_of
        if "Q5" in instance_ids:  # Q5:human
            print(f'Skip, is a person: {q.get_enwiki_title()}')
            return False

        # Reject entities carrying a coordinate-location claim.
        location_group = q.get_claim_group("P625")  # P625:coordinate_location
        if location_group.property_id is not None:
            print(f'Skip, has coordinate location: {q.get_enwiki_title()}')
            return False

        return True
예제 #8
0
def import_from_wkddump(dump_path: str,
                        skip: int = 0,
                        first_n: "int | None" = None) -> None:
    """Stream item entities from a Wikidata JSON dump into Elasticsearch.

    Args:
        dump_path: Path to the (possibly bz2/gz compressed) Wikidata JSON dump.
        skip: Number of leading entities to skip before indexing starts.
        first_n: Stop once the running index exceeds this value; ``None``
            (the default) processes the whole dump.
            (Annotation fixed: the old ``int`` annotation contradicted the
            ``None`` default.)
    """
    for i, entity_dict in enumerate(WikidataJsonDump(dump_path)):
        if first_n is not None and i > first_n:
            print(f"Early stop at {first_n}")
            break
        if i < skip:
            continue
        if i % 10000 == 0:
            print(i)  # progress heartbeat
        if entity_dict["type"] != "item":
            continue
        entity = WikidataItem(entity_dict)
        doc = WikiData(
            en_title=entity.get_enwiki_title(),
            wkd_id=entity.entity_id,
            # Serialize the raw dict directly rather than reaching into the
            # private WikidataItem._entity_dict attribute (same data).
            json=json.dumps(entity_dict),
        )
        doc.save()
예제 #9
0
    def test_get_aliases_1(self) -> None:
        """Assert get_aliases honors the lang argument for items and properties."""
        q42_dict = _load_item_dict(typedefs.ItemId("Q42"))
        p279_dict = _load_property_dict(typedefs.PropertyId("P279"))

        for entity, source in (
                (WikidataItem(q42_dict), q42_dict),
                (WikidataProperty(p279_dict), p279_dict),
        ):
            aliases_en = [entry["value"] for entry in source["aliases"][EN]]
            aliases_de = [entry["value"] for entry in source["aliases"][DE]]
            # English is the default language.
            assert entity.get_aliases() == aliases_en
            assert entity.get_aliases(lang=EN) == aliases_en
            assert entity.get_aliases(lang=DE) == aliases_de
            # A language with no aliases yields an empty list.
            assert entity.get_aliases(lang=NO) == []
예제 #10
0
    def test_get_description_1(self) -> None:
        """Assert get_description honors the lang argument for items and properties."""
        q42_dict = _load_item_dict(typedefs.ItemId("Q42"))
        p279_dict = _load_property_dict(typedefs.PropertyId("P279"))

        for entity, source in (
                (WikidataItem(q42_dict), q42_dict),
                (WikidataProperty(p279_dict), p279_dict),
        ):
            desc_en = source["descriptions"][EN]["value"]
            desc_de = source["descriptions"][DE]["value"]
            # English is the default language.
            assert entity.get_description() == desc_en
            assert entity.get_description(lang=EN) == desc_en
            assert entity.get_description(lang=DE) == desc_de
            # A language with no description yields an empty string.
            assert entity.get_description(lang=NO) == ""
예제 #11
0
    def test_get_label_1(self) -> None:
        """Assert get_label honors the lang argument for items and properties."""
        q42_dict = _load_item_dict(types.ItemId("Q42"))
        p279_dict = _load_property_dict(types.PropertyId("P279"))

        for entity, source in (
                (WikidataItem(q42_dict), q42_dict),
                (WikidataProperty(p279_dict), p279_dict),
        ):
            label_en = source["labels"][EN]["value"]
            label_de = source["labels"][DE]["value"]
            # English is the default language.
            assert entity.get_label() == label_en
            assert entity.get_label(lang=EN) == label_en
            assert entity.get_label(lang=DE) == label_de
            # A language with no label yields an empty string.
            assert entity.get_label(lang=NO) == ""
def main(search_term):
    """Search Papiamento Wikipedia for *search_term* and cross-check labels on Wikidata.

    For each of the top search hits, resolves the article to a Wikidata
    Q-code and reports whether the entity carries Papiamento ('pap') and
    Dutch ('nl') labels.
    """
    wikipedia = MediaWiki(lang='pap', user_agent='code-for-nl-pap-parser')
    wikidata = MediaWiki(url='https://www.wikidata.org/w/api.php',
                         user_agent='code-for-nl-pap-parser')

    search_result = wikipedia.search(search_term, results=4)

    for result_item in search_result:
        page = wikipedia.page(result_item)
        print(
            'I found page \'%s\' for term \'%s\'' % (result_item, search_term),
            'with categories', '/'.join(page.categories),
            'https://pap.wikipedia.org/wiki/' +
            urllib.parse.quote(result_item))

        # Resolve the article to a Wikidata code, e.g. Q215887.
        search_data = wikidata.search(result_item, results=1)

        for data_item in search_data:
            Q_CODE = data_item
            print(result_item, 'is known on wikidata with the code', Q_CODE,
                  'https://www.wikidata.org/wiki/' + Q_CODE)
            # Now try the qwikidata interface.
            entity = get_entity_dict_from_api(Q_CODE)
            q = WikidataItem(entity)
            pap_data_label = q.get_label(lang='pap')
            nl_data_label = q.get_label(lang='nl')
            if pap_data_label and nl_data_label:
                # Fetch the Wikidata page (kept for its images/side effects).
                data_page = wikidata.page(result_item)

                print(pap_data_label, 'is called', nl_data_label, 'in dutch')
            elif pap_data_label and not nl_data_label:
                print(pap_data_label, 'has no entry for dutch!')
            elif not pap_data_label and nl_data_label:
                print(Q_CODE, 'does not match papiamentu entry')
            else:
                # BUG FIX: this branch used to print the empty pap_data_label;
                # identify the entity by its Q-code instead.
                print(Q_CODE, 'has no entry for dutch or papiamentu!')
예제 #13
0
def fill(X):
    """Return a name for row ``X``, falling back to tag-based sources.

    Tries, in order: the existing ``name``, the ``official_name`` tag,
    the ``operator`` tag, the Wikidata brand label, and finally the
    Wikipedia brand title with its "xx:" language prefix stripped.
    """
    name = X['name']
    # BUG FIX: the original `name is NaN` identity check only matched the one
    # NaN singleton object it was compared against; NaN is the only value not
    # equal to itself, so self-inequality detects every NaN float.
    if name != name:
        tags = X['tags']
        if 'official_name' in tags:
            name = tags['official_name']
        elif 'operator' in tags:
            name = tags['operator']
        elif 'brand:wikidata' in tags:
            q_dict = get_entity_dict_from_api(tags['brand:wikidata'])
            name = WikidataItem(q_dict).get_label()
        elif 'brand:wikipedia' in tags:
            # Titles look like "en:Acme"; drop the 3-char language prefix.
            name = tags['brand:wikipedia'][3:]
    return name
예제 #14
0
        if claim.mainsnak.snaktype == "value"
    ]
    return Q_POLITICIAN in occupation_qids


# Create an instance of WikidataJsonDump (streams the compressed dump lazily).
wjd_dump_path = "wikidata-20190401-all.json.bz2"
wjd = WikidataJsonDump(wjd_dump_path)

# Collect WikidataItem objects representing politicians.
politicians = []
t1 = time.time()
for ii, entity_dict in enumerate(wjd):

    # Only "item" entries can be politicians; properties/lexemes are skipped.
    if entity_dict["type"] == "item":
        entity = WikidataItem(entity_dict)
        if has_occupation_politician(entity):
            politicians.append(entity)

    # Progress report every 1000 entities.
    # NOTE(review): dt could in principle be ~0.0 on the very first iteration
    # (ii == 0), which would raise ZeroDivisionError — confirm.
    if ii % 1000 == 0:
        t2 = time.time()
        dt = t2 - t1
        print(
            "found {} politicians among {} entities [entities/s: {:.2f}]".format(
                len(politicians), ii, ii / dt
            )
        )

    # Cap this example run at ~10k entities.
    if ii > 10000:
        break
# Resolve each monument's titles/ids/property keys to human-readable labels,
# checkpointing the accumulated results to disk every 4 monuments.
for monument in monument_list:
    count_mon += 1
    print(count_mon)
    if count_mon % 4 == 0:
        print("HELLO")
        # NOTE(review): str(count_mon / 4).split('.')[0] is just integer
        # division; str(count_mon // 4) would express the same directly.
        partition_num = str(count_mon / 4).split('.')[0]
        with open(
                '../../Downloads/hi_monument_english_labels' + partition_num +
                '.json', 'w') as fout:
            json.dump(complete_final_monument_list, fout)
        complete_final_monument_list = []
        print("Checkpoint %d reached, JSON dumps saved |" % (count_mon / 4))

    # Replace the raw Q-codes in 'title' and 'id' with their English labels
    # fetched from the Wikidata API.
    for key, val in monument.items():
        if key == "title":
            monument['title'] = WikidataItem(get_entity_dict_from_api(
                str(val))).get_label()
        elif key == "id":
            monument['id'] = WikidataItem(get_entity_dict_from_api(
                str(val))).get_label()

    # Translate property ids (P-codes) to labels, skipping P727.
    for key in monument['claims'].keys():
        if (key == "P727"):
            continue
        else:
            property_list.append(
                WikidataProperty(get_entity_dict_from_api(key)).get_label())
    # NOTE(review): property_list excludes P727 but monument['claims'].values()
    # still includes P727's value, so this zip pairs labels with the wrong
    # values whenever P727 is present — confirm and fix upstream.
    final_monument_list = dict(
        zip(property_list, list(monument['claims'].values())))
    monument['claims'].clear()
    monument['claims'].update(final_monument_list)
    property_list = []
예제 #16
0
def get_place_from_wikidata(entity_id):
    """Build a Gramps Place from a Wikidata entity.

    Returns a (place, parents) pair where *parents* is the set of
    entity ids the place is located in.
    """
    entity = WikidataItem(get_entity_dict_from_api(entity_id))
    claims_groups = entity.get_truthy_claim_groups()

    parents = set()
    place = Place()
    place.set_gramps_id(entity_id)

    # Primary (Swedish) name and title.
    primary_name = PlaceName()
    primary_name.set_language('sv')
    primary_name.set_value(entity.get_label('sv'))
    place.set_name(name=primary_name)
    place.set_title(entity.get_label('sv'))

    # Alternative names, aliases and Wikipedia links per language.
    for lang in ['sv', 'en', 'de', 'fi', 'no', 'nn', 'da', 'se']:
        wiki_name = entity.get_label(lang)
        if len(wiki_name):
            alt_name = PlaceName()
            alt_name.set_language(lang)
            alt_name.set_value(wiki_name)
            place.add_alternative_name(name=alt_name)
            for alias in entity.get_aliases(lang):
                alias_name = PlaceName()
                alias_name.set_language(lang)
                alias_name.set_value(alias)
                place.add_alternative_name(name=alias_name)

        for link in entity.get_sitelinks(lang).values():
            wikipedia_url = Url()
            wikipedia_url.set_path(link['url'])
            wikipedia_url.set_type('Wikipedia entry')
            wikipedia_url.set_description('Wikipedia %s:%s' %
                                          (link["title"], link["site"]))
            place.add_url(wikipedia_url)

    # Instance of -> PlaceType (lookup table replaces the long elif chain).
    place_type_by_item = {
        ITEM_PARISH: PlaceType.PARISH,
        ITEM_SOCKEN: PlaceType.PARISH,
        ITEM_ISLAND: PlaceType.UNKNOWN,  # No islands in Gramps
        ITEM_MUNICIPALITY_OF_SWEDEN: PlaceType.MUNICIPALITY,
        ITEM_MUNICIPALITY: PlaceType.MUNICIPALITY,
        ITEM_COUNTRY: PlaceType.COUNTRY,
        ITEM_SOVEREIGN_STATE: PlaceType.COUNTRY,
        ITEM_STATE_OF_US: PlaceType.STATE,
        ITEM_FEDERAL_STATE: PlaceType.STATE,
        ITEM_COUNTY: PlaceType.COUNTY,
        ITEM_COUNTY_OF_SWEDEN: PlaceType.COUNTY,
        ITEM_FORMER_COUNTY_OF_SWEDEN: PlaceType.COUNTY,
        ITEM_PROVINCE_OF_SWEDEN: PlaceType.PROVINCE,
        ITEM_PROVINCE: PlaceType.PROVINCE,
        ITEM_ADM_REGION: PlaceType.REGION,
        ITEM_NEIGHBORHOOD: PlaceType.NEIGHBORHOOD,
        ITEM_DISTRICT: PlaceType.DISTRICT,
        ITEM_BOROUGH: PlaceType.BOROUGH,
        ITEM_TOWN: PlaceType.TOWN,
        ITEM_LARGE_VILLAGE: PlaceType.VILLAGE,
        ITEM_VILLAGE: PlaceType.VILLAGE,
        ITEM_URBAN_AREA_IN_SWEDEN: PlaceType.VILLAGE,
        ITEM_HAMLET: PlaceType.HAMLET,
        ITEM_FARM: PlaceType.FARM,
        ITEM_BUILDING: PlaceType.BUILDING,
    }
    if PROPERTY_INSTANCE_OF in claims_groups:
        for claim in claims_groups[PROPERTY_INSTANCE_OF]:
            instance_of = claim.mainsnak.datavalue.value['id']
            if instance_of in place_type_by_item:
                place.set_type(place_type_by_item[instance_of])

    # Coordinate location -> latitude/longitude (last claim wins).
    if PROPERTY_COORDINATE_LOCATION in claims_groups:
        for claim in claims_groups[PROPERTY_COORDINATE_LOCATION]:
            datavalue = claim.mainsnak.datavalue
            place.set_latitude(str(datavalue.value['latitude']))
            place.set_longitude(str(datavalue.value['longitude']))

    # Collect parent entity ids from the "located in" property variants.
    extract_located_in(claims_groups, PROPERTY_LOCATED_IN_PRESENT, parents)
    extract_located_in(claims_groups, PROPERTY_LOCATED_IN_ADM, parents)
    extract_located_in(claims_groups, PROPERTY_LOCATED, parents)

    return place, parents
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api

# Create an item representing "Douglas Adams".
# Each get_entity_dict_from_api call fetches live data from wikidata.org.
Q_DOUGLAS_ADAMS = "Q42"
q42_dict = get_entity_dict_from_api(Q_DOUGLAS_ADAMS)
q42 = WikidataItem(q42_dict)

# Create a property representing "subclass of".
P_SUBCLASS_OF = "P279"
p279_dict = get_entity_dict_from_api(P_SUBCLASS_OF)
p279 = WikidataProperty(p279_dict)

# Create a lexeme representing "bank".
L_BANK = "L3354"
l3354_dict = get_entity_dict_from_api(L_BANK)
l3354 = WikidataLexeme(l3354_dict)
예제 #18
0
    codeCount += 1

# Summary after the Q-code collection loop above.
print('Processed', codeCount, 'Q-codes.')
timeStart = perf_counter()  # start timing the per-person lookups below

cnt = 0  # running count of Q-codes processed in the loop that follows
for name in codeList:
    cnt += 1

    try:
        personDict = get_entity_dict_from_api(name)  # Insert QCode here
    except:
        missingCodes = open("missingCodes.txt", "a")
        missingCodes.write(name + '\n')
        continue
    person = WikidataItem(personDict)

    claim_groups = person.get_truthy_claim_groups(
    )  # Gets a person's different Wikidata attributes
    try:
        eduGroups = claim_groups[
            "P69"]  # Grabs person's education from those attributes
        foundCount += 1
    except:
        print(str(cnt) + ".", "Education not there for", person.get_label())
        missingCount += 1
        if (cnt % 10 == 0):
            readyToAppend = True
        continue
    eduEntries = len(eduGroups)  # How many different entries there are