def get_gnid(rec):
    """
    Enrich a record with a geonames.org link via the public GeoNames API.

    Queries findNearbyJSON with the record's coordinates and, when a
    returned place name and the record name contain each other, appends
    a plain geonames.org URL string to rec["sameAs"].

    :param rec: record dict with "sameAs" (list of strings), "geo"
                (dict with "latitude"/"longitude" string values) and
                "name" fields
    :returns: the enriched record, or None implicitly when nothing changed
    """
    # only act when there is no geonames link yet and coordinates exist
    if not any("http://www.geonames.org" in s for s in rec.get("sameAs")) and rec["geo"].get(
            "latitude") and rec["geo"].get("longitude"):
        changed = False
        r = requests.get("http://api.geonames.org/findNearbyJSON?lat="
                         + rec["geo"].get("latitude") + "&lng="
                         + rec["geo"].get("longitude") + "&username=slublod")
        if r.ok and isiter(r.json().get("geonames")):
            for geoNameRecord in r.json().get("geonames"):
                # substring containment in either direction counts as a hit
                if rec.get("name") in geoNameRecord.get(
                        "name") or geoNameRecord.get("name") in rec.get(
                            "name"):
                    # match!
                    rec["sameAs"] = litter(
                        rec.get("sameAs"), "http://www.geonames.org/"
                        + str(geoNameRecord.get("geonameId")) + "/")
                    changed = True
        else:
            # NOTE(review): assumes the error response still carries a JSON
            # body with status/message - r.json() may raise otherwise; confirm
            if r.json().get("status").get("message").startswith(
                    "the hourly limit") or r.json().get("status").get(
                        "message").startswith("the daily limit"):
                eprint("Limit exceeded!\n")
                # hard stop: the API quota is exhausted, further calls are futile
                exit(0)
        if changed:
            return rec
def get_wdid(_ids, rec):
    """
    gets an list of sameAs Links, e.g.
    ['https://d-nb.info/gnd/118827545',
     'http://swb.bsz-bw.de/DB=2.1/PPNSET?PPN=035143010',
     'http://catalogue.bnf.fr/ark:/12148/cb119027159',
     'http://id.loc.gov/rwo/agents/n50002729',
     'http://isni.org/isni/0000000120960218',
     'http://viaf.org/viaf/44298691']

    Builds a UNION SPARQL query out of the links known in
    lookup_table_wdProperty, asks the Wikidata endpoint and appends
    every matching Wikidata item to rec["sameAs"].

    :param _ids: list of sameAs URI strings; any other type returns None
    :param rec: record dict whose "sameAs" gets enriched
    :returns: the enriched record, or None implicitly when nothing changed
    """
    if not isinstance(_ids, list):
        return None
    changed = False
    url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
    # Define header according to wikidata's User-Agent policy
    # see: https://meta.wikimedia.org/wiki/User-Agent_policy
    headers = {
        'User-Agent': 'efre-lod-enrich-wikidata-bot/0.1 '
                      '(https://github.com/slub/esmarc) '
                      'python-requests/2.22'
    }
    or_mapping = []
    # translate each known authority URI prefix into a ?item wdt:Pxxx clause
    for _id in _ids:
        for key, value in lookup_table_wdProperty.items():
            if _id.startswith(key):
                or_mapping.append("?item wdt:{Property} \"{value}\"".format(
                    Property=value["property"],
                    value=_id.split(value["delim"])[-1]))
                break
    if or_mapping:
        # BUILD an SPARQL OR Query with an UNION Operator.
        # Still builds an normal query without UNION when or_mapping List
        # only contains one element
        query = '''SELECT DISTINCT ?item \nWHERE {{\n\t{{ {UNION} }}\n}}'''.format(
            UNION="} UNION\n\t\t {".join(or_mapping))
        data = requests.get(url, headers=headers, params={
            'query': query,
            'format': 'json'
        })
        if data.ok and len(data.json().get("results").get("bindings")) > 0:
            for item in data.json().get("results").get("bindings"):
                rec["sameAs"] = litter(
                    rec["sameAs"], {
                        "@id": item.get("item").get("value"),
                        "publisher": {
                            "@id": "https://www.wikidata.org/wiki/Q2013",
                            "abbr": "WIKIDATA",
                            "preferredName": "Wikidata"
                        },
                        "isBasedOn": {
                            "@type": "Dataset",
                            "@id": item.get("item").get("value")
                        }
                    })
                changed = True
        elif not data.ok:
            eprint("wikidata: Connection Error {status}: \'{message}\'".format(
                status=data.status_code, message=data.content))
    if changed:
        return rec
def get_subfields(jline, key, entity):
    """
    Dispatch to get_subfield() for a single key or a list of keys.

    :param jline: MARC record as dict
    :param key: MARC field tag (str) or list of tags
    :param entity: passed through to get_subfield()
    :returns: merged result via ArrayOrSingleValue for a list of keys,
              the single get_subfield() result for a str key, else None
    """
    if isinstance(key, str):
        return get_subfield(jline, key, entity)
    if isinstance(key, list):
        collected = []
        for single_key in key:
            collected = litter(collected, get_subfield(jline, single_key, entity))
        return ArrayOrSingleValue(collected)
    return None
def get_subfield_if_4(jline,key,entity):
    """
    Extract a MARC field only when its $4 subfield carries a given code.

    :param jline: MARC record as dict
    :param key: combined spec "FIELD^CODE", e.g. "551^4:orta"
    :param entity: passed through to get_subfields()
    :returns: matching data via ArrayOrSingleValue, or None implicitly
    """
    #e.g. split "551^4:orta" to 551 and orta
    marcfield=key.rsplit("^")[0]
    subfield4=key.rsplit("^")[1]
    data=[]
    if marcfield in jline:
        for array in jline[marcfield]:
            for k,v in array.items():
                # merge all subfield occurrences of this entry into one dict
                sset={}
                for subfield in v:
                    for subfield_code in subfield:
                        sset[subfield_code]=litter(sset.get(subfield_code),subfield[subfield_code])
                if sset.get("4") and subfield4 in sset.get("4"):
                    # work on a deep copy so entries with a different $4
                    # can be stripped without mutating the caller's record
                    newrecord=copy.deepcopy(jline)
                    for i,subtype in enumerate(newrecord[marcfield]):
                        for elem in subtype.get("__"):
                            if elem.get("4") and subfield4!=elem["4"]:
                                del newrecord[marcfield][i]["__"]
                    data=litter(get_subfields(newrecord,marcfield,entity),data)
    if data:
        return ArrayOrSingleValue(data)
def handleaboutelem(attribut, obj):
    """
    Convert one classification element into schema.org "about" object(s).

    Dispatches on the element's "authority" attribute:
    * "rvk"  -> RVK-API link plus a PropertyValue identifier per code
    * "ddc"  -> purl.org decimalised link plus a PropertyValue per code
    * "z"    -> plain text value (with optional @language from "lang")
    * "sswd" -> German ("ger") text value

    :param attribut: name of the source attribute (unused here; kept for
                     the mapping-callback signature)
    :param obj: element dict with "authority", "_" and optional "lang"
    :returns: collected object(s) via litter(), or None when nothing matched
    """
    retobj = []
    # FIX: obj.get("authority") may be None and .lower() then raised an
    # AttributeError; normalize once and reuse.
    authority = (obj.get("authority") or "").lower()
    value = obj.get("_")
    if authority == "rvk" and value:
        # value may hold several comma separated RVK codes
        for rvk in value.split(","):
            retobj = litter(
                retobj, {
                    "@id":
                    "https://rvk.uni-regensburg.de/api/json/ancestors/"
                    + rvk.replace(" ", "%20").strip(),
                    "identifier": {
                        "@type": "PropertyValue",
                        "propertyID": "RVK",
                        "@value": rvk.strip()
                    }
                })
    elif authority == "ddc" and value:
        for ddc in value.split(","):
            retobj = litter(
                retobj, {
                    "identifier": {
                        "@type": "PropertyValue",
                        "propertyID": "DDC",
                        "@value": ddc.strip()
                    },
                    # only the first three digits form the decimalised class
                    "@id": "http://purl.org/NET/decimalised#c" + ddc.strip()[:3]
                })
    elif authority == "z" and value:
        newObj = {"@value": value, "@type": "schema:Text"}
        if obj.get("lang"):
            newObj["@language"] = obj["lang"]
        retobj = litter(retobj, newObj)
    elif authority == "sswd" and value:
        retobj = litter(retobj, {
            "@value": value,
            "@type": "schema:Text",
            "@language": "ger"
        })
    return retobj if retobj else None
def test_litter():
    """Table-driven checks of es2json.litter's merge/dedup behaviour."""
    cases = [
        (("foo", "bar"), ["foo", "bar"]),
        ((["foo", "bar"], "baz"), ["foo", "bar", "baz"]),
        (("baz", ["foo", "bar"]), ["baz", "foo", "bar"]),
        ((None, ["foo", "bar", "baz"]), ["foo", "bar", "baz"]),
        ((["foo", "foobar"], ["bar", "baz"]), ["foo", "foobar", "bar", "baz"]),
        ((["foo", "foobar", "bar"], ["bar", "baz"]),
         ["foo", "foobar", "bar", "baz"]),
    ]
    for (old, new), expected in cases:
        assert es2json.litter(old, new) == expected
def get_gnid_by_es(rec, host, port, index, typ):
    """
    Enrich a record with a geonames.org link from a local Elasticsearch
    geonames dump.

    Runs a geo_distance (0.1 km) query around the record's coordinates;
    a hit whose name matches the record name (or a single hit, or an
    alternateName match) is appended to rec["sameAs"] as a URL string.

    :param rec: record dict with "sameAs", "geo" and "name" fields
    :param host, port, index, typ: Elasticsearch connection/index data
                                   handed to esgenerator()
    :returns: enriched record, None when unchanged or on query errors
    """
    if not any("http://www.geonames.org" in s for s in rec.get("sameAs")) and rec["geo"].get(
            "latitude") and rec["geo"].get("longitude"):
        changed = False
        records = []
        searchbody = {
            "query": {
                "bool": {
                    "filter": {
                        "geo_distance": {
                            "distance": "0.1km",
                            "location": {
                                "lat": float(rec["geo"].get("latitude")),
                                "lon": float(rec["geo"].get("longitude"))
                            }
                        }
                    }
                }
            }
        }
        try:
            # collect all hits first so len(records) below is meaningful
            for record in esgenerator(headless=True, host=host, port=port,
                                      index=index, type=typ,
                                      body=searchbody):
                records.append(record)
        except elasticsearch.exceptions.RequestError as e:
            eprint(e, json.dumps(searchbody, indent=4),
                   json.dumps(rec, indent=4))
            return
        if records:
            for record in records:
                # NOTE(review): `x in record.get("alternateName")` raises
                # TypeError when alternateName is None - confirm data always
                # carries the field
                if record.get("name") in rec.get("name") or rec.get(
                        "name") in record.get("name") or len(
                            records) == 1 or rec.get("name") in record.get(
                                "alternateName"):
                    #eprint(rec.get("name"),record.get("name"),record.get("id"),record.get("location"))
                    rec["sameAs"] = litter(
                        rec.get("sameAs"), "http://www.geonames.org/"
                        + str(record.get("id")) + "/")
                    changed = True
        if changed:
            return rec
        else:
            return None
def getmarc(record,regex,entity):
    """
    Extract values from a MARC-JSON record.

    Two modes, selected by "+" in the spec:
    * concatenation mode: e.g. "245.a+b" joins the listed subfields of
      one field entry into a single ". "-separated string
    * plain mode: regex (str or list of str) is delegated per field tag
      to getmarcvalues(); list results are deduplicated via uniq()

    :param record: MARC record as dict (field tag -> list of entries)
    :param regex: field/subfield spec string, or list of spec strings
    :param entity: passed through to getmarcvalues()
    :returns: value(s) via ArrayOrSingleValue, or None implicitly
    """
    if "+" in regex:
        marcfield=regex[:3]  # first three chars are the MARC field tag
        if marcfield in record:
            # subfield codes to concatenate, e.g. "a+b" -> ["a","b"]
            subfields=regex.split(".")[-1].split("+")
            data=None
            for array in record.get(marcfield):
                for k,v in array.items():
                    # merge all subfield occurrences into one dict
                    sset={}
                    for subfield in v:
                        for subfield_code in subfield:
                            sset[subfield_code]=litter(sset.get(subfield_code),subfield[subfield_code])
                    fullstr=""
                    for sf in subfields:
                        if sf in sset:
                            if fullstr:
                                fullstr+=". "
                            if isinstance(sset[sf],str):
                                fullstr+=sset[sf]
                            elif isinstance(sset[sf],list):
                                fullstr+=". ".join(sset[sf])
                    if fullstr:
                        data=litter(data,fullstr)
            if data:
                return ArrayOrSingleValue(data)
    else:
        ret=[]
        if isinstance(regex,str):
            regex=[regex]
        for string in regex:
            if string[:3] in record:
                ret=litter(ret,ArrayOrSingleValue(list(getmarcvalues(record,string,entity))))
        if ret:
            if isinstance(ret,list):
                #simple deduplizierung via uniq()
                ret = list(uniq(ret))
            return ArrayOrSingleValue(ret)
def get_gnid(rec):
    """
    Use geonames API (slow and quota limit for free accounts)

    Queries findNearbyJSON with the record's coordinates; on a mutual
    name containment match a structured sameAs object pointing to
    sws.geonames.org is appended to rec["sameAs"].

    :param rec: record dict with "sameAs", "geo" and "name" fields
    :returns: the enriched record, or None implicitly when unchanged
    """
    if not any("http://www.geonames.org" in s for s in rec.get("sameAs")) and rec["geo"].get(
            "latitude") and rec["geo"].get("longitude"):
        changed = False
        r = requests.get("http://api.geonames.org/findNearbyJSON?lat="
                         + rec["geo"].get("latitude") + "&lng="
                         + rec["geo"].get("longitude") + "&username=slublod")
        if r.ok and isiter(r.json().get("geonames")):
            for geoNameRecord in r.json().get("geonames"):
                if rec.get("name") in geoNameRecord.get(
                        "name") or geoNameRecord.get("name") in rec.get(
                            "name"):
                    # match!
                    # FIX: the isBasedOn @id used the undefined name
                    # `record` (copy/paste from get_gnid_by_es, whose loop
                    # variable is `record`) and would raise a NameError;
                    # both ids point at the matched geonames entry.
                    geonames_uri = ("https://sws.geonames.org/"
                                    + str(geoNameRecord.get("geonameId"))
                                    + "/")
                    newSameAs = {
                        '@id': geonames_uri,
                        'publisher': {
                            'abbr': "geonames",
                            'preferredName': "GeoNames",
                            "isBasedOn": {
                                "@type": "Dataset",
                                "@id": geonames_uri
                            }
                        }
                    }
                    rec["sameAs"] = litter(rec.get("sameAs"), newSameAs)
                    changed = True
        else:
            # NOTE(review): assumes the error response still carries a JSON
            # status object - confirm against the GeoNames API
            if r.json().get("status").get("message").startswith(
                    "the hourly limit") or r.json().get("status").get(
                        "message").startswith("the daily limit"):
                eprint("Limit exceeded!\n")
                # quota exhausted - stop the whole run
                exit(0)
        if changed:
            return rec
def map_record(sourceRecord):
    """
    Apply the module-level `mapping` table to one source record.

    Callable mapping values are invoked as rule(target_field, sourceRecord)
    and must return a (key, value) pair; the special key "__array__"
    merges a list of dicts field-by-field via litter(). String mapping
    values copy the named source field directly.

    :param sourceRecord: input record dict
    :returns: mapped record dict; "@context" is set when non-empty
    """
    result = {}
    for target_field, rule in mapping.items():
        if callable(rule):
            key, value = rule(target_field, sourceRecord)
            if not value:
                continue
            if key != "__array__":
                result[key] = value
                continue
            # "__array__": merge each dict element into the result
            for entry in value:
                if not isinstance(entry, dict):
                    continue
                for subkey, item in entry.items():
                    result[subkey] = litter(result.get(subkey), item)
        elif isinstance(rule, str) and rule in sourceRecord:
            result[target_field] = sourceRecord[rule]
    if result:
        result["@context"] = "http://schema.org"
    return result
def marc_dates(record, event):
    """
    Pull a "dat*"-coded date (datx preferred over datl) out of a MARC
    field's indicator structure and hand it to dateToEvent().

    :param record: list of indicator-level dicts (one MARC field), may
                   be None/empty
    :param event: event name forwarded to dateToEvent()
    :returns: dateToEvent() result, or None when no usable date exists
    """
    found = {}
    for indicator_level in record or []:
        for subfield in indicator_level:
            # gather only $a (the date value) and $4 (the date type code)
            collected = {}
            for sf_elem in indicator_level.get(subfield):
                for code, val in sf_elem.items():
                    if code in ("a", "4"):
                        collected[code] = litter(collected.get(code), val)
            codes = collected.get("4")
            if isinstance(codes, str):
                codes = [codes]
                collected["4"] = codes
            if isinstance(codes, list):
                for code in codes:
                    if code.startswith("dat"):
                        found[code] = collected.get("a")
    # exact dates (datx) win over life dates (datl)
    for wanted in ("datx", "datl"):
        if found.get(wanted):
            return dateToEvent(found[wanted], event)
    return None
def entityfacts(record, gnd, ef_instances, _tries=3):
    """
    Enrich record["sameAs"] with links harvested from a GND entityfacts
    instance; the first instance that yields links wins.

    :param record: record dict whose "sameAs" field gets enriched
    :param gnd: GND identifier appended to each instance base URL
    :param ef_instances: base URLs of entityfacts instances, tried in order
    :param _tries: remaining attempts on errors (new parameter with a
                   default, so existing calls keep working)
    :returns: the enriched record, or None when nothing was added
    """
    try:
        changed = False
        for url in ef_instances:
            r = get(url + str(gnd))
            if r.ok:
                data = r.json()
                # an Elasticsearch proxy wraps the payload in "_source"
                if data.get("_source"):
                    data = data.get("_source")
                sameAsses = []  # ba-dum-ts
                if data.get("sameAs") and isinstance(data["sameAs"], list):
                    for sameAs in data.get("sameAs"):
                        if sameAs.get("@id"):
                            # skip d-nb.info links - we already have the GND
                            if not sameAs.get("@id").startswith("http://d-nb.info"):
                                sameAsses.append(sameAs.get("@id"))
                if sameAsses:
                    record["sameAs"] = litter(record.get("sameAs"), sameAsses)
                    changed = True
                    break
        return record if changed else None
    except Exception as e:
        # FIX: the old code recursed unconditionally after sleep(5), so a
        # permanent error (or a plain bug raising here) retried forever;
        # bound the retries and give up loudly instead.
        if _tries <= 1:
            eprint("entityfacts: giving up after repeated errors:", e)
            return None
        time.sleep(5)
        return entityfacts(record, gnd, ef_instances, _tries - 1)
def entityfacts(record, ef_instances):
    """
    Function to harvest gnd entityfacts

    Look for connections to other entity providers in GND's
    entityfacts "sameAs" field

    :param record: json record probably containing GND entries
                   in their "sameAs" list field
    :type record: json object
    :param ef_instances: entityfacts-URLs instances to query
    :type ef_instances: list of strings
    :returns: the enriched record, or None when nothing was added
    :rtype: json object
    """
    # abbreviations used by GND entityfacts and their
    # analoy in SLUB LOD context; None = known abbreviation without a
    # SLUB LOD organisation id (nothing to substitute)
    abbreviations = {
        "DNB": "https://data.slub-dresden.de/organizations/514366265",
        "VIAF": "https://data.slub-dresden.de/organizations/100092306",
        "LC": "https://data.slub-dresden.de/organizations/100822142",
        "DDB": "https://data.slub-dresden.de/organizations/824631854",
        "WIKIDATA": "https://www.wikidata.org/wiki/Q2013",
        "BNF": "https://data.slub-dresden.de/organizations/188898441",
        "KXP": "https://data.slub-dresden.de/organizations/103302212",
        "dewiki": None,
        "enwiki": None,
        "DE-611": "https://data.slub-dresden.de/organizations/103675612",
        "geonames": None,
        "ISNI": None,
        "filmportal.de": None,
        "ORCID": None,
        "Portraitindex": None,
        "ARCHIV-D": None,
        "DE-M512": None,
        "ADB": None,
        "NDB": None,
        "OEBL": "https://data.slub-dresden.de/organizations/102972389",
        "CH_HLS": None,
        "LAGIS": "https://data.slub-dresden.de/organizations/100482600",
        "WIKISOURCE": None,
        "DE-28": "https://data.slub-dresden.de/organizations/100874770",
        "OSTDEBIB": None,
        "PACELLI": None,
        "FFMPL": "https://data.slub-dresden.de/organizations/236770764",
        "epidat": "https://data.slub-dresden.de/organizations/103039031",
        "BIOKLASOZ": "https://data.slub-dresden.de/organizations/100832873",
        "HISTORICUMNET": "https://data.slub-dresden.de/organizations/102398704"
    }
    if not isinstance(record.get("sameAs"), list):
        return None
    gnd_id = None
    # take the GND id from a d-nb.info link; the last matching one wins
    for item in record.get("sameAs"):
        if "d-nb.info" in item["@id"] and len(item["@id"].split("/")) > 4:
            gnd_id = item["@id"].split("/")[-1]
    if not gnd_id:
        # no GND-ID - nothing to enrich
        return None
    # stringified length before enrichment, used as change detector below
    old_rec_sameAs_len = len(str(record["sameAs"]))
    for url in ef_instances:
        r = requests.get(url + str(gnd_id))
        if r.ok:
            data = r.json()
        else:
            # ID not found in the respective source
            # just continue
            continue
        sameAsses = []  # ba-dum-ts
        if data.get("_source"):
            # in Elasticsearch: data are in the "_source" field
            ef_sameAs = data.get("_source").get("sameAs")
        else:
            ef_sameAs = data.get("sameAs")
        if not ef_sameAs or not isinstance(ef_sameAs, list):
            continue
        for sameAs in ef_sameAs:
            id_ = sameAs.get("@id")
            # we can skip DNB-link as we already have it (and
            # used it to come here)
            if not id_ or id_.startswith("https://d-nb.info"):
                continue
            obj = {
                '@id': id_,
                'publisher': {
                    'abbr': sameAs["collection"]["abbr"],
                    'preferredName': sameAs["collection"]["name"]
                },
                'isBasedOn': {
                    '@type': "Dataset",
                    '@id': "http://hub.culturegraph.org/entityfacts/{}".format(gnd_id)
                }
            }
            # replace id with SLUB LOD id's listed in abbreviations
            if obj["publisher"]["abbr"] in abbreviations:
                slub_id = abbreviations[obj["publisher"]["abbr"]]
                if slub_id:
                    obj["publisher"]["@id"] = slub_id
            else:
                # unknown identifier, report into error log
                eprint("entityfacts: Abbr. {} not known [GND-ID: {}]".format(
                    sameAs["collection"]["abbr"], gnd_id))
            sameAsses.append(obj)
        if sameAsses:
            record["sameAs"] = litter(record.get("sameAs"), sameAsses)
            break
    # compare length of transformed record, if the new entry is larger
    # than the old one, it was updated
    new_rec_sameAs_len = len(str(record["sameAs"]))
    if new_rec_sameAs_len > old_rec_sameAs_len:
        return record
    elif new_rec_sameAs_len < old_rec_sameAs_len:
        eprint("entityfacts: new record shorter than old one… "
               "[GND-ID: {}]".format(gnd_id))
        return None
    else:
        return None
def get_subfield(jline,key,entity):
    """
    Build linked-entity node(s) from one MARC field of a record.

    For every entry of jline[key] the subfields are merged into a dict;
    from that a node is assembled with @type (entity code in $D/$d),
    @id/sameAs/identifier (authority number in $0 via gnd2uri/id2uri),
    name ($a) and position/description details.

    :param jline: MARC record as dict
    :param key: MARC field tag, e.g. "100"
    :param entity: unused here; kept for the common mapper signature
    :returns: node(s) via ArrayOrSingleValue, or None implicitly
    """
    # MARC tag -> target entity collection for id2uri()
    keymap={"100":"persons",
            "700":"persons",
            "500":"persons",
            "711":"events",
            "110":"organizations",
            "710":"organizations",
            "551":"geo",
            "689":"topics",
            "550":"topics",
            "551":"geo",  # NOTE(review): duplicate key "551" (same value) - harmless but should be removed
            "655":"topics",
            "830":"resources",
            }
    entityType=keymap.get(key)
    data=[]
    if key in jline:
        for array in jline[key]:
            for k,v in array.items():
                # merge all subfield occurrences of this entry into one dict
                sset={}
                for subfield in v:
                    for subfield_code in subfield:
                        sset[subfield_code]=litter(sset.get(subfield_code),subfield[subfield_code])
                node={}
                # first $D/$d pass: schema.org @type plus entityType override
                for typ in ["D","d"]:
                    if sset.get(typ):
                        #http://www.dnb.de/SharedDocs/Downloads/DE/DNB/wir/marc21VereinbarungDatentauschTeil1.pdf?__blob=publicationFile Seite 14
                        node["@type"]="http://schema.org/"
                        if sset.get(typ)=="p":
                            node["@type"]+="Person"
                            entityType="persons"
                        elif sset.get(typ)=="b":
                            node["@type"]+="Organization"
                            entityType="organizations"
                        elif sset.get(typ)=="f":
                            node["@type"]+="Event"
                            entityType="events"
                        elif sset.get(typ)=="u":
                            node["@type"]+="CreativeWork"
                        elif sset.get(typ)=="g":
                            node["@type"]+="Place"
                        else:
                            # unknown entity code: drop the half-built @type
                            node.pop("@type")
                # resources may carry their id in $w instead of $0
                if entityType=="resources" and sset.get("w") and not sset.get("0"):
                    sset["0"]=sset.get("w")
                if sset.get("0"):
                    if isinstance(sset["0"],list) and entityType=="persons":
                        # NOTE(review): pop() while enumerating skips the
                        # element following each removal - confirm intended
                        for n,elem in enumerate(sset["0"]):
                            if elem and "DE-576" in elem:
                                sset["0"].pop(n)
                    uri=gnd2uri(sset.get("0"))
                    if isinstance(uri,str) and uri.startswith(base_id) and not entityType=="resources":
                        node["@id"]=id2uri(uri,entityType)
                    elif isinstance(uri,str) and uri.startswith(base_id) and entityType=="resources":
                        node["sameAs"]=base_id+id2uri(uri,entityType).split("/")[-1]
                    elif isinstance(uri,str) and uri.startswith("http") and not uri.startswith(base_id):
                        node["sameAs"]=uri
                    elif isinstance(uri,str):
                        node["identifier"]=uri
                    elif isinstance(uri,list):
                        node["sameAs"]=None
                        node["identifier"]=None
                        for elem in uri:
                            if isinstance(elem,str) and elem.startswith(base_id):
                                #if key=="830": #Dirty Workaround for finc id
                                    #rsplit=elem.rsplit("=")
                                    #rsplit[-1]="0-"+rsplit[-1]
                                    #elem='='.join(rsplit)
                                node["@id"]=id2uri(elem,entityType)
                            elif isinstance(elem,str) and elem.startswith("http") and not elem.startswith(base_id):
                                node["sameAs"]=litter(node["sameAs"],elem)
                            elif elem:
                                node["identifier"]=litter(node["identifier"],elem)
                # $a: name - single-char values are skipped
                if isinstance(sset.get("a"),str) and len(sset.get("a"))>1:
                    node["name"]=sset.get("a")
                elif isinstance(sset.get("a"),list):
                    for elem in sset.get("a"):
                        if len(elem)>1:
                            node["name"]=litter(node.get("name"),elem)
                if sset.get("v") and entityType=="resources":
                    node["position"]=sset["v"]
                if sset.get("i"):
                    node["description"]=sset["i"]
                if sset.get("n") and entityType=="events":
                    node["position"]=sset["n"]
                # NOTE(review): second $D/$d pass repeats the @type logic
                # above (without touching entityType) - looks redundant
                for typ in ["D","d"]:
                    if sset.get(typ):
                        #http://www.dnb.de/SharedDocs/Downloads/DE/DNB/wir/marc21VereinbarungDatentauschTeil1.pdf?__blob=publicationFile Seite 14
                        node["@type"]="http://schema.org/"
                        if sset.get(typ)=="p":
                            node["@type"]+="Person"
                        elif sset.get(typ)=="b":
                            node["@type"]+="Organization"
                        elif sset.get(typ)=="f":
                            node["@type"]+="Event"
                        elif sset.get(typ)=="u":
                            node["@type"]+="CreativeWork"
                        elif sset.get(typ)=="g":
                            node["@type"]+="Place"
                        else:
                            node.pop("@type")
                if node:
                    data=litter(data,node)
                    #data.append(node)
    if data:
        return ArrayOrSingleValue(data)
def get_wpcategories(record):
    """
    * iterates through all sameAs Links to extract the link(s) to the wiki-site
    * requests wikpedia categories linked to those links

    :returns None (if record has not been changed)
             enriched record (dict, if record has changed)
    :rtype dict
    """
    wp_uri = None
    wp_title = None
    cc = None  # countrycode
    changed = False
    retobj = {}
    for _id in [x["@id"] for x in record["sameAs"]]:
        if "wikipedia" in _id:
            wp_uri = _id
            # page title is the URL-decoded last path segment
            wp_title = urllib.parse.unquote(wp_uri.split("/")[-1])
            # language code is the first host label, e.g. "de" of de.wikipedia.org
            cc = wp_uri.split("/")[2].split(".")[0]
            headers = {
                'User-Agent': 'lod-enrich-wikipedia-categories-bot/0.1'
                              '(https://github.com/slub/esmarc) '
                              'python-requests/2.22'
            }
            url = "https://{}.wikipedia.org/w/api.php".format(cc)
            wd_response = requests.get(url, headers=headers, params={
                'action': 'query',
                'generator': 'categories',
                'titles': wp_title,
                'gcllimit': 500,
                'prop': 'info',
                'format': 'json'
            })
            if not wd_response.ok:
                eprint("wikipedia-categories: Connection Error "
                       "{status}: \'{message}\'".format(
                           status=wd_response.status_code,
                           message=wd_response.content))
                return None
            # related wikipedia links:
            _base = "https://{}.wikipedia.org/wiki/".format(cc)
            try:
                pages = wd_response.json()["query"]["pages"]
                for page_id, page_data in pages.items():
                    _sameAs = _base + page_data["title"].replace(' ', '_')
                    # NOTE(review): this rebinding shadows the outer loop
                    # variable _id - confirm intended
                    _id = _base + "?curid={}".format(page_id)
                    # cutting off the substring 'Category:' or 'Kategorie:' from
                    # the beginning of the title for the name field
                    _name = ":".join(page_data["title"].split(":")[1:])
                    obj = {"@id": _id, "sameAs": _sameAs, "name": _name}
                    # categories are grouped per language code
                    retobj[cc] = litter(retobj.get(cc), obj)
                    changed = True
            except KeyError:
                eprint("wikipedia-categories: Data Error for Record:\n"
                       "{record}\'\n\'{wp_record}\'".format(
                           record=record, wp_record=wd_response.content))
                return None
    if changed:
        record["category"] = retobj
        return record
    return None
def process(record, gnd, server):
    """
    Enrich record["about"] from a GND record held in a local
    gnd-records Elasticsearch index.

    For every field listed in the module-level `map`, a PropertyValue
    "about" entry is built; "fieldOfStudy" entries are expanded with DDC
    identifiers, "gndSubjectCategory" entries with their prefLabel.

    :param record: record dict to enrich
    :param gnd: GND identifier used to address the gnd-records document
    :param server: base URL of the Elasticsearch-backed service
    :returns: the enriched record, or None when nothing was added
    """
    change = False
    #       [0] [1]      [2]     [3]   [4,-1]
    # http: /  / d-nb.info / gnd / 102859268X  get the GND number
    record_url = "{}/gnd-records/record/{}".format(server, gnd)
    r = requests.get(record_url)
    if r.ok:
        for gndItem in map:
            if r.json().get("_source").get(gndItem):
                for elem in r.json().get("_source").get(gndItem):
                    value = elem
                    # normalize str entries to the dict shape
                    if isinstance(elem, str):
                        elem = {"id": elem}
                    if isinstance(elem, dict):
                        if "id" in elem:
                            newvalue = elem.get("id").split("/")[-1]
                            value = elem.get("id")
                        else:
                            continue
                    elif isinstance(elem, list):
                        continue
                    newabout = {
                        "identifier": {
                            "propertyID": gndItem,
                            "@type": "PropertyValue",
                            "value": newvalue
                        }
                    }
                    if value.startswith("http"):
                        newabout["@id"] = value
                    if gndItem == "fieldOfStudy":
                        # resolve the study field to its DDC notation(s)
                        fos = requests.get(server + "/gnd-records/record/"
                                           + newvalue)
                        if fos.ok and fos.json().get("_source").get(
                                "relatedDdcWithDegreeOfDeterminacy3"):
                            # turn the single identifier into a list so the
                            # DDC identifiers can be appended
                            newabout["identifier"] = [
                                newabout.pop("identifier")
                            ]
                            ddcs = fos.json().get("_source").get(
                                "relatedDdcWithDegreeOfDeterminacy3")
                            if isinstance(ddcs, dict):
                                ddcs = [ddcs]
                            if isinstance(ddcs, list):
                                for ddc in ddcs:
                                    if isinstance(ddc, str):
                                        ddc = {"id": ddc}
                                    newabout["identifier"].append({
                                        "@type": "PropertyValue",
                                        "propertyID": "DDC",
                                        "value": ddc.get("id").split("/")[-2][:3]
                                    })
                                    newabout["@id"] = ddc.get("id")
                            if fos.json().get("_source").get(
                                    "preferredNameForTheSubjectHeading"):
                                newabout["name"] = fos.json().get(
                                    "_source").get(
                                        "preferredNameForTheSubjectHeading")
                    elif gndItem == "gndSubjectCategory":
                        url = server + "/gnd-subjects/subject/_search"
                        gsc = requests.post(
                            url,
                            json={"query": {
                                "match": {
                                    "@id.keyword": value
                                }
                            }})
                        if gsc.ok and gsc.json().get("hits").get("total") == 1:
                            for hit in gsc.json().get("hits").get("hits"):
                                # collapse the whitespace of the prefLabel
                                newabout["name"] = " ".join(
                                    hit.get("_source").get("skos:prefLabel").
                                    get("@value").replace("\n", "").split())
                    if not record.get("about"):
                        record["about"] = newabout
                        change = True
                    else:
                        plzAdd = True
                        # a single dict "about" becomes a list when the new
                        # value differs from its @id
                        if isinstance(record.get("about"),
                                      dict) and record.get("about").get(
                                          "@id") and value not in record.get(
                                              "about").get("@id"):
                            record["about"] = [record.pop("about")]
                        elif isinstance(record.get("about"), list):
                            # skip entries that are already present
                            for item in record.get("about"):
                                if item.get("@id") and value in item.get(
                                        "@id"):
                                    plzAdd = False
                                elif isinstance(item.get("identifier"), list):
                                    for ident_list_elem in item.get(
                                            "identifier"):
                                        if ident_list_elem.get(
                                                "@id"
                                        ) and value in ident_list_elem.get(
                                                "@id"):
                                            plzAdd = False
                        if plzAdd:
                            change = True
                            record["about"] = litter(record["about"], newabout)
    return record if change else None
def relatedTo(jline,key,entity):
    """
    Build "relatedTo"-style person nodes from one MARC field.

    The relation kind is taken from $9 via the marc2relation table
    (stored under "_key"); the related entity's id ($0) is resolved via
    gnd2uri/id2uri, its name comes from $a.

    :param jline: MARC record as dict
    :param key: field spec whose first three chars are the MARC tag
    :param entity: unused here; kept for the common mapper signature
    :returns: node(s) via ArrayOrSingleValue, or None implicitly
    """
    #e.g. split "551^4:orta" to 551 and orta
    marcfield=key[:3]
    data=[]
    if marcfield in jline:
        for array in jline[marcfield]:
            for k,v in array.items():
                # merge all subfield occurrences of this entry into one dict
                sset={}
                for subfield in v:
                    for subfield_code in subfield:
                        sset[subfield_code]=litter(sset.get(subfield_code),subfield[subfield_code])
                if isinstance(sset.get("9"),str) and sset.get("9") in marc2relation:
                    # direct relation-code lookup
                    node={}
                    node["_key"]=marc2relation[sset["9"]]
                    if sset.get("0"):
                        uri=gnd2uri(sset.get("0"))
                        if isinstance(uri,str) and uri.startswith(base_id):
                            node["@id"]=id2uri(sset.get("0"),"persons")
                        elif isinstance(uri,str) and uri.startswith("http") and not uri.startswith(base_id):
                            node["sameAs"]=uri
                        elif isinstance(uri,str):
                            node["identifier"]=sset.get("0")
                        elif isinstance(uri,list):
                            node["sameAs"]=None
                            node["identifier"]=None
                            for elem in uri:
                                if elem and isinstance(elem,str) and elem.startswith(base_id):
                                    node["@id"]=id2uri(elem.split("=")[-1],"persons")
                                elif elem and isinstance(elem,str) and elem.startswith("http") and not elem.startswith(base_id):
                                    node["sameAs"]=litter(node["sameAs"],elem)
                                else:
                                    node["identifier"]=litter(node["identifier"],elem)
                    if sset.get("a"):
                        node["name"]=sset.get("a")
                    data.append(node)
                elif isinstance(sset.get("9"),list):
                    # several $9 values: fuzzy, case-insensitive matching
                    # against the marc2relation keys
                    node={}
                    for elem in sset["9"]:
                        if elem.startswith("v"):
                            for k,v in marc2relation.items():
                                if k.lower() in elem.lower():
                                    node["_key"]=v
                                    break
                        elif [x for x in marc2relation if x.lower() in elem.lower()]:
                            for x in marc2relation:
                                if x.lower() in elem.lower():
                                    node["_key"]=marc2relation[x]
                        elif not node.get("_key"):
                            # generic fallback relation
                            node["_key"]="relatedTo"
                        #eprint(elem,node)
                    if sset.get("0"):
                        uri=gnd2uri(sset.get("0"))
                        if isinstance(uri,str) and uri.startswith(base_id):
                            node["@id"]=id2uri(sset.get("0"),"persons")
                        elif isinstance(uri,str) and uri.startswith("http") and not uri.startswith(base_id):
                            node["sameAs"]=uri
                        elif isinstance(uri,str):
                            node["identifier"]=uri
                        elif isinstance(uri,list):
                            node["sameAs"]=None
                            node["identifier"]=None
                            for elem in uri:
                                if elem and elem.startswith(base_id):
                                    node["@id"]=id2uri(elem.split("=")[-1],"persons")
                                elif elem and elem.startswith("http") and not elem.startswith(base_id):
                                    node["sameAs"]=litter(node["sameAs"],elem)
                                elif elem:
                                    node["identifier"]=litter(node["identifier"],elem)
                    if sset.get("a"):
                        node["name"]=sset.get("a")
                    data.append(node)
                    #eprint(node)
    if data:
        return ArrayOrSingleValue(data)
def process(record, dnb_uri, server):
    """
    Enrich record["about"] from a GND record held in a local
    gnd-records Elasticsearch index (string-element variant).

    Like the gnd-id based process(): builds PropertyValue "about"
    entries for every field listed in the module-level `map`, expanding
    "fieldOfStudy" with a DDC identifier and "gndSubjectCategory" with
    its prefLabel.

    :param record: record dict to enrich
    :param dnb_uri: d-nb.info URI whose last path segment is the GND id
    :param server: base URL of the Elasticsearch-backed service
    :returns: the enriched record, or None when nothing was added
    """
    change = False
    #       [0] [1]     [2]      [3]    [4,-1]
    r = requests.get(
        server + "/gnd-records/record/" + str(dnb_uri.split("/")[-1])
    )  # http: / / d-nb.info / gnd / 102859268X  get the GND number
    if r.ok:
        for gndItem in map:
            if r.json().get("_source").get(gndItem):
                for elem in r.json().get("_source").get(gndItem):
                    newabout = {
                        "identifier": {
                            "propertyID": gndItem,
                            "@type": "PropertyValue",
                            "value": elem.split("/")[-1]
                        }
                    }
                    if elem.startswith("http"):
                        newabout["@id"] = elem
                    if gndItem == "fieldOfStudy":
                        # resolve the study field to its first DDC notation
                        fos = requests.get(server + "/gnd-records/record/"
                                           + elem.split("/")[-1])
                        if fos.ok and fos.json().get("_source").get(
                                "relatedDdcWithDegreeOfDeterminacy3"):
                            # turn the single identifier into a list so the
                            # DDC identifier can be appended
                            newabout["identifier"] = [
                                newabout.pop("identifier")
                            ]
                            newabout["identifier"].append({
                                "@type": "PropertyValue",
                                "propertyID": "DDC",
                                "value": fos.json().get("_source").get(
                                    "relatedDdcWithDegreeOfDeterminacy3")
                                    [0].split("/")[-2][:3]
                            })
                            if fos.json().get("_source").get(
                                    "preferredNameForTheSubjectHeading"):
                                newabout["name"] = fos.json().get(
                                    "_source").get(
                                        "preferredNameForTheSubjectHeading")[0]
                            newabout[
                                "@id"] = "http://purl.org/NET/decimalised#c" + fos.json(
                                ).get("_source").get(
                                    "relatedDdcWithDegreeOfDeterminacy3"
                                )[0].split("/")[-2][:3]
                    elif gndItem == "gndSubjectCategory":
                        url = server + "/gnd-subjects/subject/_search"
                        gsc = requests.post(
                            url,
                            json={"query": {
                                "match": {
                                    "@id.keyword": elem
                                }
                            }})
                        if gsc.ok and gsc.json().get("hits").get("total") == 1:
                            for hit in gsc.json().get("hits").get("hits"):
                                # collapse the whitespace of the prefLabel
                                newabout["name"] = " ".join(
                                    hit.get("_source").get("skos:prefLabel").
                                    get("@value").replace("\n", "").split())
                    if not record.get("about"):
                        record["about"] = newabout
                        change = True
                    else:
                        plzAdd = True
                        #print(elem,record.get("about"))
                        # a single dict "about" becomes a list when the new
                        # value differs from its @id
                        if isinstance(record.get("about"),
                                      dict) and record.get("about").get(
                                          "@id") and elem not in record.get(
                                              "about").get("@id"):
                            record["about"] = [record.pop("about")]
                        elif isinstance(record.get("about"), list):
                            # skip entries that are already present
                            for item in record.get("about"):
                                if item.get("@id") and elem in item.get("@id"):
                                    plzAdd = False
                                elif isinstance(item.get("identifier"), list):
                                    for ident_list_elem in item.get(
                                            "identifier"):
                                        if ident_list_elem.get(
                                                "@id"
                                        ) and elem in ident_list_elem.get(
                                                "@id"):
                                            plzAdd = False
                        if plzAdd:
                            change = True
                            record["about"] = litter(record["about"], newabout)
    return record if change else None
def get_gnid_by_es(rec, host, port, index, typ):
    """
    Use local dump in Elasticsearch

    Runs a geo_distance (0.1 km) query around the record's coordinates
    and, on a (preferred)name match, attaches a structured
    sws.geonames.org sameAs object to rec["sameAs"].

    :param rec: record dict with "sameAs", "geo" and "preferredName"
    :param host, port, index, typ: Elasticsearch connection/index data
                                   handed to esgenerator()
    :returns: enriched record, None when unchanged or on query errors
    """
    if not any("http://www.geonames.org" in s for s in rec.get("sameAs")) and rec["geo"].get(
            "latitude") and rec["geo"].get("longitude"):
        changed = False
        records = []
        searchbody = {
            "query": {
                "bool": {
                    "filter": {
                        "geo_distance": {
                            "distance": "0.1km",
                            "location": {
                                "lat": float(rec["geo"].get("latitude")),
                                "lon": float(rec["geo"].get("longitude"))
                            }
                        }
                    }
                }
            }
        }
        try:
            # FIX: collect all hits first (as the sibling variant does);
            # the old code tested len(records) == 1 inside the generator
            # loop while `records` was never appended to, so the
            # single-hit shortcut could never trigger.
            for record in esgenerator(headless=True, host=host, port=port,
                                      index=index, type=typ,
                                      body=searchbody):
                records.append(record)
        except elasticsearch.exceptions.RequestError as e:
            eprint(e, json.dumps(searchbody, indent=4),
                   json.dumps(rec, indent=4))
            return
        for record in records:
            # FIX: guard alternateName - `x in None` raises TypeError
            if record.get("name") in rec.get("preferredName") or rec.get(
                    "preferredName") in record.get("name") or len(
                        records) == 1 or rec.get("preferredName") in (
                            record.get("alternateName") or []):
                geonames_uri = ("https://sws.geonames.org/"
                                + str(record.get("id")) + "/")
                newSameAs = {
                    '@id': geonames_uri,
                    'publisher': {
                        'abbr': "geonames",
                        'preferredName': "GeoNames",
                        "isBasedOn": {
                            "@type": "Dataset",
                            "@id": geonames_uri
                        }
                    }
                }
                rec["sameAs"] = litter(rec.get("sameAs"), newSameAs)
                changed = True
        if changed:
            return rec
        else:
            return None
def handleRoles(obj, rec):
    """
    Map a MODS name element with a role to {schema-role: person/org data}.

    The role term (e.g. "aut", "edt") found via traverse() is translated
    through kv_table; name sub-elements are mapped likewise, and
    nameIdentifier elements are attached as sameAs via handleIdentifiers().

    :param obj: MODS name element dict
    :param rec: full source record (only referenced by commented-out
                debug code below)
    :returns: dict {role: {...}}; empty dict when no known role was found
    """
    retobj = {}
    key = ""
    #eprint(json.dumps(ob))
    role = ""
    # MODS sub-element types and MARC relator codes -> schema.org fields/roles
    kv_table = {
        "given": "givenName",
        "family": "familyName",
        "termsOfAddress": "honorificPrefix",
        'ID': '@id',
        'namePart': 'name',
        "date": "birthDate",
        "aut": "author",
        "edt": "contributor",
        "pbl": "publisher",
        "dgg": "sourceOrganisation",
        "prv": "provider",
        "rev": "contributor",
        "dgs": "contributor",
        "ctb": "contributor",
        "oth": "contributor",
        "red": "contributor",
        "ill": "illustrator",
        "fnd": "funder",
        "cmp": "composer",
        "ths": "instructor",
        "sad": "contributor",
        "trl": "translator",
        "art": "artist",
        "Den akademischen Grad verleihende / prüfende Institution":
        "sourceOrganisation",
        'Den akademischen Grad verleihende Institution': "sourceOrganisation",
        #'Medizinische Fakultät':"sourceOrganisation",
        #'Universität Leipzig':"sourceOrganisation",
        'Den akademischen Grad verleihende/prüfende Institution':
        "sourceOrganisation",
    }
    if isinstance(obj.get("role"), dict):
        for k, v in traverse(obj["role"], ""):
            if k == "roleTerm" and v.get("_"):
                if v["_"].strip(
                ) in kv_table:  # using strip() to avoid this nonesense: e.g.: '_': '\n  \n  prv\n '
                    role = kv_table[v["_"].strip()]
                else:
                    pass
                    #_bla,_blubb = getUrn("None",rec)
                    #eprint(_blubb)
    if role:
        retobj[role] = {}
        for key in obj:
            if key in kv_table and isinstance(obj[key], str):
                retobj[role][kv_table[key]] = obj[key]
            if key in kv_table and isinstance(obj[key], list):
                for elem in obj[key]:
                    if elem.get("type") in kv_table:
                        retobj[role][kv_table[elem["type"]]] = elem.get("_")
                    else:
                        pass
        if isinstance(obj.get("nameIdentifier"), list):
            for elem in obj["nameIdentifier"]:
                # NOTE(review): reads retobj.get("sameAs") instead of
                # retobj[role].get("sameAs") - looks like a bug, confirm
                retobj[role]["sameAs"] = litter(retobj.get("sameAs"),
                                                handleIdentifiers(elem))
        elif isinstance(obj.get("nameIdentifier"), dict):
            retobj[role]["sameAs"] = litter(
                retobj.get("sameAs"),
                handleIdentifiers(obj.get("nameIdentifier")))
        if retobj[role].get("givenName") and retobj[role].get("familyName"):
            # "familyName, givenName" display form; the parts are consumed
            retobj[role]["name"] = retobj[role].pop(
                "familyName") + ", " + retobj[role].pop("givenName")
    #if obj.get("role") and obj["role"].get("roleTerm") and isinstance(obj["role"]["roleTerm"],dict) and obj["role"]["roleTerm"]["_"]=="aut":
        #key=="author"
    #elif obj.get("role") and obj["role"].get("roleTerm") and isinstance(obj["role"]["roleTerm"],dict) and obj["role"]["roleTerm"]["_"]=="prv":
        #key=="provider"
    #eprint(retobj)
    return retobj
def get_wpinfo(record):
    """
    * iterates through all sameAs Links to extract a wikidata-ID
    * requests wikipedia sites connected to the wd-Id
    * enriches wikipedia sites if they are within lookup_table_wpSites
      (i.e. currently german, english, polish, czech)
    * if we get an new wikipedia link from wikidata, but we already got
      an old entry from other as obsolete defined sources, we delete the
      obsolete entry and append the new entry
    * enriches multilingual names if they are within lookup_table_wpSites

    :returns None (if record has not been changed)
             enriched record (dict, if record has changed)
    :rtype dict
    """
    wd_uri = None
    wd_id = None
    # use the first wikidata link found in sameAs
    for _id in [x["@id"] for x in record["sameAs"]]:
        if "wikidata" in _id:
            wd_uri = _id
            wd_id = wd_uri.split("/")[-1]
            break
    if not wd_id:
        return None
    headers = {
        'User-Agent': 'efre-lod-enrich-wikipedia-bot/0.1 '
                      '(https://github.com/slub/esmarc) '
                      'python-requests/2.22'
    }
    # restrict the sitelinks answer to the wiki sites we can map
    site_filter_param = '|'.join([x for x in lookup_table_wpSites])
    wd_response = requests.get("https://www.wikidata.org/w/api.php",
                               headers=headers,
                               params={
                                   'action': 'wbgetentities',
                                   'ids': wd_id,
                                   'props': 'sitelinks/urls',
                                   'format': 'json',
                                   'sitefilter': site_filter_param
                               })
    if not wd_response.ok:
        eprint("wikipedia: Connection Error {status}: \'{message}\'".format(
            status=wd_response.status_code, message=wd_response.content))
        return None
    # related wikipedia links:
    try:
        sites = wd_response.json()["entities"][wd_id]["sitelinks"]
    except KeyError:
        eprint("wikipedia: Data Error for Record:\n"
               "\'{record}\'\n\'{wp_record}\'".format(
                   record=record, wp_record=wd_response.content))
        return None
    # list of all abbreviations for publisher in record's sameAs
    abbrevs = build_abbrevs(record["sameAs"])
    changed = False
    for wpAbbr, info in sites.items():
        if wpAbbr in lookup_table_wpSites:
            wikip_url = info["url"]
            newSameAs = {
                "@id": wikip_url,
                "publisher": lookup_table_wpSites[wpAbbr],
                "isBasedOn": {
                    "@type": "Dataset",
                    "@id": wd_uri
                }
            }
            # wikipedia sameAs link enrichment
            if wpAbbr not in abbrevs:
                record["sameAs"].append(newSameAs)
                changed = True
            # we already got an wikipedia link for that language, but the
            # originating data source is obsolete, so we update
            elif abbrevs.get(
                    wpAbbr) and abbrevs[wpAbbr]["host"] in obsolete_isBasedOns:
                record["sameAs"][abbrevs[wpAbbr]["pos"]] = newSameAs
                changed = True
            # multilingual name object enrichment
            if not record.get("name"):
                record["name"] = {}
            cc = wpAbbr[:2]  # countrycode
            if cc not in record["name"]:
                record["name"][cc] = [info["title"]]
                changed = True
            if info["title"] not in record["name"][cc]:
                record["name"][cc] = litter(record["name"][cc], info["title"])
                changed = True
    if changed:
        return record