def fixRecord(record="", record_id=0, validation=False, replaceMethod='decimal'):
    replaceMethods = {
        'decimal': (('#29;', '#30;', '#31;'), ("\x1D", "\x1E", "\x1F")),
        'unicode': (('\u001d', '\u001e', '\u001f'), ("\x1D", "\x1E", "\x1F")),
        'hex': (('\x1D', '\x1E', '\x1F'), ("\x1D", "\x1E", "\x1F"))
    }
    marcFullRecordFixed = record
    # restore the proper MARC21 control characters for the chosen replace method
    for i in range(0, 3):
        marcFullRecordFixed = marcFullRecordFixed.replace(
            replaceMethods.get(replaceMethod)[0][i],
            replaceMethods.get(replaceMethod)[1][i])
    if validation:
        try:
            reader = pymarc.MARCReader(marcFullRecordFixed.encode('utf8'),
                                       utf8_handling='replace')
            marcrecord = next(reader)
        except (RecordLengthInvalid, RecordLeaderInvalid, BaseAddressNotFound,
                BaseAddressInvalid, RecordDirectoryInvalid, NoFieldsFound,
                UnicodeDecodeError) as e:
            eprint("record id {0}: {1}".format(record_id, str(e)))
            # append the still-invalid record to a log file for later inspection
            with open('invalid_records.txt', 'a') as error:
                print(marcFullRecordFixed, file=error)
            return None
    return marcFullRecordFixed
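# Illustrative usage (the sample input string is hypothetical, not taken from
# real data): a raw record whose delimiters were exported as '#29;'/'#30;'/'#31;'
# gets its MARC21 control characters back, e.g.
#   fixRecord("00026     2200013   4500001000500000#30;12345#31;#29;",
#             record_id=1, validation=False, replaceMethod='decimal')
# returns the same string with "\x1e", "\x1f" and "\x1d" restored; with
# validation=True the repaired record is additionally parsed by pymarc and
# written to invalid_records.txt if it is still unreadable.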
def get_gnid(rec):
    if not any("http://www.geonames.org" in s
               for s in rec.get("sameAs")) and rec["geo"].get(
                   "latitude") and rec["geo"].get("longitude"):
        changed = False
        r = requests.get("http://api.geonames.org/findNearbyJSON?lat=" +
                         rec["geo"].get("latitude") + "&lng=" +
                         rec["geo"].get("longitude") + "&username=slublod")
        if r.ok and isiter(r.json().get("geonames")):
            for geoNameRecord in r.json().get("geonames"):
                if rec.get("name") in geoNameRecord.get(
                        "name") or geoNameRecord.get("name") in rec.get("name"):
                    # match!
                    rec["sameAs"] = litter(
                        rec.get("sameAs"), "http://www.geonames.org/" +
                        str(geoNameRecord.get("geonameId")) + "/")
                    changed = True
        else:
            if r.json().get("status").get("message").startswith(
                    "the hourly limit") or r.json().get("status").get(
                        "message").startswith("the daily limit"):
                eprint("Limit exceeded!\n")
                exit(0)
        if changed:
            return rec
def main():
    try:
        for record in MARCReader(sys.stdin.buffer.read(), to_unicode=True):
            sys.stdout.write(json.dumps(transpose_to_ldj(record)) + "\n")
            sys.stdout.flush()
    except UnicodeDecodeError as e:
        eprint("unicode decode error: {}".format(e))
        eprint(record)
def get_wdid(_ids, rec):
    """
    gets a list of sameAs links, e.g.
    ['https://d-nb.info/gnd/118827545',
     'http://swb.bsz-bw.de/DB=2.1/PPNSET?PPN=035143010',
     'http://catalogue.bnf.fr/ark:/12148/cb119027159',
     'http://id.loc.gov/rwo/agents/n50002729',
     'http://isni.org/isni/0000000120960218',
     'http://viaf.org/viaf/44298691']
    """
    if not isinstance(_ids, list):
        return None
    changed = False
    url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
    # Define header according to wikidata's User-Agent policy
    # see: https://meta.wikimedia.org/wiki/User-Agent_policy
    headers = {
        'User-Agent': 'efre-lod-enrich-wikidata-bot/0.1 '
                      '(https://github.com/slub/esmarc) '
                      'python-requests/2.22'
    }
    or_mapping = []
    for _id in _ids:
        for key, value in lookup_table_wdProperty.items():
            if _id.startswith(key):
                or_mapping.append("?item wdt:{Property} \"{value}\"".format(
                    Property=value["property"],
                    value=_id.split(value["delim"])[-1]))
                break
    if or_mapping:
        # build a SPARQL OR query with a UNION operator;
        # still builds a plain query without UNION when or_mapping
        # contains only one element
        query = '''SELECT DISTINCT ?item \nWHERE {{\n\t{{ {UNION} }}\n}}'''.format(
            UNION="} UNION\n\t\t {".join(or_mapping))
        data = requests.get(url,
                            headers=headers,
                            params={
                                'query': query,
                                'format': 'json'
                            })
        if data.ok and len(data.json().get("results").get("bindings")) > 0:
            for item in data.json().get("results").get("bindings"):
                rec["sameAs"] = litter(
                    rec["sameAs"], {
                        "@id": item.get("item").get("value"),
                        "publisher": {
                            "@id": "https://www.wikidata.org/wiki/Q2013",
                            "abbr": "WIKIDATA",
                            "preferredName": "Wikidata"
                        },
                        "isBasedOn": {
                            "@type": "Dataset",
                            "@id": item.get("item").get("value")
                        }
                    })
                changed = True
        elif not data.ok:
            eprint("wikidata: Connection Error {status}: \'{message}\'".format(
                status=data.status_code, message=data.content))
    if changed:
        return rec
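# Hypothetical sketch of the query assembly above (the property mappings and
# IDs are illustrative assumptions, not taken from lookup_table_wdProperty):
# assuming GND IDs map to wdt:P227 and VIAF IDs to wdt:P214, the call
#   get_wdid(["https://d-nb.info/gnd/118827545",
#             "http://viaf.org/viaf/44298691"], rec)
# would build and send a query like
#   SELECT DISTINCT ?item
#   WHERE {
#       { ?item wdt:P227 "118827545" } UNION
#               { ?item wdt:P214 "44298691" }
#   }
# and every binding returned for ?item becomes one new sameAs entry on rec.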
def fix_mrc_id(jline):
    if "001" in jline and isinstance(jline["001"], list):
        _id = jline.pop("001")
        for elem in _id:
            jline["001"] = elem
            if elem == "0021114284" or len(elem) > 512:
                # this particular FINC-MARC21 record is broken and would
                # break the whole toolchain
                eprint(elem)
                return None
    return jline
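# Illustrative behaviour (hypothetical input, not from the original data):
#   fix_mrc_id({"001": ["0123456789"]})  ->  {"001": "0123456789"}
# i.e. the list in field 001 is collapsed to a single value, while a record
# whose 001 contains the known broken id "0021114284" (or any value longer
# than 512 characters) is dropped by returning None.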
def handleIdentifiers(obj):
    if "typeURI" in obj:
        return obj["typeURI"] + "/" + obj.get("_")
    elif obj.get("_") and obj.get("_").startswith("http"):
        return obj.get("_")
    elif "typeURI" not in obj and isinstance(
            obj.get("type"), str) and obj.get("type").lower() == 'gnd':
        return "http://d-nb.info/gnd/" + obj.get("_")
    elif "typeURI" not in obj and "type" not in obj and obj.get(
            "authority").lower() == "gnd":
        return "http://d-nb.info/gnd/" + obj.get("_")
    else:
        eprint(obj)
        return None
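# Illustrative behaviour (hypothetical identifier objects, shapes derived from
# the branches above):
#   {"typeURI": "http://d-nb.info/gnd", "_": "118827545"}
#       -> "http://d-nb.info/gnd/118827545"
#   {"type": "gnd", "_": "118827545"}
#       -> "http://d-nb.info/gnd/118827545"
#   {"_": "http://viaf.org/viaf/44298691"}
#       -> "http://viaf.org/viaf/44298691"
# Anything matching no branch is reported via eprint() and yields None.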
def get_gnid_by_es(rec, host, port, index, typ):
    if not any("http://www.geonames.org" in s
               for s in rec.get("sameAs")) and rec["geo"].get(
                   "latitude") and rec["geo"].get("longitude"):
        changed = False
        records = []
        searchbody = {
            "query": {
                "bool": {
                    "filter": {
                        "geo_distance": {
                            "distance": "0.1km",
                            "location": {
                                "lat": float(rec["geo"].get("latitude")),
                                "lon": float(rec["geo"].get("longitude"))
                            }
                        }
                    }
                }
            }
        }
        try:
            for record in esgenerator(headless=True,
                                      host=host,
                                      port=port,
                                      index=index,
                                      type=typ,
                                      body=searchbody):
                records.append(record)
        except elasticsearch.exceptions.RequestError as e:
            eprint(e, json.dumps(searchbody, indent=4),
                   json.dumps(rec, indent=4))
            return
        if records:
            for record in records:
                if record.get("name") in rec.get("name") or rec.get(
                        "name") in record.get("name") or len(
                            records) == 1 or rec.get("name") in record.get(
                                "alternateName"):
                    rec["sameAs"] = litter(
                        rec.get("sameAs"), "http://www.geonames.org/" +
                        str(record.get("id")) + "/")
                    changed = True
        if changed:
            return rec
        else:
            return None
def run(fd):
    try:
        for record in MARCReader(fd):
            try:
                yield transpose_to_ldj(record)
            except AttributeError as e:
                eprint("attribute error: {}".format(e))
                eprint(record)
                continue
    except UnicodeDecodeError as e:
        eprint("unicode decode error: {}".format(e))
        eprint(record)
def main():
    try:
        for record in MARCReader(sys.stdin.buffer.read(), to_unicode=True):
            sys.stdout.write(
                json.dumps(transpose_to_ldj(record), sort_keys=True) + "\n")
            sys.stdout.flush()
    except UnicodeDecodeError as e:
        eprint("unicode decode error: {}".format(e))
        eprint(record)
    except pymarc.exceptions.RecordLengthInvalid as e:
        eprint("Invalid Record Length error: {}".format(e))
        eprint(record)
def get_gnid(rec):
    """
    Use geonames API (slow and quota limit for free accounts)
    """
    if not any("http://www.geonames.org" in s
               for s in rec.get("sameAs")) and rec["geo"].get(
                   "latitude") and rec["geo"].get("longitude"):
        changed = False
        r = requests.get("http://api.geonames.org/findNearbyJSON?lat=" +
                         rec["geo"].get("latitude") + "&lng=" +
                         rec["geo"].get("longitude") + "&username=slublod")
        if r.ok and isiter(r.json().get("geonames")):
            for geoNameRecord in r.json().get("geonames"):
                if rec.get("name") in geoNameRecord.get(
                        "name") or geoNameRecord.get("name") in rec.get("name"):
                    # match!
                    newSameAs = {
                        '@id': "https://sws.geonames.org/" +
                               str(geoNameRecord.get("geonameId")) + "/",
                        'publisher': {
                            'abbr': "geonames",
                            'preferredName': "GeoNames"
                        },
                        "isBasedOn": {
                            "@type": "Dataset",
                            "@id": "https://sws.geonames.org/" +
                                   str(geoNameRecord.get("geonameId")) + "/"
                        }
                    }
                    rec["sameAs"] = litter(rec.get("sameAs"), newSameAs)
                    changed = True
        else:
            if r.json().get("status").get("message").startswith(
                    "the hourly limit") or r.json().get("status").get(
                        "message").startswith("the daily limit"):
                eprint("Limit exceeded!\n")
                exit(0)
        if changed:
            return rec
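# Illustrative sketch (hypothetical geonames response): findNearbyJSON returns
# a structure like
#   {"geonames": [{"geonameId": 2935022, "name": "Dresden", ...}]}
# and a name match then produces the sameAs entry
#   {"@id": "https://sws.geonames.org/2935022/",
#    "publisher": {"abbr": "geonames", "preferredName": "GeoNames"},
#    "isBasedOn": {"@type": "Dataset", "@id": "https://sws.geonames.org/2935022/"}}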
def handleevent(schemaorg_attr, sourceRecord):
    ret = []
    schemaorg_name_mapping = {
        "startDate": "start",
        "endDate": "end",
        "location": "locaton",
        "name": "name",
        "alternateName": "acronym",
        "sponsor": "sponsor",
        "position": "number",
        "affiliation": "affiliation"
    }
    if "event" in sourceRecord:
        eprint(sourceRecord["event"])
        obj = {}
        for target, source in schemaorg_name_mapping.items():
            if source in sourceRecord["event"]:
                obj[target] = sourceRecord["event"][source]
        if obj:
            ret.append(obj)
    return ret if ret else None
def handlefile(attribut, record):
    retobj = []
    path = "metadata>mets:mets>mets:fileSec>mets:fileGrp"
    objects = getNestedJsonObject(record, path)
    if objects:
        for elem in objects:
            if elem.get("USE") == "DELETED":
                continue
            try:
                if elem.get("USE") == "DOWNLOAD" and elem.get(
                        "mets:file") and isinstance(elem["mets:file"], dict):
                    bnode = handlemetsfile(elem)
                    if bnode:
                        retobj.append(bnode)
                elif elem.get("mets:file") and isinstance(
                        elem["mets:file"], list):
                    for fd in elem["mets:file"]:
                        if fd.get("USE") == "DOWNLOAD":
                            eprint(fd)
            except AttributeError:
                eprint(elem)
                exit(-1)
    return (attribut, retobj) if retobj else (None, None)
def get_context(con_dict, con_url):
    if con_url not in con_dict:
        if con_url in listcontexts:
            r = requests.get(listcontexts[con_url])
            if r.ok:
                con_dict[con_url] = r.json()
                eprint("got context from " + listcontexts[con_url])
            else:
                eprint("Error, could not get context from " + con_url)
                exit(-1)
        else:
            r = requests.get(con_url)
            if r.ok:
                con_dict[con_url] = r.json()
                eprint("got context from " + con_url)
                return
            eprint("Error, context unknown :( " + str(con_url))
            exit(-1)
def get_gnid_by_es(rec, host, port, index, typ):
    """
    Use local dump in Elasticsearch
    """
    if not any("http://www.geonames.org" in s
               for s in rec.get("sameAs")) and rec["geo"].get(
                   "latitude") and rec["geo"].get("longitude"):
        changed = False
        records = []
        searchbody = {
            "query": {
                "bool": {
                    "filter": {
                        "geo_distance": {
                            "distance": "0.1km",
                            "location": {
                                "lat": float(rec["geo"].get("latitude")),
                                "lon": float(rec["geo"].get("longitude"))
                            }
                        }
                    }
                }
            }
        }
        try:
            for record in esgenerator(headless=True,
                                      host=host,
                                      port=port,
                                      index=index,
                                      type=typ,
                                      body=searchbody):
                if record.get("name") in rec.get("preferredName") or rec.get(
                        "preferredName") in record.get("name") or len(
                            records) == 1 or rec.get(
                                "preferredName") in record.get("alternateName"):
                    newSameAs = {
                        '@id': "https://sws.geonames.org/" +
                               str(record.get("id")) + "/",
                        'publisher': {
                            'abbr': "geonames",
                            'preferredName': "GeoNames"
                        },
                        "isBasedOn": {
                            "@type": "Dataset",
                            "@id": "https://sws.geonames.org/" +
                                   str(record.get("id")) + "/"
                        }
                    }
                    rec["sameAs"] = litter(rec.get("sameAs"), newSameAs)
                    changed = True
        except elasticsearch.exceptions.RequestError as e:
            eprint(e, json.dumps(searchbody, indent=4),
                   json.dumps(rec, indent=4))
            return
        if changed:
            return rec
        else:
            return None
def get_wpinfo(record):
    """
    * iterates through all sameAs links to extract a wikidata ID
    * requests the wikipedia sites connected to that wikidata ID
    * enriches wikipedia sites if they are within lookup_table_wpSites
      (i.e. currently German, English, Polish, Czech)
    * if we get a new wikipedia link from wikidata, but we already have an
      old entry from a source defined as obsolete, we delete the obsolete
      entry and append the new one
    * enriches multilingual names if they are within lookup_table_wpSites
    :returns None (if the record has not been changed)
             enriched record (dict, if the record has changed)
    :rtype dict
    """
    wd_uri = None
    wd_id = None
    for _id in [x["@id"] for x in record["sameAs"]]:
        if "wikidata" in _id:
            wd_uri = _id
            wd_id = wd_uri.split("/")[-1]
            break
    if not wd_id:
        return None
    headers = {
        'User-Agent': 'efre-lod-enrich-wikipedia-bot/0.1 '
                      '(https://github.com/slub/esmarc) '
                      'python-requests/2.22'
    }
    site_filter_param = '|'.join([x for x in lookup_table_wpSites])
    wd_response = requests.get("https://www.wikidata.org/w/api.php",
                               headers=headers,
                               params={
                                   'action': 'wbgetentities',
                                   'ids': wd_id,
                                   'props': 'sitelinks/urls',
                                   'format': 'json',
                                   'sitefilter': site_filter_param
                               })
    if not wd_response.ok:
        eprint("wikipedia: Connection Error {status}: \'{message}\'".format(
            status=wd_response.status_code, message=wd_response.content))
        return None
    # related wikipedia links:
    try:
        sites = wd_response.json()["entities"][wd_id]["sitelinks"]
    except KeyError:
        eprint("wikipedia: Data Error for Record:\n"
               "\'{record}\'\n\'{wp_record}\'".format(
                   record=record, wp_record=wd_response.content))
        return None
    # list of all abbreviations for publishers in the record's sameAs
    abbrevs = build_abbrevs(record["sameAs"])
    changed = False
    for wpAbbr, info in sites.items():
        if wpAbbr in lookup_table_wpSites:
            wikip_url = info["url"]
            newSameAs = {
                "@id": wikip_url,
                "publisher": lookup_table_wpSites[wpAbbr],
                "isBasedOn": {
                    "@type": "Dataset",
                    "@id": wd_uri
                }
            }
            # wikipedia sameAs link enrichment
            if wpAbbr not in abbrevs:
                record["sameAs"].append(newSameAs)
                changed = True
            # we already have a wikipedia link for that language, but the
            # originating data source is obsolete, so we update
            elif abbrevs.get(
                    wpAbbr) and abbrevs[wpAbbr]["host"] in obsolete_isBasedOns:
                record["sameAs"][abbrevs[wpAbbr]["pos"]] = newSameAs
                changed = True
            # multilingual name object enrichment
            if not record.get("name"):
                record["name"] = {}
            cc = wpAbbr[:2]  # countrycode
            if cc not in record["name"]:
                record["name"][cc] = [info["title"]]
                changed = True
            if info["title"] not in record["name"][cc]:
                record["name"][cc] = litter(record["name"][cc], info["title"])
                changed = True
    if changed:
        return record
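# Illustrative sketch of the data handled above (values are hypothetical):
# the wbgetentities response is reduced to its sitelinks, e.g.
#   sites = {"dewiki": {"title": "Johann Sebastian Bach",
#                       "url": "https://de.wikipedia.org/wiki/Johann_Sebastian_Bach"}}
# and each entry within lookup_table_wpSites becomes a sameAs object like
#   {"@id": "https://de.wikipedia.org/wiki/Johann_Sebastian_Bach",
#    "publisher": lookup_table_wpSites["dewiki"],
#    "isBasedOn": {"@type": "Dataset",
#                  "@id": "http://www.wikidata.org/entity/Q1339"}}
# plus a multilingual name entry under record["name"]["de"].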
def get_wpcategories(record):
    """
    * iterates through all sameAs links to extract the link(s) to the wiki site
    * requests the wikipedia categories linked to those links
    :returns None (if the record has not been changed)
             enriched record (dict, if the record has changed)
    :rtype dict
    """
    wp_uri = None
    wp_title = None
    cc = None  # countrycode
    changed = False
    retobj = {}
    for _id in [x["@id"] for x in record["sameAs"]]:
        if "wikipedia" in _id:
            wp_uri = _id
            wp_title = urllib.parse.unquote(wp_uri.split("/")[-1])
            cc = wp_uri.split("/")[2].split(".")[0]
            headers = {
                'User-Agent': 'lod-enrich-wikipedia-categories-bot/0.1'
                              '(https://github.com/slub/esmarc) '
                              'python-requests/2.22'
            }
            url = "https://{}.wikipedia.org/w/api.php".format(cc)
            wd_response = requests.get(url,
                                       headers=headers,
                                       params={
                                           'action': 'query',
                                           'generator': 'categories',
                                           'titles': wp_title,
                                           'gcllimit': 500,
                                           'prop': 'info',
                                           'format': 'json'
                                       })
            if not wd_response.ok:
                eprint("wikipedia-categories: Connection Error "
                       "{status}: \'{message}\'".format(
                           status=wd_response.status_code,
                           message=wd_response.content))
                return None
            # related wikipedia links:
            _base = "https://{}.wikipedia.org/wiki/".format(cc)
            try:
                pages = wd_response.json()["query"]["pages"]
                for page_id, page_data in pages.items():
                    _sameAs = _base + page_data["title"].replace(' ', '_')
                    _id = _base + "?curid={}".format(page_id)
                    # cutting off the substring 'Category:' or 'Kategorie:'
                    # from the beginning of the title for the name field
                    _name = ":".join(page_data["title"].split(":")[1:])
                    obj = {"@id": _id, "sameAs": _sameAs, "name": _name}
                    retobj[cc] = litter(retobj.get(cc), obj)
                    changed = True
            except KeyError:
                eprint("wikipedia-categories: Data Error for Record:\n"
                       "\'{record}\'\n\'{wp_record}\'".format(
                           record=record, wp_record=wd_response.content))
                return None
    if changed:
        record["category"] = retobj
        return record
    return None
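# Illustrative result (hypothetical page data): for a German wikipedia article
# the MediaWiki 'categories' generator yields pages such as
#   {"12345": {"pageid": 12345, "title": "Kategorie:Komponist"}}
# which are collected via litter() under record["category"]["de"] as objects like
#   {"@id": "https://de.wikipedia.org/wiki/?curid=12345",
#    "sameAs": "https://de.wikipedia.org/wiki/Kategorie:Komponist",
#    "name": "Komponist"}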
def main():
    # argstuff
    parser = ArgumentParser(
        description='Merging of local and title marc records in '
                    'MarcXchange Json format on ElasticSearch')
    parser.add_argument(
        '-title_host',
        type=str,
        help='hostname or IP-Address of the ElasticSearch-node to use. '
             'If None we try to read ldj from stdin.')
    parser.add_argument(
        '-title_port',
        type=int,
        default=9200,
        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-title_type', type=str,
                        help='ElasticSearch Type to use')
    parser.add_argument('-title_index', type=str,
                        help='ElasticSearch Index to use')
    parser.add_argument(
        '-title_server',
        type=str,
        help="use http://host:port/index/type/id?pretty syntax. "
             "overwrites host/port/index/id/pretty")
    parser.add_argument(
        '-local_host',
        type=str,
        help='hostname or IP-Address of the ElasticSearch-node to use. '
             'If None we try to read ldj from stdin.')
    parser.add_argument(
        '-local_port',
        type=int,
        default=9200,
        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-local_type', type=str,
                        help='ElasticSearch Type to use')
    parser.add_argument('-local_index', type=str,
                        help='ElasticSearch Index to use')
    parser.add_argument(
        '-local_server',
        type=str,
        help="use http://host:port/index/type/id?pretty syntax. "
             "overwrites host/port/index/id/pretty")
    parser.add_argument(
        '-selectbody',
        type=loads,
        default={"query": {
            "match": {
                "852.__.a.keyword": "DE-14"
            }
        }})
    parser.add_argument('-help', action="store_true", help="print this help")
    args = parser.parse_args()
    if args.help:
        parser.print_help(stderr)
        exit()
    if args.title_server:
        slashsplit = args.title_server.split("/")
        args.title_host = slashsplit[2].rsplit(":")[0]
        if isint(args.title_server.split(":")[2].rsplit("/")[0]):
            args.title_port = args.title_server.split(":")[2].split("/")[0]
        args.title_index = args.title_server.split("/")[3]
        if len(slashsplit) > 4:
            args.title_type = slashsplit[4]
    if args.local_server:
        slashsplit = args.local_server.split("/")
        args.local_host = slashsplit[2].rsplit(":")[0]
        if isint(args.local_server.split(":")[2].rsplit("/")[0]):
            args.local_port = args.local_server.split(":")[2].split("/")[0]
        args.local_index = args.local_server.split("/")[3]
        if len(slashsplit) > 4:
            args.local_type = slashsplit[4]
    if args.title_server or (args.title_host and args.title_port):
        td = Elasticsearch([{"host": args.title_host}], port=args.title_port)
    else:
        eprint("no server for title data submitted. exiting.")
        exit(-1)
    if args.local_server or (args.local_host and args.local_port):
        for records in esfatgenerator(host=args.local_host,
                                      port=args.local_port,
                                      index=args.local_index,
                                      type=args.local_type,
                                      body=args.selectbody,
                                      source="852,004,938"):
            ids = dict()
            for record in records:
                ids[record["_source"]["004"][0]] = {
                    "852": record["_source"]["852"],
                    "938": record["_source"]["938"]
                }
            try:
                titlerecords = td.mget(index=args.title_index,
                                       doc_type=args.title_type,
                                       body={"ids": [_id for _id in ids]})
            except NotFoundError:
                continue
            except RequestError:
                continue
            for record in titlerecords["docs"]:
                if "_source" in record:
                    for field in ["852", "938"]:
                        record["_source"][field] = ids[record["_id"]][field]
                    print(dumps(record["_source"]))
                else:
                    eprint(dumps(record))
    else:
        eprint("no server for local data submitted. exiting.")
        exit(-1)
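# Hypothetical invocation (script name and hosts are illustrative, not from
# the original repository):
#   python merge_marc.py \
#       -title_server http://localhost:9200/title-index/mrc \
#       -local_server http://localhost:9200/local-index/mrc \
#       -selectbody '{"query": {"match": {"852.__.a.keyword": "DE-14"}}}'
# For every local record, fields 852 and 938 are copied onto the matching
# title record (looked up via field 004) and the merged record is printed
# as one JSON line.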
obj = {
    '@id': sameAs,
    'publisher': {
        'abbr': value["abbr"],
        'preferredName': value["preferredName"]
    },
    "isBasedOn": {
        "@type": "Dataset"
    }
}
if "slubID" in value:
    obj["publisher"]["@id"] = value["slubID"]
if "sourceRecord" in value["isBasedOn"]:
    obj["isBasedOn"]["@id"] = rec["isBasedOn"]
elif "entityFacts" in value["isBasedOn"]:
    for sameAs in sameAsses:
        if "d-nb.info" in sameAs:
            obj["isBasedOn"]["@id"] = \
                "http://hub.culturegraph.org/entityfacts/{}".format(
                    sameAs.split("/")[-1])
            break
elif "sameAs" in value["isBasedOn"]:
    obj["isBasedOn"]["@id"] = obj["@id"]
if isinstance(obj, dict):
    rec["sameAs"].append(obj)
elif obj:
    eprint(obj)
else:
    eprint(sameAs)
rec["preferredName"] = rec.pop("name")
print(json.dumps(rec))
jline["001"]=elem if elem=="0021114284" or len(elem)>512: # this particulary FINC-MARC21 Record is broken and will break the whole toolchain eprint(elem) return None return jline def valid_mrc_fields(jline): if jline: for key in jline: if isint(key) and len(str(int(key)))>1: for elem in jline[key]: if isinstance(elem,str): return None return jline if __name__ == "__main__": for line in sys.stdin: try: jline=json.loads(line) except: eprint("corrupt json: "+str(line)) continue jline=fix_mrc_id(jline) jline=valid_mrc_fields(jline) if jline: sys.stdout.write(json.dumps(jline)+"\n") sys.stdout.flush()
def main():
    # argstuff
    parser = argparse.ArgumentParser(
        description='Entitysplitting/Recognition of MARC-Records')
    parser.add_argument('-host', type=str,
                        help='hostname or IP-Address of the ElasticSearch-node to use. '
                             'If None we try to read ldj from stdin.')
    parser.add_argument('-port', type=int, default=9200,
                        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-type', type=str, help='ElasticSearch Type to use')
    parser.add_argument('-index', type=str, help='ElasticSearch Index to use')
    parser.add_argument('-id', type=str, help='map single document, given by id')
    parser.add_argument('-help', action="store_true", help="print this help")
    parser.add_argument('-z', action="store_true",
                        help="use gzip compression on output data")
    parser.add_argument('-prefix', type=str, default="ldj/",
                        help='Prefix to use for output data')
    parser.add_argument('-debug', action="store_true",
                        help='Dump processed Records to stdout (mostly used for debug-purposes)')
    parser.add_argument('-server', type=str,
                        help="use http://host:port/index/type/id?pretty syntax. "
                             "overwrites host/port/index/id/pretty")
    parser.add_argument('-pretty', action="store_true", default=False,
                        help="output tabbed json")
    parser.add_argument('-w', type=int, default=8,
                        help="how many processes to use")
    parser.add_argument('-idfile', type=str,
                        help="path to a file with IDs to process")
    parser.add_argument('-query', type=str, default={},
                        help='prefilter the data based on an elasticsearch-query')
    parser.add_argument('-base_id_src', type=str,
                        default="http://swb.bsz-bw.de/DB=2.1/PPNSET?PPN=",
                        help="set up which base_id to use for sameAs. "
                             "e.g. http://d-nb.info/gnd/xxx")
    parser.add_argument('-target_id', type=str,
                        default="http://data.slub-dresden.de/",
                        help="set up which target_id to use for @id. "
                             "e.g. http://data.finc.info")
    # parser.add_argument('-lookup_host', type=str,
    #                     help="Target or Lookup Elasticsearch-host, where the result data is "
    #                          "going to be ingested to. Only used to lookup IDs (PPN) "
    #                          "e.g. http://192.168.0.4:9200")
    args = parser.parse_args()
    if args.help:
        parser.print_help(sys.stderr)
        exit()
    if args.server:
        slashsplit = args.server.split("/")
        args.host = slashsplit[2].rsplit(":")[0]
        if isint(args.server.split(":")[2].rsplit("/")[0]):
            args.port = args.server.split(":")[2].split("/")[0]
        args.index = args.server.split("/")[3]
        if len(slashsplit) > 4:
            args.type = slashsplit[4]
        if len(slashsplit) > 5:
            if "?pretty" in args.server:
                args.pretty = True
                args.id = slashsplit[5].rsplit("?")[0]
            else:
                args.id = slashsplit[5]
    if args.server or (args.host and args.port):
        es = elasticsearch.Elasticsearch([{"host": args.host}], port=args.port)
    global base_id
    global target_id
    base_id = args.base_id_src
    target_id = args.target_id
    if args.pretty:
        tabbing = 4
    else:
        tabbing = None
    if args.host and args.index and args.type and args.id:
        json_record = None
        source = get_source_include_str()
        json_record = es.get_source(index=args.index, doc_type=args.type,
                                    id=args.id, _source=source)
        if json_record:
            print(json.dumps(process_line(json_record, args.host, args.port,
                                          args.index, args.type),
                             indent=tabbing))
    elif args.host and args.index and args.type and args.idfile:
        setupoutput(args.prefix)
        pool = Pool(args.w, initializer=init_mp,
                    initargs=(args.host, args.port, args.prefix, args.z))
        for ldj in esidfilegenerator(host=args.host,
                                     port=args.port,
                                     index=args.index,
                                     type=args.type,
                                     source=get_source_include_str(),
                                     body=args.query,
                                     idfile=args.idfile):
            pool.apply_async(worker, args=(ldj,))
        pool.close()
        pool.join()
    elif args.host and args.index and args.type and args.debug:
        init_mp(args.host, args.port, args.prefix, args.z)
        for ldj in esgenerator(host=args.host,
                               port=args.port,
                               index=args.index,
                               type=args.type,
                               source=get_source_include_str(),
                               headless=True,
                               body=args.query):
            record = process_line(ldj, args.host, args.port, args.index,
                                  args.type)
            if record:
                for k in record:
                    print(json.dumps(record[k], indent=None))
    elif args.host and args.index and args.type:
        # if no id/idfile/debug is set, then work on the whole elasticsearch index
        setupoutput(args.prefix)
        pool = Pool(args.w, initializer=init_mp,
                    initargs=(args.host, args.port, args.prefix, args.z))
        for ldj in esfatgenerator(host=args.host,
                                  port=args.port,
                                  index=args.index,
                                  type=args.type,
                                  source=get_source_include_str(),
                                  body=args.query):
            pool.apply_async(worker, args=(ldj,))
        pool.close()
        pool.join()
    else:
        # no elasticsearch input-setup, so we use stdin
        eprint("No host/port/index specified, trying stdin\n")
        init_mp("localhost", "DEBUG", "DEBUG", "DEBUG")
        with io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') as input_stream:
            for line in input_stream:
                ret = process_line(json.loads(line), "localhost", 9200,
                                   "data", "mrc")
                if isinstance(ret, dict):
                    for k, v in ret.items():
                        print(json.dumps(v, indent=tabbing))
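# Hypothetical invocations (script name, host and paths are illustrative):
#   python esmarc.py -server http://localhost:9200/source-index/mrc -prefix output/ -w 8
# processes the whole index with a worker pool, while
#   cat records.ldj | python esmarc.py -pretty
# runs without any Elasticsearch connection and maps line-delimited MARC JSON
# from stdin, printing the resulting entities to stdout.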
def entityfacts(record, ef_instances):
    """
    Function to harvest GND entityfacts

    Look for connections to other entity providers in GND's
    entityfacts "sameAs" field

    :param record: json record probably containing GND entries in their
                   "sameAs" list field
    :type record: json object

    :param ef_instances: entityfacts-URL instances to query
    :type ef_instances: list of strings

    :returns: the enriched record, or None if nothing was added
    :rtype: json object
    """
    # abbreviations used by GND entityfacts and their
    # analogues in the SLUB LOD context
    abbreviations = {
        "DNB": "https://data.slub-dresden.de/organizations/514366265",
        "VIAF": "https://data.slub-dresden.de/organizations/100092306",
        "LC": "https://data.slub-dresden.de/organizations/100822142",
        "DDB": "https://data.slub-dresden.de/organizations/824631854",
        "WIKIDATA": "https://www.wikidata.org/wiki/Q2013",
        "BNF": "https://data.slub-dresden.de/organizations/188898441",
        "KXP": "https://data.slub-dresden.de/organizations/103302212",
        "dewiki": None,
        "enwiki": None,
        "DE-611": "https://data.slub-dresden.de/organizations/103675612",
        "geonames": None,
        "ISNI": None,
        "filmportal.de": None,
        "ORCID": None,
        "Portraitindex": None,
        "ARCHIV-D": None,
        "DE-M512": None,
        "ADB": None,
        "NDB": None,
        "OEBL": "https://data.slub-dresden.de/organizations/102972389",
        "CH_HLS": None,
        "LAGIS": "https://data.slub-dresden.de/organizations/100482600",
        "WIKISOURCE": None,
        "DE-28": "https://data.slub-dresden.de/organizations/100874770",
        "OSTDEBIB": None,
        "PACELLI": None,
        "FFMPL": "https://data.slub-dresden.de/organizations/236770764",
        "epidat": "https://data.slub-dresden.de/organizations/103039031",
        "BIOKLASOZ": "https://data.slub-dresden.de/organizations/100832873",
        "HISTORICUMNET": "https://data.slub-dresden.de/organizations/102398704"
    }
    if not isinstance(record.get("sameAs"), list):
        return None
    gnd_id = None
    for item in record.get("sameAs"):
        if "d-nb.info" in item["@id"] and len(item["@id"].split("/")) > 4:
            gnd_id = item["@id"].split("/")[-1]
    if not gnd_id:
        # no GND-ID - nothing to enrich
        return None
    old_rec_sameAs_len = len(str(record["sameAs"]))
    for url in ef_instances:
        r = requests.get(url + str(gnd_id))
        if r.ok:
            data = r.json()
        else:
            # ID not found in the respective source, just continue
            continue
        sameAsses = []  # ba-dum-ts
        if data.get("_source"):
            # in Elasticsearch: data are in the "_source" field
            ef_sameAs = data.get("_source").get("sameAs")
        else:
            ef_sameAs = data.get("sameAs")
        if not ef_sameAs or not isinstance(ef_sameAs, list):
            continue
        for sameAs in ef_sameAs:
            id_ = sameAs.get("@id")
            # we can skip the DNB link as we already have it (and
            # used it to come here)
            if not id_ or id_.startswith("https://d-nb.info"):
                continue
            obj = {
                '@id': id_,
                'publisher': {
                    'abbr': sameAs["collection"]["abbr"],
                    'preferredName': sameAs["collection"]["name"]
                },
                'isBasedOn': {
                    '@type': "Dataset",
                    '@id': "http://hub.culturegraph.org/entityfacts/{}".format(gnd_id)
                }
            }
            # replace id with the SLUB LOD ids listed in abbreviations
            if obj["publisher"]["abbr"] in abbreviations:
                slub_id = abbreviations[obj["publisher"]["abbr"]]
                if slub_id:
                    obj["publisher"]["@id"] = slub_id
            else:
                # unknown identifier, report into error log
                eprint("entityfacts: Abbr. {} not known [GND-ID: {}]".format(
                    sameAs["collection"]["abbr"], gnd_id))
            sameAsses.append(obj)
        if sameAsses:
            record["sameAs"] = litter(record.get("sameAs"), sameAsses)
            break
    # compare length of the transformed record; if the new entry is larger
    # than the old one, it was updated
    new_rec_sameAs_len = len(str(record["sameAs"]))
    if new_rec_sameAs_len > old_rec_sameAs_len:
        return record
    elif new_rec_sameAs_len < old_rec_sameAs_len:
        eprint("entityfacts: new record shorter than old one… "
               "[GND-ID: {}]".format(gnd_id))
        return None
    else:
        return None
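# Illustrative sketch (hypothetical entityfacts payload): a sameAs entry such as
#   {"@id": "http://viaf.org/viaf/44298691",
#    "collection": {"abbr": "VIAF",
#                   "name": "Virtual International Authority File (VIAF)"}}
# is turned into
#   {"@id": "http://viaf.org/viaf/44298691",
#    "publisher": {"abbr": "VIAF",
#                  "preferredName": "Virtual International Authority File (VIAF)",
#                  "@id": "https://data.slub-dresden.de/organizations/100092306"},
#    "isBasedOn": {"@type": "Dataset",
#                  "@id": "http://hub.culturegraph.org/entityfacts/<gnd_id>"}}
# and appended to record["sameAs"] via litter().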