def traverse(dict_or_list, path):
    iterator = None
    if isinstance(dict_or_list, dict):
        iterator = dict_or_list.items()
    elif isinstance(dict_or_list, list):
        iterator = enumerate(dict_or_list)
    elif isinstance(dict_or_list, str):
        strarr = []
        strarr.append(dict_or_list)
        iterator = enumerate(strarr)
    else:
        return
    if iterator:
        for k, v in iterator:
            if not isinstance(v, list):
                if isint(k):
                    yield path, v
                else:
                    yield path + str(k), v
            if isinstance(v, (dict, list)) and isint(k):
                for k, v in traverse(v, path):
                    yield k, v
            elif isinstance(v, (dict, list)):
                for k, v in traverse(v, path + str(k)):
                    yield k, v
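# Hedged usage sketch, not part of the original module: traverse() walks a
# nested dict/list structure and yields (path, value) pairs, appending
# non-numeric keys to the path and leaving numeric keys and list indices out.
# The helper name _demo_traverse and the sample record are made up for
# illustration; isint() is assumed to be the integer-check helper defined in
# this module.
def _demo_traverse():
    sample = {"001": "12345", "245": [{"10": [{"a": "A title"}]}]}
    return list(traverse(sample, ""))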
def getiso8601(date):
    p = re.compile(r'[\d|X].\.[\d|X].\.[\d|X]*')  # test if D(D).M(M).Y(YYY)
    m = p.match(date)
    datestring = ""
    if m:
        slices = list(reversed(date.split('.')))
        if isint(slices[0]):
            datestring += str(slices[0])
        for slice in slices[1:]:
            if isint(slice):
                datestring += "-" + str(slice)
            else:
                break
        return datestring
    else:
        return date  # was worth a try
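# Hedged usage sketch, not part of the original module: for a date in
# D(D).M(M).Y(YYY) form the pieces are reversed and joined with dashes, so
# "31.12.1999" is expected to come back as "1999-12-31"; strings that do not
# match the pattern are returned unchanged. The helper name _demo_getiso8601
# is made up for illustration.
def _demo_getiso8601():
    assert getiso8601("31.12.1999") == "1999-12-31"
    assert getiso8601("circa 1800") == "circa 1800"  # no match -> unchanged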
def transpose_to_marc21(record):
    Mrecord = Record(force_utf8=True)
    Mrecord.leader = record["_LEADER"]
    for field in record:
        if isint(field):
            if int(field) < 10:
                if isinstance(record[field], list):
                    for elem in record[field]:
                        Mrecord.add_field(Field(tag=field, data=elem))
                elif isinstance(record[field], str):
                    Mrecord.add_field(Field(tag=field, data=record[field]))
            else:
                for subfield in record[field]:
                    for ind, values in subfield.items():
                        indicators = []
                        subfields = []
                        for elem in values:
                            for k, v in elem.items():
                                if isinstance(v, str):
                                    subfields.append(k)
                                    subfields.append(v)
                                elif isinstance(v, list):
                                    for subfield_elem in v:
                                        subfields.append(k)
                                        subfields.append(subfield_elem)
                        for elem in ind:
                            indicators.append(elem)
                        Mrecord.add_field(Field(tag=str(field),
                                                indicators=indicators,
                                                subfields=subfields))
    return Mrecord.as_marc()
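# Hedged shape note, not part of the original module: the record argument is
# assumed to be the MarcXchange-style JSON produced by transpose_to_ldj()
# further down, i.e. control fields (tag < 10) as {"001": ["..."]} and data
# fields as {"245": [{"10": [{"a": "Some title"}]}]}, where the dict key "10"
# carries the two indicators (blanks and dots replaced by "_").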
def valid_mrc_fields(jline):
    if jline:
        for key in jline:
            if isint(key) and len(str(int(key))) > 1:
                for elem in jline[key]:
                    if isinstance(elem, str):
                        return None
    return jline
def get_source_include_str():
    items = set()
    items.add("079")
    for k, v in traverse(entities, ""):
        # eprint(k,v)
        if isinstance(v, str) and isint(v[:3]) and v not in items:
            items.add(v[:3])
    _source = ",".join(items)
    # eprint(_source)
    return _source
def handlesex(record, key, entity):
    for v in key:
        marcvalue = getmarc(v, record, entity)
        if isinstance(marcvalue, list):
            marcvalue = marcvalue[0]
    if isint(marcvalue):
        marcvalue = int(marcvalue)
    if marcvalue == 0:
        return "Unknown"
    elif marcvalue == 1:
        return "Male"
    elif marcvalue == 2:
        return "Female"
    elif marcvalue == 9:
        return None
def getnumberofpages(record, regex, entity):
    nop = getmarc(record, regex, entity)
    try:
        if isinstance(nop, str):
            nop = [nop]
        if isinstance(nop, list):
            for number in nop:
                if "S." in number and isint(number.split('S.')[0].strip()):
                    nop = int(number.split('S.')[0])
                else:
                    nop = None
    except IndexError:
        pass
    except Exception as e:
        with open("error.txt", "a") as err:
            print(e, file=err)
    return nop
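# Hedged expectation sketch, not part of the original module: assuming
# getmarc() returns an extent statement such as "240 S.", the text before
# "S." is cast to int, so the function should return 240; values without a
# parsable "S." part end up as None.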
def getdateModified(record, key, entity):
    date = getmarc(record, key, entity)
    newdate = ""
    if date:
        for i in range(0, 13, 2):
            if isint(date[i:i + 2]):
                newdate += date[i:i + 2]
            else:
                newdate += "00"
            if i in (2, 4):
                newdate += "-"
            elif i == 6:
                newdate += "T"
            elif i in (8, 10):
                newdate += ":"
            elif i == 12:
                newdate += "Z"
        return newdate
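# Hedged expectation sketch, not part of the original module: assuming
# getmarc() returns the raw 14-digit timestamp "20180312120000", the slicing
# above should yield the ISO 8601 string "2018-03-12T12:00:00Z"; two-character
# slices that are not numeric are padded with "00".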
def getisbn(record, regex, entity):
    isbns = getmarc(record, regex, entity)
    if isinstance(isbns, str):
        isbns = [isbns]
    elif isinstance(isbns, list):
        for i, isbn in enumerate(isbns):
            if "-" in isbn:
                isbns[i] = isbn.replace("-", "")
            if " " in isbn:
                for part in isbn.rsplit(" "):
                    if isint(part):
                        isbns[i] = part
    if isbns:
        retarray = []
        for isbn in isbns:
            if len(isbn) == 10 or len(isbn) == 13:
                retarray.append(isbn)
        return retarray
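# Hedged expectation sketch, not part of the original module: assuming
# getmarc() returns "978-3-16-148410-0", the dashes are stripped and the
# resulting 13-character value kept, so the function should return
# ["9783161484100"]; values that are neither 10 nor 13 characters long after
# cleanup are dropped.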
def transpose_to_ldj(record):
    json_record = {}
    json_record['_LEADER'] = record.leader
    json_record['_FORMAT'] = "MarcXchange"
    json_record['_TYPE'] = "Bibliographic"
    for field in record:
        if isint(field.tag):
            if field.is_control_field():
                json_record[field.tag] = [field.data]
            else:
                ind = "".join(field.indicators).replace(" ", "_")
                ind_obj = []
                for k, v in izip_longest(*[iter(field.subfields)] * 2):
                    if "." in ind:
                        ind = ind.replace(".", "_")
                    if "." in k or k.isspace():
                        k = "_"
                    ind_obj.append({k: v})
                if field.tag not in json_record:
                    json_record[field.tag] = []
                json_record[field.tag].append({ind: ind_obj})
    return json_record
def main():
    # argstuff
    parser = argparse.ArgumentParser(description='Entitysplitting/Recognition of MARC-Records')
    parser.add_argument('-host', type=str, help='hostname or IP-Address of the ElasticSearch-node to use. If None we try to read ldj from stdin.')
    parser.add_argument('-port', type=int, default=9200, help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-type', type=str, help='ElasticSearch Type to use')
    parser.add_argument('-index', type=str, help='ElasticSearch Index to use')
    parser.add_argument('-id', type=str, help='map single document, given by id')
    parser.add_argument('-help', action="store_true", help="print this help")
    parser.add_argument('-z', action="store_true", help="use gzip compression on output data")
    parser.add_argument('-prefix', type=str, default="ldj/", help='Prefix to use for output data')
    parser.add_argument('-debug', action="store_true", help='Dump processed Records to stdout (mostly used for debug-purposes)')
    parser.add_argument('-server', type=str, help="use http://host:port/index/type/id?pretty syntax. overwrites host/port/index/id/pretty")
    parser.add_argument('-pretty', action="store_true", default=False, help="output tabbed json")
    parser.add_argument('-w', type=int, default=8, help="how many processes to use")
    parser.add_argument('-idfile', type=str, help="path to a file with IDs to process")
    parser.add_argument('-query', type=str, default={}, help='prefilter the data based on an elasticsearch-query')
    parser.add_argument('-base_id_src', type=str, default="http://swb.bsz-bw.de/DB=2.1/PPNSET?PPN=", help="set up which base_id to use for sameAs. e.g. http://d-nb.info/gnd/xxx")
    parser.add_argument('-target_id', type=str, default="http://data.slub-dresden.de/", help="set up which target_id to use for @id. e.g. http://data.finc.info")
    # parser.add_argument('-lookup_host', type=str, help="Target or Lookup Elasticsearch-host, where the result data is going to be ingested to. Only used to lookup IDs (PPN) e.g. http://192.168.0.4:9200")
    args = parser.parse_args()
    if args.help:
        parser.print_help(sys.stderr)
        exit()
    if args.server:
        slashsplit = args.server.split("/")
        args.host = slashsplit[2].rsplit(":")[0]
        if isint(args.server.split(":")[2].rsplit("/")[0]):
            args.port = args.server.split(":")[2].split("/")[0]
        args.index = args.server.split("/")[3]
        if len(slashsplit) > 4:
            args.type = slashsplit[4]
        if len(slashsplit) > 5:
            if "?pretty" in args.server:
                args.pretty = True
                args.id = slashsplit[5].rsplit("?")[0]
            else:
                args.id = slashsplit[5]
    if args.server or (args.host and args.port):
        es = elasticsearch.Elasticsearch([{"host": args.host}], port=args.port)
    global base_id
    global target_id
    base_id = args.base_id_src
    target_id = args.target_id
    if args.pretty:
        tabbing = 4
    else:
        tabbing = None
    if args.host and args.index and args.type and args.id:
        json_record = None
        source = get_source_include_str()
        json_record = es.get_source(index=args.index, doc_type=args.type, id=args.id, _source=source)
        if json_record:
            print(json.dumps(process_line(json_record, args.host, args.port, args.index, args.type), indent=tabbing))
    elif args.host and args.index and args.type and args.idfile:
        setupoutput(args.prefix)
        pool = Pool(args.w, initializer=init_mp, initargs=(args.host, args.port, args.prefix, args.z))
        for ldj in esidfilegenerator(host=args.host,
                                     port=args.port,
                                     index=args.index,
                                     type=args.type,
                                     source=get_source_include_str(),
                                     body=args.query,
                                     idfile=args.idfile):
            pool.apply_async(worker, args=(ldj,))
        pool.close()
        pool.join()
    elif args.host and args.index and args.type and args.debug:
        init_mp(args.host, args.port, args.prefix, args.z)
        for ldj in esgenerator(host=args.host,
                               port=args.port,
                               index=args.index,
                               type=args.type,
                               source=get_source_include_str(),
                               headless=True,
                               body=args.query):
            record = process_line(ldj, args.host, args.port, args.index, args.type)
            if record:
                for k in record:
                    print(json.dumps(record[k], indent=None))
    elif args.host and args.index and args.type:  # if idfile/id not set, then iterate over elasticsearch
        setupoutput(args.prefix)
        pool = Pool(args.w, initializer=init_mp, initargs=(args.host, args.port, args.prefix, args.z))
        for ldj in esfatgenerator(host=args.host,
                                  port=args.port,
                                  index=args.index,
                                  type=args.type,
                                  source=get_source_include_str(),
                                  body=args.query):
            pool.apply_async(worker, args=(ldj,))
        pool.close()
        pool.join()
    else:  # oh noes, no elasticsearch input-setup. then we'll use stdin
        eprint("No host/port/index specified, trying stdin\n")
        init_mp("localhost", "DEBUG", "DEBUG", "DEBUG")
        with io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') as input_stream:
            for line in input_stream:
                ret = process_line(json.loads(line), "localhost", 9200, "data", "mrc")
                if isinstance(ret, dict):
                    for k, v in ret.items():
                        print(json.dumps(v, indent=tabbing))
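# Hedged illustration, not part of the original module: the -server handling
# above splits a URL such as "http://127.0.0.1:9200/myindex/mytype/123?pretty"
# on "/", so slashsplit[2] carries host(:port), [3] the index, [4] the type
# and [5] the id; a trailing "?pretty" is stripped from the id and switches
# on pretty-printed output.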
def run():
    parser = argparse.ArgumentParser(description='enrich ES by WD!')
    parser.add_argument('-host', type=str, default="127.0.0.1",
                        help='hostname or IP-Address of the ElasticSearch-node to use, default is localhost.')
    parser.add_argument('-port', type=int, default=9200,
                        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-index', type=str, help='ElasticSearch Search Index to use')
    parser.add_argument('-type', type=str, help='ElasticSearch Search Index Type to use')
    parser.add_argument("-id", type=str, help="retrieve single document (optional)")
    parser.add_argument('-stdin', action="store_true", help="get data from stdin")
    parser.add_argument('-pipeline', action="store_true",
                        help="output every record (even if not enriched) to put this script into a pipeline")
    # no, i don't steal the syntax from esbulk...
    parser.add_argument('-server', type=str,
                        help="use http://host:port/index/type/id?pretty. overwrites host/port/index/id/pretty")
    args = parser.parse_args()
    if args.server:
        slashsplit = args.server.split("/")
        args.host = slashsplit[2].rsplit(":")[0]
        if isint(args.server.split(":")[2].rsplit("/")[0]):
            args.port = args.server.split(":")[2].split("/")[0]
        args.index = args.server.split("/")[3]
        if len(slashsplit) > 4:
            args.type = slashsplit[4]
        if len(slashsplit) > 5:
            if "?pretty" in args.server:
                args.pretty = True
                args.id = slashsplit[5].rsplit("?")[0]
            else:
                args.id = slashsplit[5]
    if args.stdin:
        for line in sys.stdin:
            rec = json.loads(line)
            record = None
            if (rec and isinstance(rec.get("sameAs"), list)
                    and "wikidata.org" not in str(rec["sameAs"])):
                record = get_wdid([x["@id"] for x in rec["sameAs"]], rec)
            if record:
                rec = record
            if (record or args.pipeline) and rec:
                print(json.dumps(rec, indent=None))
    else:
        body = {
            "query": {
                "bool": {
                    "filter": {
                        "bool": {
                            "should": [],
                            "must_not": [
                                {"match": {"sameAs.publisher.abbr.keyword": "WIKIDATA"}}
                            ]
                        }
                    }
                }
            }
        }
        for key in lookup_table_wdProperty:
            body["query"]["bool"]["filter"]["bool"]["should"].append(
                {"prefix": {"*****@*****.**": key}})
        for rec in esgenerator(host=args.host,
                               port=args.port,
                               index=args.index,
                               type=args.type,
                               id=args.id,
                               headless=True,
                               body=body):
            record = None
            if rec.get("sameAs") and isinstance(rec.get("sameAs"), list):
                record = get_wdid([x["@id"] for x in rec["sameAs"]], rec)
            if record:
                rec = record
            if record or args.pipeline:
                print(json.dumps(rec, indent=None))
def run():
    parser = argparse.ArgumentParser(description='enrich ES by GN!')
    parser.add_argument('-host', type=str, default="127.0.0.1",
                        help='hostname or IP-Address of the ElasticSearch-node to use, default is localhost.')
    parser.add_argument('-port', type=int, default=9200,
                        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-index', type=str, help='ElasticSearch Search Index to use')
    parser.add_argument('-type', type=str, help='ElasticSearch Search Index Type to use')
    parser.add_argument("-id", type=str, help="retrieve single document (optional)")
    parser.add_argument('-stdin', action="store_true", help="get data from stdin")
    parser.add_argument('-pipeline', action="store_true",
                        help="output every record (even if not enriched) to put this script into a pipeline")
    # no, i don't steal the syntax from esbulk...
    parser.add_argument('-server', type=str,
                        help="use http://host:port/index/type/id?pretty. overwrites host/port/index/id/pretty")
    parser.add_argument('-searchserver', type=str,
                        default="http://127.0.0.1:9200/geonames/record",
                        help="search instance to use. default is -server e.g. http://127.0.0.1:9200")
    # index with geonames_data
    args = parser.parse_args()
    tabbing = None
    if args.server:
        slashsplit = args.server.split("/")
        args.host = slashsplit[2].rsplit(":")[0]
        if isint(args.server.split(":")[2].rsplit("/")[0]):
            args.port = args.server.split(":")[2].split("/")[0]
        args.index = args.server.split("/")[3]
        if len(slashsplit) > 4:
            args.type = slashsplit[4]
        if len(slashsplit) > 5:
            if "?pretty" in args.server:
                tabbing = 4
                args.id = slashsplit[5].rsplit("?")[0]
            else:
                args.id = slashsplit[5]
    if args.searchserver:
        slashsplit = args.searchserver.split("/")
        search_host = slashsplit[2].rsplit(":")[0]
        if isint(args.searchserver.split(":")[2].rsplit("/")[0]):
            search_port = args.searchserver.split(":")[2].split("/")[0]
        search_index = args.searchserver.split("/")[3]
        if len(slashsplit) > 4:
            search_type = slashsplit[4]
    if args.stdin:
        for line in sys.stdin:
            rec = json.loads(line)
            newrec = None
            if rec.get("geo") and "geonames" not in str(rec["sameAs"]):
                newrec = get_gnid_by_es(rec, search_host, search_port,
                                        search_index, search_type)
            if newrec:
                rec = newrec
            if args.pipeline or newrec:
                print(json.dumps(rec, indent=tabbing))
    else:
        for rec in esgenerator(
                host=args.host,
                port=args.port,
                index=args.index,
                type=args.type,
                headless=True,
                body={
                    "query": {
                        "bool": {
                            "filter": {
                                "bool": {
                                    "must_not": [
                                        {"prefix": {"*****@*****.**": "https://sws.geonames.org"}},
                                        {"prefix": {"*****@*****.**": "http://sws.geonames.org"}},
                                        {"prefix": {"*****@*****.**": "https://www.geonames.org"}},
                                        {"prefix": {"*****@*****.**": "http://www.geonames.org"}}
                                    ]
                                }
                            },
                            "must": {
                                "exists": {"field": "geo"}
                            }
                        }
                    }
                }):
            # newrec = get_gnid(rec)
            newrec = get_gnid_by_es(rec, search_host, search_port,
                                    search_index, search_type)
            if newrec:
                rec = newrec
            if args.pipeline or newrec:
                print(json.dumps(rec, indent=tabbing))
def run():
    """
    Parses the command-line configuration via the module-level argument
    parser ``_p`` and runs the entityfacts enrichment over stdin or an
    Elasticsearch index.

    :returns: None
    :rtype: None
    """
    args = _p.parse_args()
    ef_instances = ["http://hub.culturegraph.org/entityfacts/"]
    if args.server:  # overwrite args.host, args.port, args.index, [args.type]
        slashsplit = args.server.split("/")
        host = slashsplit[2].rsplit(":")[0]
        if isint(args.server.split(":")[2].rsplit("/")[0]):
            port = args.server.split(":")[2].split("/")[0]
        index = args.server.split("/")[3]
        if len(slashsplit) > 4:
            type = slashsplit[4]
    if args.ignhub and args.searchserver:
        ef_instances = []
    if args.searchserver:
        slashsplit = args.searchserver.split("/")
        search_host = slashsplit[2].rsplit(":")[0]
        search_port = int(args.searchserver.split(":")[2].split("/")[0])
        search_index = args.searchserver.split("/")[3]
        if len(slashsplit) > 4:
            search_type = slashsplit[4] + "/"
        url = "http://{h}:{p}/{i}/{t}".format(h=search_host,
                                              p=search_port,
                                              i=search_index,
                                              t=search_type)
        # prepend searchserver to entityfacts instances to use local
        # search first
        ef_instances = [url] + ef_instances
    if args.stdin:
        iterate = sys.stdin
    else:
        # use Elasticsearch Server for iteration
        es_query = {
            "query": {
                "prefix": {
                    "*****@*****.**": "https://d-nb.info"
                }
            }
        }
        iterate = esgenerator(host=host,
                              port=port,
                              index=index,
                              type=type,
                              headless=True,
                              body=es_query,
                              verbose=False)
    for rec_in in iterate:
        if args.stdin:
            rec_in = json.loads(rec_in)
        rec_out = entityfacts(rec_in, ef_instances)
        if rec_out:
            print(json.dumps(rec_out, indent=None))
        elif args.pipeline:
            print(json.dumps(rec_in, indent=None))
def main():
    # argstuff
    parser = ArgumentParser(
        description='Merging of local and title marc records in MarcXchange Json format on ElasticSearch')
    parser.add_argument('-title_host', type=str,
                        help='hostname or IP-Address of the ElasticSearch-node to use. If None we try to read ldj from stdin.')
    parser.add_argument('-title_port', type=int, default=9200,
                        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-title_type', type=str, help='ElasticSearch Type to use')
    parser.add_argument('-title_index', type=str, help='ElasticSearch Index to use')
    parser.add_argument('-title_server', type=str,
                        help="use http://host:port/index/type/id?pretty syntax. overwrites host/port/index/id/pretty")
    parser.add_argument('-local_host', type=str,
                        help='hostname or IP-Address of the ElasticSearch-node to use. If None we try to read ldj from stdin.')
    parser.add_argument('-local_port', type=int, default=9200,
                        help='Port of the ElasticSearch-node to use, default is 9200.')
    parser.add_argument('-local_type', type=str, help='ElasticSearch Type to use')
    parser.add_argument('-local_index', type=str, help='ElasticSearch Index to use')
    parser.add_argument('-local_server', type=str,
                        help="use http://host:port/index/type/id?pretty syntax. overwrites host/port/index/id/pretty")
    parser.add_argument('-selectbody', type=loads,
                        default={"query": {"match": {"852.__.a.keyword": "DE-14"}}})
    parser.add_argument('-help', action="store_true", help="print this help")
    args = parser.parse_args()
    if args.help:
        parser.print_help(stderr)
        exit()
    if args.title_server:
        slashsplit = args.title_server.split("/")
        args.title_host = slashsplit[2].rsplit(":")[0]
        if isint(args.title_server.split(":")[2].rsplit("/")[0]):
            args.title_port = args.title_server.split(":")[2].split("/")[0]
        args.title_index = args.title_server.split("/")[3]
        if len(slashsplit) > 4:
            args.title_type = slashsplit[4]
    if args.local_server:
        slashsplit = args.local_server.split("/")
        args.local_host = slashsplit[2].rsplit(":")[0]
        if isint(args.local_server.split(":")[2].rsplit("/")[0]):
            args.local_port = args.local_server.split(":")[2].split("/")[0]
        args.local_index = args.local_server.split("/")[3]
        if len(slashsplit) > 4:
            args.local_type = slashsplit[4]
    if args.title_server or (args.title_host and args.title_port):
        td = Elasticsearch([{"host": args.title_host}], port=args.title_port)
    else:
        eprint("no server for title data submitted. exiting.")
        exit(-1)
    if args.local_server or (args.local_host and args.local_port):
        for records in esfatgenerator(host=args.local_host,
                                      port=args.local_port,
                                      index=args.local_index,
                                      type=args.local_type,
                                      body=args.selectbody,
                                      source="852,004,938"):
            ids = dict()
            for record in records:
                # note: both keys are filled from the local 852 field here
                ids[record["_source"]["004"][0]] = {
                    "852": record["_source"]["852"],
                    "938": record["_source"]["852"]
                }
            try:
                titlerecords = td.mget(index=args.title_index,
                                       doc_type=args.title_type,
                                       body={"ids": [_id for _id in ids]})
            except NotFoundError:
                continue
            except RequestError:
                continue
            for record in titlerecords["docs"]:
                if "_source" in record:
                    for field in ["852", "938"]:
                        record["_source"][field] = ids[record["_id"]][field]
                    print(dumps(record["_source"]))
                else:
                    eprint(dumps(record))
    else:
        eprint("no server for local data submitted. exiting.")
        exit(-1)
"use http://host:port/index/type/id?pretty. overwrites host/port/index/id/pretty" ) #no, i don't steal the syntax from esbulk... parser.add_argument( '-searchserver', type=str, help= "search instance to use. default is -server e.g. http://127.0.0.1:9200" ) parser.add_argument('-idfile', type=str, help="path to a file with IDs to process") args = parser.parse_args() if args.server: slashsplit = args.server.split("/") args.host = slashsplit[2].rsplit(":")[0] if isint(args.server.split(":")[2].rsplit("/")[0]): args.port = args.server.split(":")[2].split("/")[0] args.index = args.server.split("/")[3] if len(slashsplit) > 4: args.type = slashsplit[4] if len(slashsplit) > 5: if "?pretty" in args.server: args.pretty = True args.id = slashsplit[5].rsplit("?")[0] else: args.id = slashsplit[5] if args.searchserver: slashsplit = args.searchserver.split("/") search_host = slashsplit[2].rsplit(":")[0] if isint(args.searchserver.split(":")[2].rsplit("/")[0]): search_port = args.searchserver.split(":")[2].split("/")[0]
def test_isint():
    assert es2json.isint("2")
    assert es2json.isint("2.5") is False
    assert es2json.isint(2)
    assert es2json.isint({"This is": "a dict"}) is False
    assert es2json.isint(["this", "is", "a", "list"]) is False