def get_gnid(rec):
    """
    Enrich a record with GeoNames links found through the GeoNames web API.

    Nothing is done when rec["sameAs"] already holds an entry containing
    "http://www.geonames.org" or when rec["geo"] lacks a latitude or a
    longitude. Otherwise findNearbyJSON is queried and every returned place
    whose name contains the record's name (or vice versa) is added to
    rec["sameAs"] as "http://www.geonames.org/<geonameId>/".

    The process is terminated via exit(0) when the API reports that the
    hourly or daily limit was exceeded.

    :param rec: record dict with "sameAs", "geo" and "name"
    :returns: the modified record, or None if nothing was added
    """
    if any("http://www.geonames.org" in s for s in rec.get("sameAs")):
        return None

    geo = rec["geo"]
    latitude = geo.get("latitude")
    longitude = geo.get("longitude")
    if not latitude or not longitude:
        return None

    response = requests.get("http://api.geonames.org/findNearbyJSON?lat=" +
                            latitude + "&lng=" + longitude +
                            "&username=slublod")
    payload = response.json()
    found = False
    if response.ok and isiter(payload.get("geonames")):
        for candidate in payload.get("geonames"):
            own_name = rec.get("name")
            candidate_name = candidate.get("name")
            # substring match in either direction counts as a hit
            if own_name in candidate_name or candidate_name in own_name:
                link = ("http://www.geonames.org/" +
                        str(candidate.get("geonameId")) + "/")
                rec["sameAs"] = litter(rec.get("sameAs"), link)
                found = True
    else:
        message = payload.get("status").get("message")
        if message.startswith("the hourly limit") or message.startswith(
                "the daily limit"):
            eprint("Limit exceeded!\n")
            exit(0)
    return rec if found else None
Exemplo n.º 2
0
def get_wdid(_ids, rec):
    """
    Look up the Wikidata items belonging to a list of sameAs links and
    attach them to the record.

    Every link whose prefix is a key of lookup_table_wdProperty is turned
    into one SPARQL triple pattern; all patterns are combined with UNION
    and sent to the Wikidata query service. Each returned item is added to
    rec["sameAs"].

    :param _ids: list of sameAs links, e.g.
                 ['https://d-nb.info/gnd/118827545',
                  'http://viaf.org/viaf/44298691']
    :param rec: record dict whose "sameAs" entry gets extended
    :returns: the modified record, or None if _ids is no list, no link
              could be mapped, the request failed or nothing was found
    """
    if not isinstance(_ids, list):
        return None

    patterns = []
    for link in _ids:
        for prefix, spec in lookup_table_wdProperty.items():
            if not link.startswith(prefix):
                continue
            patterns.append("?item wdt:{Property} \"{value}\"".format(
                Property=spec["property"],
                value=link.split(spec["delim"])[-1]))
            # only the first matching prefix is used for a link
            break
    if not patterns:
        return None

    # A single pattern still yields a valid query, just without UNION.
    sparql = '''SELECT DISTINCT ?item \nWHERE {{\n\t{{ {UNION} }}\n}}'''.format(
        UNION="} UNION\n\t\t {".join(patterns))

    # Header according to Wikidata's User-Agent policy,
    # see: https://meta.wikimedia.org/wiki/User-Agent_policy
    user_agent = {
        'User-Agent':
        'efre-lod-enrich-wikidata-bot/0.1 '
        '(https://github.com/slub/esmarc) '
        'python-requests/2.22'
    }
    response = requests.get(
        "https://query.wikidata.org/bigdata/namespace/wdq/sparql",
        headers=user_agent,
        params={
            'query': sparql,
            'format': 'json'
        })
    if not response.ok:
        eprint("wikidata: Connection Error {status}: \'{message}\'".format(
            status=response.status_code, message=response.content))
        return None

    bindings = response.json().get("results").get("bindings")
    if not len(bindings) > 0:
        return None
    for binding in bindings:
        rec["sameAs"] = litter(
            rec["sameAs"], {
                "@id": binding.get("item").get("value"),
                "publisher": {
                    "@id": "https://www.wikidata.org/wiki/Q2013",
                    "abbr": "WIKIDATA",
                    "preferredName": "Wikidata"
                },
                "isBasedOn": {
                    "@type": "Dataset",
                    "@id": binding.get("item").get("value")
                }
            })
    return rec
Exemplo n.º 3
0
def get_subfields(jline,key,entity):
    """
    Run get_subfield() for one MARC field or for a list of MARC fields.

    :param key: a single field as str, or a list of fields
    :returns: the get_subfield() result for a str key, the merged results
              wrapped by ArrayOrSingleValue() for a list, None otherwise
    """
    if isinstance(key,str):
        return get_subfield(jline,key,entity)
    if not isinstance(key,list):
        return None
    merged=[]
    for single_key in key:
        merged=litter(merged,get_subfield(jline,single_key,entity))
    return ArrayOrSingleValue(merged)
Exemplo n.º 4
0
def get_subfield_if_4(jline,key,entity):
    """
    Like get_subfields(), but only for occurrences of a MARC field whose
    subfield "4" contains a given code.

    :param jline: record dict, MARC field -> list of {indicator: [subfields]}
    :param key: "<marcfield>^<code>"; the part before "^" is the field, the
                part after it is looked for in subfield "4"
    :param entity: passed through to get_subfields()
    :returns: ArrayOrSingleValue() of the collected data, or None if
              nothing was collected
    """
    #e.g. split "551^4:orta" to 551 and orta
    # NOTE(review): rsplit("^") on "551^4:orta" yields "4:orta", not "orta"
    # - confirm the key format callers actually pass.
    marcfield=key.rsplit("^")[0]
    subfield4=key.rsplit("^")[1]
    data=[]
    if marcfield in jline:
        for array in jline[marcfield]:
            for k,v in array.items():
                # merge all subfields of this occurrence into one dict
                sset={}
                for subfield in v:
                    for subfield_code in subfield:
                        sset[subfield_code]=litter(sset.get(subfield_code),subfield[subfield_code])
                if sset.get("4") and subfield4 in sset.get("4"):
                    # work on a copy so that the caller's record stays untouched
                    newrecord=copy.deepcopy(jline)
                    for i,subtype in enumerate(newrecord[marcfield]):
                        # NOTE(review): only the "__" indicator key is looked
                        # at; an occurrence without it makes .get("__") return
                        # None and the loop below raise TypeError - confirm.
                        for elem in subtype.get("__"):
                            # drop occurrences carrying a different "4" code
                            # NOTE(review): a second mismatching elem in the
                            # same occurrence would hit an already deleted key
                            if elem.get("4") and subfield4!=elem["4"]:
                                del newrecord[marcfield][i]["__"]
                    data=litter(get_subfields(newrecord,marcfield,entity),data)
    if data:
        return ArrayOrSingleValue(data)
def handleaboutelem(attribut, obj):
    """
    Turn one classification/subject element into "about" node(s).

    Supported values of obj["authority"] (case-insensitive):

    * rvk  - comma separated RVK notations, one node per notation
    * ddc  - comma separated DDC notations, one node per notation
    * z    - free text, optionally tagged with obj["lang"]
    * sswd - free text, always tagged as "ger"

    :param attribut: name of the target attribute (not used here)
    :param obj: dict with "authority", "_" (the value) and optionally "lang"
    :returns: the node(s) accumulated through litter(), or None when the
              authority is missing/unknown or there is no value
    """
    # A missing authority is treated like an unknown one instead of raising.
    authority = (obj.get("authority") or "").lower()
    value = obj.get("_")
    if not value:
        return None

    retobj = []
    if authority == "rvk":
        for rvk in value.split(","):
            # Strip BEFORE encoding blanks: the blank following a comma must
            # not end up as a leading "%20" in the URI.
            notation = rvk.strip()
            if not notation:
                # stray comma, nothing to describe
                continue
            retobj = litter(
                retobj, {
                    "@id":
                    "https://rvk.uni-regensburg.de/api/json/ancestors/" +
                    notation.replace(" ", "%20"),
                    "identifier": {
                        "@type": "PropertyValue",
                        "propertyID": "RVK",
                        "@value": notation
                    }
                })
    elif authority == "ddc":
        for ddc in value.split(","):
            notation = ddc.strip()
            if not notation:
                continue
            retobj = litter(
                retobj, {
                    "identifier": {
                        "@type": "PropertyValue",
                        "propertyID": "DDC",
                        "@value": notation
                    },
                    # only the first three characters go into the URI
                    "@id":
                    "http://purl.org/NET/decimalised#c" + notation[:3]
                })
    elif authority == "z":
        newObj = {"@value": value, "@type": "schema:Text"}
        if obj.get("lang"):
            newObj["@language"] = obj["lang"]
        retobj = litter(retobj, newObj)
    elif authority == "sswd":
        retobj = litter(retobj, {
            "@value": value,
            "@type": "schema:Text",
            "@language": "ger"
        })
    return retobj if retobj else None
Exemplo n.º 6
0
def test_litter():
    """
    es2json.litter() merges two values (scalars, lists or None) into one
    list; a value already present is not added a second time.
    """
    cases = (
        ("foo", "bar", ["foo", "bar"]),
        (["foo", "bar"], "baz", ["foo", "bar", "baz"]),
        ("baz", ["foo", "bar"], ["baz", "foo", "bar"]),
        (None, ["foo", "bar", "baz"], ["foo", "bar", "baz"]),
        (["foo", "foobar"], ["bar", "baz"], ["foo", "foobar", "bar", "baz"]),
        (["foo", "foobar", "bar"], ["bar", "baz"],
         ["foo", "foobar", "bar", "baz"]),
    )
    for left, right, expected in cases:
        assert es2json.litter(left, right) == expected
def _contains(container, item):
    """Return `item in container`; missing or incompatible operands count as no match."""
    if container is None or item is None:
        return False
    try:
        return item in container
    except TypeError:
        return False


def get_gnid_by_es(rec, host, port, index, typ):
    """
    Enrich a record with GeoNames links looked up in an Elasticsearch index.

    Nothing is done when rec["sameAs"] already holds an entry containing
    "http://www.geonames.org" or when rec["geo"] lacks a latitude or a
    longitude. Otherwise all documents within 0.1km of the record's
    coordinates are fetched. A document is accepted when

    * its name contains the record's name or vice versa, or
    * it is the only hit, or
    * the record's name occurs in the document's "alternateName".

    Accepted documents are added to rec["sameAs"] as
    "http://www.geonames.org/<id>/".

    :param rec: record dict with "sameAs", "geo" and "name"
    :param host: Elasticsearch host, handed to esgenerator()
    :param port: Elasticsearch port, handed to esgenerator()
    :param index: Elasticsearch index, handed to esgenerator()
    :param typ: Elasticsearch doc type, handed to esgenerator()
    :returns: the modified record, or None if nothing was added or the
              search request was rejected
    """
    # a record without any sameAs simply has no GeoNames link yet
    if any("http://www.geonames.org" in s for s in rec.get("sameAs") or []):
        return None
    geo = rec["geo"]
    if not (geo.get("latitude") and geo.get("longitude")):
        return None

    searchbody = {
        "query": {
            "bool": {
                "filter": {
                    "geo_distance": {
                        "distance": "0.1km",
                        "location": {
                            "lat": float(geo.get("latitude")),
                            "lon": float(geo.get("longitude"))
                        }
                    }
                }
            }
        }
    }
    try:
        records = list(
            esgenerator(headless=True,
                        host=host,
                        port=port,
                        index=index,
                        type=typ,
                        body=searchbody))
    except elasticsearch.exceptions.RequestError as e:
        eprint(e, json.dumps(searchbody, indent=4),
               json.dumps(rec, indent=4))
        return None

    changed = False
    rec_name = rec.get("name")
    for record in records:
        hit_name = record.get("name")
        # _contains() keeps a hit without "name"/"alternateName" from
        # aborting the whole enrichment with a TypeError.
        if (_contains(rec_name, hit_name) or _contains(hit_name, rec_name)
                or len(records) == 1
                or _contains(record.get("alternateName"), rec_name)):
            rec["sameAs"] = litter(
                rec.get("sameAs"),
                "http://www.geonames.org/" + str(record.get("id")) + "/")
            changed = True
    return rec if changed else None
Exemplo n.º 8
0
def getmarc(record,regex,entity):
    """
    Extract values from a MARC record.

    Two kinds of selectors are handled:

    * a string containing "+": the first three characters name the field,
      the part after the last "." lists subfield codes separated by "+".
      For every occurrence of the field the listed subfields are joined
      with ". " into one string.
    * anything else (a string or a list of strings): every selector is
      handed to getmarcvalues() and the results are merged and passed
      through uniq().

    :returns: ArrayOrSingleValue() of what was found, or None
    """
    if "+" not in regex:
        selectors=[regex] if isinstance(regex,str) else regex
        found=[]
        for selector in selectors:
            if selector[:3] in record:
                found=litter(found,ArrayOrSingleValue(list(getmarcvalues(record,selector,entity))))
        if not found:
            return None
        if isinstance(found,list):    # simple deduplication via uniq()
            found=list(uniq(found))
        return ArrayOrSingleValue(found)

    marcfield=regex[:3]
    if marcfield not in record:
        return None
    wanted=regex.split(".")[-1].split("+")
    data=None
    for occurrence in record.get(marcfield):
        for subfield_list in occurrence.values():
            # merge all subfields of this occurrence into one dict
            sset={}
            for subfield in subfield_list:
                for code in subfield:
                    sset[code]=litter(sset.get(code),subfield[code])
            fullstr=""
            for sf in wanted:
                if sf not in sset:
                    continue
                if fullstr:
                    fullstr+=". "
                part=sset[sf]
                if isinstance(part,str):
                    fullstr+=part
                elif isinstance(part,list):
                    fullstr+=". ".join(part)
            if fullstr:
                data=litter(data,fullstr)
    if data:
        return ArrayOrSingleValue(data)
    return None
Exemplo n.º 9
0
def _is_geonames_link(same_as):
    """Tell whether a sameAs entry (URI string or {"@id": ...} dict) points to GeoNames."""
    uri = same_as.get("@id") if isinstance(same_as, dict) else same_as
    return isinstance(uri, str) and any(
        marker in uri
        for marker in ("http://www.geonames.org", "sws.geonames.org"))


def get_gnid(rec):
    """
    Use geonames API (slow and quota limit for free accounts)

    Nothing is done when the record already has a GeoNames sameAs entry
    or when rec["geo"] lacks a latitude or a longitude. Otherwise
    findNearbyJSON is queried and every returned place whose name contains
    the record's name (or vice versa) is added to rec["sameAs"] as a dict
    with "@id" and "publisher".

    The process is terminated via exit(0) when the API reports that the
    hourly or daily limit was exceeded.

    :param rec: record dict with "sameAs", "geo" and "name"
    :returns: the modified record, or None if nothing was added
    """
    # The guard understands plain URI strings as well as the dict entries
    # written below; otherwise an enriched record is queried again on every
    # run and burns API quota.
    if any(_is_geonames_link(s) for s in rec.get("sameAs") or []):
        return None
    geo = rec["geo"]
    latitude = geo.get("latitude")
    longitude = geo.get("longitude")
    if not (latitude and longitude):
        return None

    # params= lets requests build and encode the query string, so numeric
    # coordinates work as well as strings.
    r = requests.get("http://api.geonames.org/findNearbyJSON",
                     params={
                         "lat": latitude,
                         "lng": longitude,
                         "username": "slublod"
                     })
    payload = r.json()
    places = payload.get("geonames")
    if r.ok and isiter(places):
        changed = False
        own_name = rec.get("name")
        for geoNameRecord in places:
            place_name = geoNameRecord.get("name")
            if own_name in place_name or place_name in own_name:  # match!
                geonames_uri = ("https://sws.geonames.org/" +
                                str(geoNameRecord.get("geonameId")) + "/")
                # NOTE(review): "isBasedOn" sits inside "publisher" here -
                # confirm that this is the intended shape.
                newSameAs = {
                    '@id': geonames_uri,
                    'publisher': {
                        'abbr': "geonames",
                        'preferredName': "GeoNames",
                        "isBasedOn": {
                            "@type": "Dataset",
                            "@id": geonames_uri
                        }
                    }
                }
                rec["sameAs"] = litter(rec.get("sameAs"), newSameAs)
                changed = True
        return rec if changed else None

    # an error reply without status/message is simply "no result"
    message = (payload.get("status") or {}).get("message") or ""
    if message.startswith(("the hourly limit", "the daily limit")):
        eprint("Limit exceeded!\n")
        exit(0)
    return None
def map_record(sourceRecord):
    """
    Transform a source record according to the module-level `mapping`.

    `mapping` maps a target field to either

    * a callable, invoked as f(target_field, sourceRecord) and returning a
      (key, value) pair. A falsy value is skipped. With the key "__array__"
      the value is a list of dicts whose entries are merged into the
      result one by one; otherwise the value is stored under the key.
    * a string naming a source field, which is copied over if present.

    :param sourceRecord: the record to transform
    :returns: the transformed record with "@context" added, or an empty
              dict if nothing could be mapped
    """
    record = {}
    for target_value, source_value in mapping.items():
        if callable(source_value):
            key, value = source_value(target_value, sourceRecord)
            if not value:
                continue
            if key != "__array__":
                record[key] = value
                continue
            for elem in value:
                if isinstance(elem, dict):
                    for subkey, item in elem.items():
                        record[subkey] = litter(record.get(subkey), item)
        # This branch belongs to the callable() test. Attached to
        # `if value:` it could never run, because a callable is no str.
        elif isinstance(source_value, str) and source_value in sourceRecord:
            record[target_value] = sourceRecord[source_value]
    if record:
        record["@context"] = "http://schema.org"
    return record
Exemplo n.º 11
0
def marc_dates(record,event):
    """
    Pick a date from the occurrences of a MARC field.

    For every occurrence the subfields "a" and "4" are collected; each
    subfield-4 code starting with "dat" is remembered together with the
    value of subfield "a". The code "datx" is preferred over "datl".

    :param record: list of {indicator: [subfields]} dicts (may be empty/None)
    :param event: handed to dateToEvent() unchanged
    :returns: the result of dateToEvent() for the chosen date, or None
    """
    dates_by_code={}
    for occurrence in record or []:
        for indicator in occurrence:
            sset={}
            for sf_elem in occurrence.get(indicator):
                for code,val in sf_elem.items():
                    if code in ("a","4"):
                        sset[code]=litter(sset.get(code),val)
            codes=sset.get("4")
            if isinstance(codes,str):
                codes=[codes]
            if isinstance(codes,list):
                for code in codes:
                    if code.startswith("dat"):
                        dates_by_code[code]=sset.get("a")
    for preferred in ("datx","datl"):
        if dates_by_code.get(preferred):
            return dateToEvent(dates_by_code[preferred],event)
    return None
Exemplo n.º 12
0
def entityfacts(record,gnd,ef_instances):
    """
    Add the sameAs links known to an entityfacts instance to a record.

    The instances are asked in order; the first one answering successfully
    is used. Links starting with "http://d-nb.info" are skipped, all other
    "@id" values are merged into record["sameAs"].

    Any exception leads to a retry after a pause of 5 seconds; there is no
    upper limit for the number of retries.

    :param record: record dict to enrich
    :param gnd: GND number, appended to every instance URL
    :param ef_instances: iterable of entityfacts base URLs
    :returns: the modified record, or None if nothing was added
    """
    # Retry in a loop instead of by recursion: a long outage would
    # otherwise end in a RecursionError.
    while True:
        try:
            changed = False
            for url in ef_instances:
                r = get(url+str(gnd))
                if not r.ok:
                    continue
                data=r.json()
                # in Elasticsearch the document sits in the "_source" field
                if data.get("_source"):
                    data=data.get("_source")
                sameAsses=[] # ba-dum-ts
                if data.get("sameAs") and isinstance(data["sameAs"],list):
                    for sameAs in data.get("sameAs"):
                        # a non-dict entry used to raise AttributeError and
                        # thereby trigger an endless retry
                        if not isinstance(sameAs,dict):
                            continue
                        if sameAs.get("@id"):
                            if not sameAs.get("@id").startswith("http://d-nb.info"):
                                sameAsses.append(sameAs.get("@id"))
                if sameAsses:
                    record["sameAs"]=litter(record.get("sameAs"),sameAsses)
                    changed=True
                break
            return record if changed else None
        except Exception:
            time.sleep(5)
Exemplo n.º 13
0
def entityfacts(record, ef_instances):
    """ Harvest GND entityfacts for a record.

    The GND number is taken from the record's own "sameAs" list (an "@id"
    containing "d-nb.info"). The entityfacts instances are then asked in
    order; the first one that answers successfully and carries a non-empty
    "sameAs" list is used. Its links (except the DNB link itself) are
    merged into the record's "sameAs" list together with information about
    the publishing collection.

    :param record: json record probably containing GND entries
                   in their "sameAs" list field
    :type  record: json object

    :param ef_instances: entityfacts-URLs instances to query
    :type  ef_instances: list of strings

    :returns: the record if its "sameAs" field grew, None otherwise
    :rtype:   json object
    """
    # abbreviations used by GND entityfacts and their
    # analogue in the SLUB LOD context (None: no SLUB id available)
    abbreviations = {
        "DNB": "https://data.slub-dresden.de/organizations/514366265",
        "VIAF": "https://data.slub-dresden.de/organizations/100092306",
        "LC": "https://data.slub-dresden.de/organizations/100822142",
        "DDB": "https://data.slub-dresden.de/organizations/824631854",
        "WIKIDATA": "https://www.wikidata.org/wiki/Q2013",
        "BNF": "https://data.slub-dresden.de/organizations/188898441",
        "KXP": "https://data.slub-dresden.de/organizations/103302212",
        "dewiki": None,
        "enwiki": None,
        "DE-611": "https://data.slub-dresden.de/organizations/103675612",
        "geonames": None,
        "ISNI": None,
        "filmportal.de": None,
        "ORCID": None,
        "Portraitindex": None,
        "ARCHIV-D": None,
        "DE-M512": None,
        "ADB": None,
        "NDB": None,
        "OEBL": "https://data.slub-dresden.de/organizations/102972389",
        "CH_HLS": None,
        "LAGIS": "https://data.slub-dresden.de/organizations/100482600",
        "WIKISOURCE": None,
        "DE-28": "https://data.slub-dresden.de/organizations/100874770",
        "OSTDEBIB": None,
        "PACELLI": None,
        "FFMPL": "https://data.slub-dresden.de/organizations/236770764",
        "epidat": "https://data.slub-dresden.de/organizations/103039031",
        "BIOKLASOZ": "https://data.slub-dresden.de/organizations/100832873",
        "HISTORICUMNET": "https://data.slub-dresden.de/organizations/102398704"
    }

    if not isinstance(record.get("sameAs"), list):
        return None

    # the last matching d-nb.info link wins
    gnd_id = None
    for entry in record.get("sameAs"):
        uri = entry["@id"]
        if "d-nb.info" in uri and len(uri.split("/")) > 4:
            gnd_id = uri.split("/")[-1]

    if not gnd_id:
        # no GND-ID - nothing to enrich
        return None

    size_before = len(str(record["sameAs"]))
    for url in ef_instances:
        response = requests.get(url + str(gnd_id))
        if not response.ok:
            # ID not found in the respective source, try the next one
            continue
        payload = response.json()

        # in Elasticsearch: data are in the "_source" field
        source = payload.get("_source")
        if source:
            ef_sameAs = source.get("sameAs")
        else:
            ef_sameAs = payload.get("sameAs")
        if not (ef_sameAs and isinstance(ef_sameAs, list)):
            continue

        harvested = []
        for link in ef_sameAs:
            id_ = link.get("@id")

            # the DNB-link can be skipped, we already have it (and
            # used it to come here)
            if not id_ or id_.startswith("https://d-nb.info"):
                continue

            abbr = link["collection"]["abbr"]
            publisher = {
                'abbr': abbr,
                'preferredName': link["collection"]["name"]
            }
            if abbr not in abbreviations:
                # unknown identifier, report into error log
                eprint("entityfacts: Abbr. {} not known [GND-ID: {}]".format(
                    link["collection"]["abbr"], gnd_id))
            elif abbreviations[abbr]:
                # attach the SLUB LOD id listed in abbreviations
                publisher["@id"] = abbreviations[abbr]

            harvested.append({
                '@id': id_,
                'publisher': publisher,
                'isBasedOn': {
                    '@type':
                    "Dataset",
                    '@id':
                    "http://hub.culturegraph.org/entityfacts/{}".format(gnd_id)
                }
            })

        if harvested:
            record["sameAs"] = litter(record.get("sameAs"), harvested)
        break

    # The string length of the sameAs field serves as change indicator:
    # only a longer field counts as an update.
    size_after = len(str(record["sameAs"]))
    if size_after > size_before:
        return record
    if size_after < size_before:
        eprint("entityfacts: new record shorter than old one… "
               "[GND-ID: {}]".format(gnd_id))
    return None
Exemplo n.º 14
0
def get_subfield(jline,key,entity):
    """
    Build linked-data node(s) from every occurrence of one MARC field.

    For each occurrence the subfields are merged into one dict, from which
    a node is assembled:

    * "@id" / "sameAs" / "identifier" from subfield 0 (for resources
      subfield w is used when 0 is missing)
    * "name" from subfield a (values of length 1 are ignored)
    * "position" from subfield v (resources) or n (events)
    * "description" from subfield i
    * "@type" from subfield D or d

    :param jline: record dict, MARC field -> list of {indicator: [subfields]}
    :param key: MARC field number as string, e.g. "700"
    :param entity: not used in this function
    :returns: ArrayOrSingleValue() of the collected nodes, or None if the
              field is absent or no node could be built
    """
    keymap={"100":"persons",
            "700":"persons",
            "500":"persons",
            "711":"events",
            "110":"organizations",
            "710":"organizations",
            "551":"geo",
            "689":"topics",
            "550":"topics",
            "655":"topics",
            "830":"resources",
            }
    # NOTE(review): entityType is overwritten by subfield D/d below and is
    # not reset for the next occurrence - confirm that this is intended.
    entityType=keymap.get(key)
    data=[]
    if key in jline:
        for array in jline[key]:
            for k,v in array.items():
                # merge all subfields of this occurrence into one dict
                sset={}
                for subfield in v:
                    for subfield_code in subfield:
                        sset[subfield_code]=litter(sset.get(subfield_code),subfield[subfield_code])
                node={}
                # the entity type given in the data overrides the one
                # derived from the field number
                for typ in ["D","d"]:
                    if sset.get(typ):   #http://www.dnb.de/SharedDocs/Downloads/DE/DNB/wir/marc21VereinbarungDatentauschTeil1.pdf?__blob=publicationFile page 14
                        node["@type"]="http://schema.org/"
                        if sset.get(typ)=="p":
                            node["@type"]+="Person"
                            entityType="persons"
                        elif sset.get(typ)=="b":
                            node["@type"]+="Organization"
                            entityType="organizations"
                        elif sset.get(typ)=="f":
                            node["@type"]+="Event"
                            entityType="events"
                        elif sset.get(typ)=="u":
                            node["@type"]+="CreativeWork"
                        elif sset.get(typ)=="g":
                            node["@type"]+="Place"
                        else:
                            node.pop("@type")
                if entityType=="resources" and sset.get("w") and not sset.get("0"):
                    sset["0"]=sset.get("w")
                if sset.get("0"):
                    if isinstance(sset["0"],list) and entityType=="persons":
                        # Build a new list instead of pop()ing while
                        # enumerating: that skipped the element following
                        # each removed one.
                        sset["0"]=[elem for elem in sset["0"]
                                   if not (elem and "DE-576" in elem)]
                    uri=gnd2uri(sset.get("0"))
                    if isinstance(uri,str) and uri.startswith(base_id) and not entityType=="resources":
                        node["@id"]=id2uri(uri,entityType)
                    elif isinstance(uri,str) and uri.startswith(base_id) and entityType=="resources":
                        node["sameAs"]=base_id+id2uri(uri,entityType).split("/")[-1]
                    elif isinstance(uri,str) and uri.startswith("http") and not uri.startswith(base_id):
                        node["sameAs"]=uri
                    elif isinstance(uri,str):
                        node["identifier"]=uri
                    elif isinstance(uri,list):
                        node["sameAs"]=None
                        node["identifier"]=None
                        for elem in uri:
                            if isinstance(elem,str) and elem.startswith(base_id):
                                node["@id"]=id2uri(elem,entityType)
                            elif isinstance(elem,str) and elem.startswith("http") and not elem.startswith(base_id):
                                node["sameAs"]=litter(node["sameAs"],elem)
                            elif elem:
                                node["identifier"]=litter(node["identifier"],elem)
                if isinstance(sset.get("a"),str) and len(sset.get("a"))>1:
                    node["name"]=sset.get("a")
                elif isinstance(sset.get("a"),list):
                    for elem in sset.get("a"):
                        if len(elem)>1:
                            node["name"]=litter(node.get("name"),elem)

                if sset.get("v") and entityType=="resources":
                    node["position"]=sset["v"]
                if sset.get("i"):
                    node["description"]=sset["i"]
                if sset.get("n") and entityType=="events":
                    node["position"]=sset["n"]
                for typ in ["D","d"]:
                    if sset.get(typ):   #http://www.dnb.de/SharedDocs/Downloads/DE/DNB/wir/marc21VereinbarungDatentauschTeil1.pdf?__blob=publicationFile page 14
                        node["@type"]="http://schema.org/"
                        if sset.get(typ)=="p":
                            node["@type"]+="Person"
                        elif sset.get(typ)=="b":
                            node["@type"]+="Organization"
                        elif sset.get(typ)=="f":
                            node["@type"]+="Event"
                        elif sset.get(typ)=="u":
                            node["@type"]+="CreativeWork"
                        elif sset.get(typ)=="g":
                            node["@type"]+="Place"
                        else:
                            node.pop("@type")

                if node:
                    data=litter(data,node)
        if data:
            return  ArrayOrSingleValue(data)
Exemplo n.º 15
0
def get_wpcategories(record):
    """
    Attach the Wikipedia categories of a record's Wikipedia pages.

    * every sameAs link containing "wikipedia" is taken as a page link;
      language code and page title are derived from the link
    * the categories of that page are requested from the Wikipedia API
      of that language
    * the categories are stored under record["category"], grouped by
      language code

    The first failing request or unexpected answer aborts the whole
    enrichment.

    :returns None (if record has not been changed)
             enriched record (dict, if record has changed)
    :rtype dict
    """
    request_headers = {
        'User-Agent':
        'lod-enrich-wikipedia-categories-bot/0.1'
        '(https://github.com/slub/esmarc) '
        'python-requests/2.22'
    }
    categories = {}
    found = False
    for link in [entry["@id"] for entry in record["sameAs"]]:
        if "wikipedia" not in link:
            continue
        title = urllib.parse.unquote(link.split("/")[-1])
        cc = link.split("/")[2].split(".")[0]  # countrycode

        response = requests.get(
            "https://{}.wikipedia.org/w/api.php".format(cc),
            headers=request_headers,
            params={
                'action': 'query',
                'generator': 'categories',
                'titles': title,
                'gcllimit': 500,
                'prop': 'info',
                'format': 'json'
            })
        if not response.ok:
            eprint("wikipedia-categories: Connection Error "
                   "{status}: \'{message}\'".format(
                       status=response.status_code,
                       message=response.content))
            return None

        # base of the related wikipedia links
        wiki_base = "https://{}.wikipedia.org/wiki/".format(cc)
        try:
            for page_id, page_data in response.json()["query"]["pages"].items():
                category_title = page_data["title"]
                category = {
                    "@id": wiki_base + "?curid={}".format(page_id),
                    "sameAs": wiki_base + category_title.replace(' ', '_'),
                    # the name is the title without its leading
                    # 'Category:' / 'Kategorie:' part
                    "name": category_title.partition(":")[2]
                }
                categories[cc] = litter(categories.get(cc), category)
                found = True
        except KeyError:
            eprint("wikipedia-categories: Data Error for Record:\n"
                   "{record}\'\n\'{wp_record}\'".format(
                       record=record, wp_record=response.content))
            return None
    if not found:
        return None
    record["category"] = categories
    return record
Exemplo n.º 16
0
def process(record, gnd, server):
    """
    Enrich record["about"] with subject information of a GND record.

    The GND record is fetched from "<server>/gnd-records/record/<gnd>".
    For every field name in `map` that is present in the record's
    "_source", each value is turned into an "about" entry consisting of an
    "identifier" (PropertyValue) and, for http values, an "@id":

    * "fieldOfStudy": the referenced GND record is fetched as well; its
      "relatedDdcWithDegreeOfDeterminacy3" entries are added as DDC
      identifiers and its "preferredNameForTheSubjectHeading" as "name"
    * "gndSubjectCategory": the label is searched in
      "<server>/gnd-subjects/subject/_search" and used as "name" if there
      is exactly one hit

    An entry is merged into record["about"] unless an entry of an existing
    "about" list already refers to the same value.

    :param record: record dict to enrich
    :param gnd: GND number of the record
    :param server: base URL of the server holding the GND indices
    :returns: the modified record, or None if nothing was changed
    """
    change = False  # [0]   [1] [2]         [3]   [4,-1]
    # http: / / d-nb.info / gnd / 102859268X get the GND number
    record_url = "{}/gnd-records/record/{}".format(server, gnd)
    r = requests.get(record_url)
    if r.ok:
        # NOTE(review): `map` is a name from outside this function that
        # shadows the builtin - presumably a collection of GND field
        # names; confirm.
        for gndItem in map:
            if r.json().get("_source").get(gndItem):
                for elem in r.json().get("_source").get(gndItem):
                    value = elem
                    # bring plain strings into the {"id": ...} shape
                    if isinstance(elem, str):
                        elem = {"id": elem}
                    if isinstance(elem, dict):
                        if "id" in elem:
                            # last path segment of the id
                            newvalue = elem.get("id").split("/")[-1]
                            value = elem.get("id")
                        else:
                            continue
                    elif isinstance(elem, list):
                        continue
                    # NOTE(review): for an elem that is neither str, dict
                    # nor list, newvalue is unbound or left over from the
                    # previous iteration.
                    newabout = {
                        "identifier": {
                            "propertyID": gndItem,
                            "@type": "PropertyValue",
                            "value": newvalue
                        }
                    }
                    if value.startswith("http"):
                        newabout["@id"] = value
                    if gndItem == "fieldOfStudy":
                        fos = requests.get(server + "/gnd-records/record/" +
                                           newvalue)
                        if fos.ok and fos.json().get("_source").get(
                                "relatedDdcWithDegreeOfDeterminacy3"):
                            # the identifier becomes a list so that the DDC
                            # identifiers can be appended
                            newabout["identifier"] = [
                                newabout.pop("identifier")
                            ]
                            ddcs = fos.json().get("_source").get(
                                "relatedDdcWithDegreeOfDeterminacy3")
                            if isinstance(ddcs, dict):
                                ddcs = [ddcs]
                            if isinstance(ddcs, list):
                                for ddc in ddcs:
                                    if isinstance(ddc, str):
                                        ddc = {"id": ddc}
                                    # first three characters of the
                                    # second-to-last path segment of the id
                                    newabout["identifier"].append({
                                        "@type":
                                        "PropertyValue",
                                        "propertyID":
                                        "DDC",
                                        "value":
                                        ddc.get("id").split("/")[-2][:3]
                                    })
                                    # the "@id" of the last DDC entry wins
                                    newabout["@id"] = ddc.get("id")
                            if fos.json().get("_source").get(
                                    "preferredNameForTheSubjectHeading"):
                                newabout["name"] = fos.json().get(
                                    "_source").get(
                                        "preferredNameForTheSubjectHeading")
                    elif gndItem == "gndSubjectCategory":
                        url = server + "/gnd-subjects/subject/_search"
                        gsc = requests.post(
                            url,
                            json={"query": {
                                "match": {
                                    "@id.keyword": value
                                }
                            }})
                        # only an unambiguous hit is used
                        if gsc.ok and gsc.json().get("hits").get("total") == 1:
                            for hit in gsc.json().get("hits").get("hits"):
                                # collapse newlines and runs of whitespace
                                newabout["name"] = " ".join(
                                    hit.get("_source").get("skos:prefLabel").
                                    get("@value").replace("\n", "").split())
                    if not record.get("about"):
                        record["about"] = newabout
                        change = True
                    else:
                        plzAdd = True
                        # a single "about" dict referring to something else
                        # is turned into a list first
                        if isinstance(record.get("about"),
                                      dict) and record.get("about").get(
                                          "@id") and value not in record.get(
                                              "about").get("@id"):
                            record["about"] = [record.pop("about")]
                        elif isinstance(record.get("about"), list):
                            # do not add what an existing entry refers to
                            for item in record.get("about"):
                                if item.get("@id") and value in item.get(
                                        "@id"):
                                    plzAdd = False
                                elif isinstance(item.get("identifier"), list):
                                    for ident_list_elem in item.get(
                                            "identifier"):
                                        if ident_list_elem.get(
                                                "@id"
                                        ) and value in ident_list_elem.get(
                                                "@id"):
                                            plzAdd = False
                        if plzAdd:
                            change = True
                            record["about"] = litter(record["about"], newabout)
    return record if change else None
Exemplo n.º 17
0
def relatedTo(jline,key,entity):
    """
    Build relation nodes from one MARC field of a record.

    The first three characters of ``key`` select the MARC field. For every
    entry of that field the subfields are collected per subfield code
    (merged with ``litter``), and then:

    * subfield "9" selects the relation name stored in ``node["_key"]`` via
      the module-level ``marc2relation`` table,
    * subfield "0" is passed to ``gnd2uri``; the result ends up in "@id",
      "sameAs" or "identifier" depending on its prefix,
    * subfield "a" becomes ``node["name"]``.

    :param jline: record as a dict keyed by MARC field; each field value is
                  iterated as a sequence of dicts whose values are sequences
                  of ``{subfield_code: value}`` dicts
    :param key: mapping key; only ``key[:3]`` (the MARC field) is used here
    :param entity: not used inside this function
    :returns: ``ArrayOrSingleValue(data)`` for the collected nodes, or None
              (implicitly) if the field is absent or produced no node
    """
    #e.g. split "551^4:orta" to 551 and orta
    marcfield=key[:3]
    data=[]
    if marcfield in jline:
        for array in jline[marcfield]:
            # presumably k is the indicator pair and v the list of subfields
            # -- TODO confirm against the input format; k is not used
            for k,v in array.items():
                # sset: subfield code -> value(s); repeated codes are merged
                # by litter()
                sset={}
                for subfield in v:
                    for subfield_code in subfield:
                        sset[subfield_code]=litter(sset.get(subfield_code),subfield[subfield_code])
                # Case 1: exactly one $9 value that is a known relation code
                if isinstance(sset.get("9"),str) and sset.get("9") in marc2relation:
                    node={}
                    node["_key"]=marc2relation[sset["9"]]
                    if sset.get("0"):
                        uri=gnd2uri(sset.get("0"))
                        # own namespace -> "@id"; foreign http URI -> "sameAs";
                        # any other string -> "identifier" (raw $0 value)
                        if isinstance(uri,str) and uri.startswith(base_id):
                            node["@id"]=id2uri(sset.get("0"),"persons")
                        elif isinstance(uri,str) and uri.startswith("http") and not uri.startswith(base_id):
                            node["sameAs"]=uri
                        elif isinstance(uri,str):
                            node["identifier"]=sset.get("0")
                        elif isinstance(uri,list):
                            # both keys are pre-set to None and stay None in
                            # the node when no element fills them
                            node["sameAs"]=None
                            node["identifier"]=None
                            for elem in uri:
                                if elem and isinstance(elem,str) and elem.startswith(base_id):
                                    node["@id"]=id2uri(elem.split("=")[-1],"persons")
                                elif elem and isinstance(elem,str) and elem.startswith("http") and not elem.startswith(base_id):
                                    node["sameAs"]=litter(node["sameAs"],elem)
                                else:
                                    # NOTE(review): unlike the list branch
                                    # further down, falsy elements are passed
                                    # to litter() here as well -- confirm
                                    # whether that is intended
                                    node["identifier"]=litter(node["identifier"],elem)
                    if sset.get("a"):
                        node["name"]=sset.get("a")
                    data.append(node)
                # Case 2: several $9 values; the relation is picked by
                # case-insensitive substring match against marc2relation keys
                elif isinstance(sset.get("9"),list):
                    node={}
                    for elem in sset["9"]:
                        if elem.startswith("v"):
                            # first matching key wins; this loop rebinds the
                            # outer k and v, which is harmless because v was
                            # already consumed when building sset
                            for k,v in marc2relation.items():
                                if k.lower() in elem.lower():
                                    node["_key"]=v
                                    break
                        elif [x for x in marc2relation if x.lower() in elem.lower()]:
                            # no break: the last matching key wins
                            for x in marc2relation:
                                if x.lower() in elem.lower():
                                    node["_key"]=marc2relation[x]
                        elif not node.get("_key"):
                            # fallback relation when nothing matched so far
                            node["_key"]="relatedTo"
                        #eprint(elem,node)
                    # NOTE(review): node may have no "_key" here if a "v..."
                    # element matched no marc2relation key -- confirm callers
                    # cope with that
                    if sset.get("0"):
                        uri=gnd2uri(sset.get("0"))
                        if isinstance(uri,str) and uri.startswith(base_id):
                            node["@id"]=id2uri(sset.get("0"),"persons")
                        elif isinstance(uri,str) and uri.startswith("http") and not uri.startswith(base_id):
                            node["sameAs"]=uri
                        elif isinstance(uri,str):
                            # NOTE(review): Case 1 stores the raw $0 value
                            # here, this branch stores the gnd2uri() result
                            node["identifier"]=uri
                        elif isinstance(uri,list):
                            node["sameAs"]=None
                            node["identifier"]=None
                            for elem in uri:
                                # no isinstance(str) guard here, unlike Case 1
                                if elem and elem.startswith(base_id):
                                    node["@id"]=id2uri(elem.split("=")[-1],"persons")
                                elif elem and elem.startswith("http") and not elem.startswith(base_id):
                                    node["sameAs"]=litter(node["sameAs"],elem)
                                elif elem:
                                    node["identifier"]=litter(node["identifier"],elem)
                    if sset.get("a"):
                        node["name"]=sset.get("a")
                    data.append(node)
                    #eprint(node)

        # falls through (returns None) when no node was collected
        if data:
            return ArrayOrSingleValue(data)
def _add_field_of_study(newabout, server, elem):
    """
    Add DDC data from the GND record referenced by ``elem`` to ``newabout``.

    Nothing is changed unless the referenced record can be fetched and has a
    ``relatedDdcWithDegreeOfDeterminacy3`` entry. In that case the identifier
    becomes a list with an extra "DDC" PropertyValue, "name" is set from
    ``preferredNameForTheSubjectHeading`` (if present) and "@id" is replaced
    by the decimalised DDC URI.
    """
    fos = requests.get(server + "/gnd-records/record/" + elem.split("/")[-1])
    if not fos.ok:
        return
    fos_source = fos.json().get("_source") or {}
    related_ddc = fos_source.get("relatedDdcWithDegreeOfDeterminacy3")
    if not related_ddc:
        return
    # first three characters of the second-to-last path segment of the URI
    ddc = related_ddc[0].split("/")[-2][:3]
    newabout["identifier"] = [
        newabout.pop("identifier"),
        {
            "@type": "PropertyValue",
            "propertyID": "DDC",
            "value": ddc
        },
    ]
    preferred_name = fos_source.get("preferredNameForTheSubjectHeading")
    if preferred_name:
        newabout["name"] = preferred_name[0]
    newabout["@id"] = "http://purl.org/NET/decimalised#c" + ddc


def _add_subject_category_name(newabout, server, elem):
    """
    Look up ``elem`` in the gnd-subjects index and set ``newabout["name"]``.

    The name is only set when the search reports exactly one hit; whitespace
    in the label is collapsed to single blanks.
    """
    gsc = requests.post(
        server + "/gnd-subjects/subject/_search",
        json={"query": {
            "match": {
                "@id.keyword": elem
            }
        }})
    if not gsc.ok:
        return
    hits = gsc.json().get("hits")
    # NOTE(review): this compares hits.total with the integer 1; newer
    # Elasticsearch versions may return an object here instead -- confirm
    # against the deployed server version.
    if hits.get("total") == 1:
        for hit in hits.get("hits"):
            label = hit.get("_source").get("skos:prefLabel").get("@value")
            newabout["name"] = " ".join(label.replace("\n", "").split())


def process(record, dnb_uri, server):
    """
    Enrich ``record["about"]`` from the GND record that ``dnb_uri`` names.

    The GND record is fetched from ``server``; for every property listed in
    the module-level ``map`` each value becomes an "about" entry with a
    PropertyValue identifier (plus "@id" for http values). "fieldOfStudy"
    entries are extended with DDC data, "gndSubjectCategory" entries with
    their label. Entries whose URI already occurs in an "@id" of the existing
    "about" data are not added again.

    ``record`` is modified in place.

    :param record: the record (dict) to enrich
    :param dnb_uri: GND URI, e.g. http://d-nb.info/gnd/102859268X; only the
                    last path segment (the GND number) is used
    :param server: base URL of the Elasticsearch server holding the
                   gnd-records and gnd-subjects indices
    :returns: the record if something was added, otherwise None
    """
    change = False
    r = requests.get(server + "/gnd-records/record/" +
                     str(dnb_uri.split("/")[-1]))
    if not r.ok:
        return None
    # parse the response once instead of on every access
    source = r.json().get("_source") or {}
    # ``map`` is the module-level collection of GND properties to evaluate
    # (it shadows the builtin of the same name)
    for gndItem in map:
        for elem in source.get(gndItem) or []:
            newabout = {
                "identifier": {
                    "propertyID": gndItem,
                    "@type": "PropertyValue",
                    "value": elem.split("/")[-1]
                }
            }
            if elem.startswith("http"):
                newabout["@id"] = elem
            if gndItem == "fieldOfStudy":
                # NOTE(review): this may replace "@id" with a DDC URI, so the
                # duplicate check below (which looks for ``elem``) cannot
                # recognise such an entry on a later run -- confirm that
                # litter() takes care of exact duplicates.
                _add_field_of_study(newabout, server, elem)
            elif gndItem == "gndSubjectCategory":
                _add_subject_category_name(newabout, server, elem)

            about = record.get("about")
            if not about:
                record["about"] = newabout
                change = True
                continue

            plzAdd = True
            if isinstance(about, dict):
                if about.get("@id"):
                    if elem in about["@id"]:
                        # the single existing entry already is this subject;
                        # previously it was added a second time
                        plzAdd = False
                    else:
                        record["about"] = [record.pop("about")]
            elif isinstance(about, list):
                for item in about:
                    if item.get("@id") and elem in item.get("@id"):
                        plzAdd = False
                    elif isinstance(item.get("identifier"), list):
                        for ident_list_elem in item.get("identifier"):
                            if ident_list_elem.get(
                                    "@id") and elem in ident_list_elem.get(
                                        "@id"):
                                plzAdd = False
            if plzAdd:
                change = True
                record["about"] = litter(record["about"], newabout)
    return record if change else None
Exemplo n.º 19
0
def get_gnid_by_es(rec, host, port, index, typ):
    """
    Use local dump in Elasticsearch to add a GeoNames sameAs link.

    Searches the given index for entries within 0.1km of the record's
    coordinates and adds a ``https://sws.geonames.org/<id>/`` sameAs object
    for every hit whose name contains / is contained in the record's
    ``preferredName``, or whose ``alternateName`` contains it.

    Nothing is done (and None is returned) when the record already carries a
    GeoNames link, or lacks coordinates or a ``preferredName``.

    :param rec: the record (dict) to enrich; modified in place
    :param host: Elasticsearch host, passed on to ``esgenerator``
    :param port: Elasticsearch port, passed on to ``esgenerator``
    :param index: index holding the GeoNames dump
    :param typ: document type within that index
    :returns: the record if a link was added, otherwise None
    """
    # sameAs entries may be plain URIs or objects carrying the URI in "@id".
    # The previous check only did a key test on objects and looked for the
    # www host, so it never recognised the sws.geonames.org links written
    # below.
    same_as = rec.get("sameAs") or []
    if isinstance(same_as, (str, dict)):
        same_as = [same_as]
    for entry in same_as:
        uri = entry.get("@id") if isinstance(entry, dict) else entry
        if isinstance(uri, str) and "geonames.org" in uri:
            return None

    geo = rec.get("geo") or {}
    latitude = geo.get("latitude")
    longitude = geo.get("longitude")
    preferred = rec.get("preferredName")
    # without a name there is nothing to compare the hits with
    if not (latitude and longitude and preferred):
        return None

    changed = False
    searchbody = {
        "query": {
            "bool": {
                "filter": {
                    "geo_distance": {
                        "distance": "0.1km",
                        "location": {
                            "lat": float(latitude),
                            "lon": float(longitude)
                        }
                    }
                }
            }
        }
    }
    try:
        for record in esgenerator(headless=True,
                                  host=host,
                                  port=port,
                                  index=index,
                                  type=typ,
                                  body=searchbody):
            name = record.get("name")
            alternate = record.get("alternateName") or ()
            # an empty name must not match ("" is a substring of everything)
            name_match = bool(name) and (name in preferred
                                         or preferred in name)
            if not (name_match or preferred in alternate):
                continue
            geonames_uri = "https://sws.geonames.org/" + str(
                record.get("id")) + "/"
            newSameAs = {
                '@id': geonames_uri,
                'publisher': {
                    'abbr': "geonames",
                    'preferredName': "GeoNames",
                    "isBasedOn": {
                        "@type": "Dataset",
                        "@id": geonames_uri
                    }
                }
            }
            rec["sameAs"] = litter(rec.get("sameAs"), newSameAs)
            changed = True
    except elasticsearch.exceptions.RequestError as e:
        eprint(e, json.dumps(searchbody, indent=4),
               json.dumps(rec, indent=4))
        return None

    return rec if changed else None
def handleRoles(obj, rec):
    """
    Map one name object carrying a role onto ``{<role property>: {...}}``.

    The role is read from ``obj["role"]``: every ``roleTerm`` yielded by
    ``traverse`` whose stripped ``"_"`` text is a key of the lookup table
    selects the output property (e.g. "aut" -> "author"); the last match
    wins. The remaining keys of ``obj`` are translated through the same
    table, ``nameIdentifier`` entries are collected into "sameAs" via
    ``handleIdentifiers``, and givenName/familyName are merged into
    "name" as "family, given".

    :param obj: the name object (dict) to convert
    :param rec: the surrounding record; not used here, kept for callers
    :returns: dict with the role property as its single key, or None when
              no known role was found
    """
    kv_table = {
        "given":
        "givenName",
        "family":
        "familyName",
        "termsOfAddress":
        "honorificPrefix",
        'ID':
        '@id',
        'namePart':
        'name',
        "date":
        "birthDate",
        "aut":
        "author",
        "edt":
        "contributor",
        "pbl":
        "publisher",
        "dgg":
        "sourceOrganisation",
        "prv":
        "provider",
        "rev":
        "contributor",
        "dgs":
        "contributor",
        "ctb":
        "contributor",
        "oth":
        "contributor",
        "red":
        "contributor",
        "ill":
        "illustrator",
        "fnd":
        "funder",
        "cmp":
        "composer",
        "ths":
        "instructor",
        "sad":
        "contributor",
        "trl":
        "translator",
        "art":
        "artist",
        "Den akademischen Grad verleihende / prüfende Institution":
        "sourceOrganisation",
        'Den akademischen Grad verleihende Institution':
        "sourceOrganisation",
        'Den akademischen Grad verleihende/prüfende Institution':
        "sourceOrganisation",
    }

    role = ""
    if isinstance(obj.get("role"), dict):
        for k, v in traverse(obj["role"], ""):
            if k != "roleTerm" or not isinstance(v, dict):
                continue
            term = v.get("_")
            # strip() because the text may be padded with whitespace and
            # newlines, e.g. '\n                prv\n              '
            if isinstance(term, str) and term.strip() in kv_table:
                role = kv_table[term.strip()]
    if not role:
        return None

    entity = {}
    for key, value in obj.items():
        if key not in kv_table:
            continue
        if isinstance(value, str):
            entity[kv_table[key]] = value
        elif isinstance(value, list):
            # typed parts, e.g. {"type": "given", "_": "..."}
            for elem in value:
                if isinstance(elem, dict) and elem.get("type") in kv_table:
                    entity[kv_table[elem["type"]]] = elem.get("_")

    name_identifiers = obj.get("nameIdentifier")
    if isinstance(name_identifiers, dict):
        name_identifiers = [name_identifiers]
    if isinstance(name_identifiers, list):
        for elem in name_identifiers:
            # accumulate on the entity itself; reading from the outer result
            # dict (as before) always started from None, so only the last
            # identifier was kept
            entity["sameAs"] = litter(entity.get("sameAs"),
                                      handleIdentifiers(elem))

    if entity.get("givenName") and entity.get("familyName"):
        entity["name"] = entity.pop("familyName") + ", " + entity.pop(
            "givenName")
    return {role: entity}
Exemplo n.º 21
0
def get_wpinfo(record):
    """
    Enrich a record with Wikipedia links and titles found through Wikidata.

    * takes the first sameAs "@id" containing "wikidata" and uses its last
      path segment as the Wikidata ID
    * asks the Wikidata API for the sitelinks of that entity, restricted to
      the sites listed in lookup_table_wpSites
    * appends a sameAs object for every site the record has no link for yet;
      an existing link is replaced if its originating source is listed in
      obsolete_isBasedOns
    * adds the page title to the multilingual name object under the first
      two characters of the site abbreviation

    :returns None (if record has not been changed)
             enriched record (dict, if record has changed)
    :rtype dict
    """
    same_as_ids = [entry["@id"] for entry in record["sameAs"]]
    wd_uri = next((uri for uri in same_as_ids if "wikidata" in uri), None)
    wd_id = None if wd_uri is None else wd_uri.split("/")[-1]
    if not wd_id:
        return None

    request_headers = {
        'User-Agent':
        'efre-lod-enrich-wikipedia-bot/0.1 '
        '(https://github.com/slub/esmarc) '
        'python-requests/2.22'
    }
    query = {
        'action': 'wbgetentities',
        'ids': wd_id,
        'props': 'sitelinks/urls',
        'format': 'json',
        'sitefilter': '|'.join(lookup_table_wpSites)
    }
    wd_response = requests.get("https://www.wikidata.org/w/api.php",
                               headers=request_headers,
                               params=query)

    if not wd_response.ok:
        eprint("wikipedia: Connection Error {status}: \'{message}\'".format(
            status=wd_response.status_code, message=wd_response.content))
        return None

    # sitelinks of the entity, keyed by site abbreviation
    try:
        sites = wd_response.json()["entities"][wd_id]["sitelinks"]
    except KeyError:
        eprint("wikipedia: Data Error for Record:\n"
               "\'{record}\'\n\'{wp_record}\'".format(
                   record=record, wp_record=wd_response.content))
        return None

    # publisher abbreviations already present in the record's sameAs
    abbrevs = build_abbrevs(record["sameAs"])
    changed = False
    for wpAbbr, info in sites.items():
        if wpAbbr not in lookup_table_wpSites:
            continue

        newSameAs = {
            "@id": info["url"],
            "publisher": lookup_table_wpSites[wpAbbr],
            "isBasedOn": {
                "@type": "Dataset",
                "@id": wd_uri
            }
        }
        if wpAbbr not in abbrevs:
            # no link for this site yet
            record["sameAs"].append(newSameAs)
            changed = True
        elif abbrevs.get(
                wpAbbr) and abbrevs[wpAbbr]["host"] in obsolete_isBasedOns:
            # a link exists, but it stems from an obsolete source: replace it
            record["sameAs"][abbrevs[wpAbbr]["pos"]] = newSameAs
            changed = True

        # multilingual name object
        if not record.get("name"):
            record["name"] = {}
        names = record["name"]
        cc = wpAbbr[:2]  # countrycode
        title = info["title"]
        if cc not in names:
            names[cc] = [title]
            changed = True
        if title not in names[cc]:
            names[cc] = litter(names[cc], title)
            changed = True

    return record if changed else None