Exemplo n.º 1
0
 def search_api_LCS(self):
     """Look up ``self.name`` in the id.loc.gov "suggest" service.

     Every suggestion whose ``fuzz.token_sort_ratio`` against ``self.name``
     is greater than ``self.th`` is recorded as
     ``self.scores['LCS']['lcid'][<id>] = [label, score]``, where ``<id>`` is
     the suggestion URI with the ``http://id.loc.gov/authorities/names/``
     prefix removed. Failures are handed to ``PrintException`` rather than
     raised.

     Returns:
         ``self.scores`` when at least one suggestion matched, else ``None``.
     """
     # dictionary for storing scores for this API
     self.scores = {}
     # The API call url
     suggest = "http://id.loc.gov" + self.query_type + '/suggest/?q=' + urllib.parse.quote(
         self.name.encode('utf8'))
     try:
         # decode the JSON
         suggest_result = requests.get(suggest).json()
         # element 1 holds the labels, element 3 the URIs at the same index
         for n, candidateS in enumerate(suggest_result[1]):
             # get the URI (LC ID)
             uriS = suggest_result[3][n].replace(
                 'http://id.loc.gov/authorities/names/', '')
             self.scoreSU = fuzz.token_sort_ratio(candidateS, self.name)
             # keep the candidate only when it beats the cut-off score
             if self.scoreSU > self.th:
                 # setdefault keeps earlier matches; re-creating the nested
                 # dicts here would leave only the last match in the result
                 matches = self.scores.setdefault('LCS', {}).setdefault(
                     'lcid', {})
                 matches[uriS] = [candidateS, self.scoreSU]
     except Exception:
         PrintException(self.log_file, self.name)
     if len(self.scores) > 0:
         return self.scores
     return None
Exemplo n.º 2
0
 def search_api_LC(self):
     """Look up ``self.name`` in the id.loc.gov "didyoumean" service.

     The XML response is scanned for ``term`` elements in the
     ``http://id.loc.gov/ns/id_service#`` namespace. Each term whose
     ``fuzz.token_sort_ratio`` against ``self.name`` is greater than
     ``self.th`` is recorded as
     ``self.scores['LC']['lcid'][<uri attribute>] = [label, score]``.
     Failures are handed to ``PrintException`` rather than raised.

     Returns:
         ``self.scores`` when at least one term matched, else ``None``.
     """
     # dictionary for storing scores for this API
     self.scores = {}
     # The API call url
     dym = "http://id.loc.gov" + self.query_type + "/didyoumean/?label=" + urllib.parse.quote(
         self.name.encode('utf8'))
     try:
         dym_result = requests.get(dym)
         # get the results in form of a XML tree
         dym_results = ETree.fromstring(dym_result.content)
         for result in dym_results.iter(
                 '{http://id.loc.gov/ns/id_service#}term'):
             # get the "name"
             candidateD = result.text
             # get the URI (LC ID)
             uriD = result.get('uri')
             scoreD = fuzz.token_sort_ratio(candidateD, self.name)
             # keep the candidate only when it beats the cut-off score
             if scoreD > self.th:
                 # setdefault keeps earlier matches; re-creating the nested
                 # dicts here would leave only the last match in the result
                 matches = self.scores.setdefault('LC', {}).setdefault(
                     'lcid', {})
                 matches[uriD] = [candidateD, scoreD]
     except Exception:
         PrintException(self.log_file, self.name)
     if len(self.scores) > 0:
         return self.scores
     return None
Exemplo n.º 3
0
 def search_api_VF(self):
     """Look up ``self.name`` in the VIAF AutoSuggest service.

     Each item of the response's ``result`` list whose
     ``fuzz.token_sort_ratio`` (``term`` against ``self.name``) is greater
     than ``self.th`` is recorded as
     ``self.scores['VFS']['VIAFID'][<viafid>] = [term, score]``.
     Failures are handed to ``PrintException`` rather than raised.

     Returns:
         ``self.scores`` when at least one item matched, else ``None``.
     """
     # dictionary for storing scores for this API
     self.scores = {}
     # The API call url
     viaf = "http://viaf.org/viaf/AutoSuggest?query=" + urllib.parse.quote(
         self.name.encode('utf8'))
     try:
         # decode the JSON
         viaf_result = requests.get(viaf).json()
         # a falsy 'result' (nothing found) is treated as an empty list
         for item in viaf_result['result'] or []:
             candidateV = item['term']
             vid = item['viafid']
             scoreV = fuzz.token_sort_ratio(candidateV, self.name)
             # keep the candidate only when it beats the cut-off score
             if scoreV > self.th:
                 # setdefault keeps earlier matches; re-creating the nested
                 # dicts here would leave only the last match in the result
                 matches = self.scores.setdefault('VFS', {}).setdefault(
                     'VIAFID', {})
                 matches[vid] = [candidateV, scoreV]
     except Exception:
         PrintException(self.log_file, self.name)
     if len(self.scores) > 0:
         return self.scores
     return None
Exemplo n.º 4
0
 def convert_bibframe(self):
     """Run ``self.transform`` on ``self.doc`` and store the result.

     The result is kept in ``self.transformed``. If the transform raises,
     the error is handed to ``PrintException`` and ``self.transformed`` is
     left as ``None``.

     Returns:
         The transform result, or ``None`` when the transform failed.
     """
     # Reset first so a failed transform can neither return the result of a
     # previous document nor fail on a missing attribute.
     self.transformed = None
     try:
         self.transformed = self.transform(self.doc)
     except Exception:
         # there is no record name at this stage, so log an empty label
         PrintException(self.log_file, '')
     return self.transformed
Exemplo n.º 5
0
 def mapping(self):
     """Join matched identifiers with the source keys into ``self.final``.

     When ``self.type`` is ``'name'``: for every key of ``self.maxs``
     (``"<name>-_-_-<type>"``) an entry
     ``self.final[<name>] = {'keys': [...], 'scores': self.maxs[key]}`` is
     created, with the keys taken from ``self.source[key]['keys']``.

     When ``self.type`` is ``'title'``: for every key of ``self.results``
     the part after ``-_-_-`` is used as the title and the ``'OCLC'`` dict of
     the first result is read. ``oclcid`` entries become a flat list of
     ``'http://worldcat.org/oclc/<id>', score`` pairs and ``work_id`` entries
     a flat list of ``<id>, score`` pairs. Titles with neither are skipped.

     An error stops the loop and is handed to ``PrintException``; entries
     built so far are kept.

     Returns:
         ``self.final`` for the two types above; ``None`` for any other
         ``self.type``.
     """
     # label of the entry being processed, used only for the error log
     current = ''
     if self.type == 'name':
         try:
             # map the names in maxs dict with the original names dict (source)
             # and add the name,key,score combination to the final dict
             for i in self.maxs.keys():
                 current = i.split('-_-_-')[0]
                 self.final[current] = {
                     'keys': list(self.source[i]['keys']),
                     'scores': self.maxs[i],
                 }
         except Exception:
             PrintException(self.log_file, current)
         return self.final
     elif self.type == 'title':
         try:
             for i in self.results.keys():
                 title = i.split('-_-_-')[1]
                 current = title
                 oclc = self.results[i][0]['OCLC']
                 # Both lists start empty for every title, so identifiers of
                 # a previous title can never leak into this one.
                 oclcid = []
                 for oid, info in oclc.get('oclcid', {}).items():
                     oclcid.append('http://worldcat.org/oclc/' + oid)
                     oclcid.append(info[1])
                 work_id = []
                 for wid, info in oclc.get('work_id', {}).items():
                     work_id.append(wid)
                     work_id.append(info[1])
                 ID = {}
                 if oclcid:
                     ID['oclcid'] = oclcid
                 if work_id:
                     ID['work_id'] = work_id
                 if ID:
                     self.final[title] = {
                         'keys': list(self.source[title]['keys']),
                         'scores': ID,
                     }
         except Exception:
             PrintException(self.log_file, current)
         return self.final
     return None
Exemplo n.º 6
0
 def search_OCLC(self):
     """Search the WorldCat OpenSearch API for the title in ``self.query_type``.

     This API needs no query type, so ``self.query_type`` carries the title.
     For every Atom ``entry`` whose title matches ``self.query_type`` and
     whose author name matches ``self.name`` (both ``fuzz.token_sort_ratio``
     scores greater than ``self.th``) the method stores
     ``self.scores['OCLC']['oclcid'][<worldcat id>] = [title query, title score]``
     and then fetches ``http://experiment.worldcat.org/oclc/<id>.jsonld``.
     If a node of its ``@graph`` has ``exampleOfWork``, that value is stored
     under ``self.scores['OCLC']['work_id']`` with the mean of both scores.

     Each matching entry replaces the result of the previous one, so the
     last matching entry wins. Entries lacking an author name, title or id
     element are skipped. Failures are handed to ``PrintException``.

     Returns:
         ``self.scores`` when an entry matched, else ``None``.
     """
     # function-scope import so the title can be percent-encoded below
     import urllib.parse

     # dictionary for storing scores for this API
     self.scores = {}
     # import your OCLC developer key
     wskey = keys['OCLC-wskey'][0]
     atom = '{http://www.w3.org/2005/Atom}'
     # The API call url; the title is percent-encoded so characters such as
     # '&' or '#' cannot cut the query short
     OCLC = "http://www.worldcat.org/webservices/catalog/search/worldcat/opensearch?q=" + urllib.parse.quote(
         self.query_type.encode('utf8')) + "&wskey=%s" % (wskey)
     try:
         OCLC_result = requests.get(OCLC)
         # Parse the raw response bytes directly instead of going through a
         # shared temp file on disk.
         root = ETree.fromstring(OCLC_result.content)
         # iterate over the root element and get "title", "author" (name), and "id" (worldcat ID) for each entry
         for entry in root.iter(atom + 'entry'):
             author_el = entry.find(atom + 'author/' + atom + 'name')
             title_el = entry.find(atom + 'title')
             id_el = entry.find(atom + 'id')
             if author_el is None or title_el is None or id_el is None:
                 # an incomplete entry must not abort the remaining entries
                 continue
             author = author_el.text
             title = title_el.text
             entry_id = id_el.text
             # both the title and the author have to beat the cut-off score
             scoreTitle = fuzz.token_sort_ratio(title, self.query_type)
             if scoreTitle <= self.th:
                 continue
             scoreOCLC = fuzz.token_sort_ratio(author, self.name)
             if scoreOCLC <= self.th:
                 continue
             # use this score as the average score
             score = (scoreTitle + scoreOCLC) / 2
             # get the worldcat ID
             wid = entry_id.replace('http://worldcat.org/oclc/', '')
             # store the worldcat ID in the dict -- this ID is not used for enrichment at this point
             self.scores['OCLC'] = {
                 'oclcid': {wid: [self.query_type, scoreTitle]},
             }
             # create the query url and send it to worldcat to get back the JSON-LD
             workid = 'http://experiment.worldcat.org/oclc/' + wid + '.jsonld'
             # decode the JSON
             OCLC_res = requests.get(workid).json()
             # iterate over the JSON graph and find work_id (last one wins)
             work_id = ''
             for node in OCLC_res['@graph']:
                 if 'exampleOfWork' in node.keys():
                     work_id = node['exampleOfWork']
             if work_id != '':
                 self.scores['OCLC']['work_id'] = {
                     work_id: [self.query_type, score],
                 }
     except Exception:
         PrintException(self.log_file, self.name)
     if len(self.scores) > 0:
         return self.scores
     return None
Exemplo n.º 7
0
 def extract_names(self, transformed):
     """Collect titles and names from the tab-separated transform output.

     ``str(transformed)`` is split into lines and each line into tab-separated
     fields; empty fields are dropped. A line needs more than two fields to
     be used: field 0 is the title, field 1 the title key, followed by
     groups of three fields ``name, type, key``. The prefix
     ``http://id.loc.gov/ontologies/bibframe/`` is removed from the type.

     Results are stored on the instance:

     * ``self.titles[title] = {'keys': [...], 'authors': [...]}``
     * ``self.names["<name>-_-_-<type>"] = {'keys': [...]}``

     Args:
         transformed: object whose ``str()`` is the tab-separated text; also
             stored as ``self.transformed``.

     Returns:
         ``(self.names, self.titles, count, corp_names)`` where ``count`` is
         the number of name groups read and ``corp_names`` the number of
         distinct names whose type is not ``'Person'``. On an error the
         values gathered so far are returned and the error is handed to
         ``PrintException``.
     """
     self.names = {}
     self.titles = {}
     self.transformed = transformed
     count = 0
     corp_names = 0
     # bound up front so the error handler can always log a label
     name = ''
     try:
         for line in str(self.transformed).split("\n"):
             if line == '':
                 continue
             # Build a filtered copy: deleting while enumerating skips the
             # element after each deletion and leaves empty fields behind.
             fields = [field for field in line.split("\t") if field != '']
             if len(fields) <= 2:
                 continue
             title = fields[0]
             title_key = fields[1]
             title_entry = self.titles.setdefault(
                 title, {'keys': [], 'authors': []})
             title_entry['keys'].append(title_key)
             # the fields after title and title key come in groups of three
             for index in range((len(fields) - 2) // 3):
                 count += 1
                 start = (index * 3) + 2
                 name = fields[start]
                 name_type = fields[start + 1].replace(
                     'http://id.loc.gov/ontologies/bibframe/', '')
                 key = fields[start + 2]
                 checksum = name + "-_-_-" + name_type
                 if checksum not in self.names:
                     if name_type != 'Person':
                         corp_names += 1
                     self.names[checksum] = {'keys': []}
                 self.names[checksum]['keys'].append(key)
                 if checksum not in title_entry['authors']:
                     title_entry['authors'].append(checksum)
     except Exception:
         PrintException(self.log_file, name)
     return (self.names, self.titles, count, corp_names)
Exemplo n.º 8
0
 def maximizer(self):
     """Pick the best-scoring LC and VIAF identifier for every result key.

     ``self.results`` maps a key to a list of per-API dicts. Every
     ``'lcid'`` and ``'VIAFID'`` sub-dict maps an identifier to a list whose
     last element is the score. For each key, the identifier with the
     highest score above 0 is kept for each kind; on equal scores the first
     one seen stays. LC identifiers have ``'LC|'`` and spaces removed.

     The outcome is stored in ``self.maxs`` as
     ``{key: {'LC': [id, score], 'VIAF': [id, score]}}``. A kind without a
     hit is left out, and so is a key without any hit. An error stops the
     loop and is handed to ``PrintException``.

     Returns:
         ``self.maxs``.
     """
     # a dict for storing the maximum score for each API (only for VF and LC)
     self.maxs = {}
     # bound up front so the error handler can always log a label
     name = ''
     try:
         # iterate over the results dict
         for item in self.results.keys():
             # extract the name from the key
             name = item.split('-_-_-')[0]
             lc_id, lc_score = None, 0
             vf_id, vf_score = None, 0
             # for each name find the URI with maximum score for VIAF and LC
             for api_result in self.results[item]:
                 if not api_result:
                     # the search methods in this file return None when
                     # nothing matched; such entries carry no scores
                     continue
                 for api_scores in api_result.values():
                     for k, info in api_scores.get('lcid', {}).items():
                         if info[-1] > lc_score:
                             lc_id = k.replace('LC|', '').replace(' ', '')
                             lc_score = info[-1]
                     for k, info in api_scores.get('VIAFID', {}).items():
                         if info[-1] > vf_score:
                             vf_id = k
                             vf_score = info[-1]
             best = {}
             if lc_id is not None:
                 best['LC'] = [lc_id, lc_score]
             if vf_id is not None:
                 best['VIAF'] = [vf_id, vf_score]
             if best:
                 self.maxs[item] = best
     except Exception:
         PrintException(self.log_file, name)
     return self.maxs
Exemplo n.º 9
0
 def search_api_VFP(self):
     """Search VIAF personal names for ``self.name`` and collect VIAF/LC ids.

     For every record of ``searchRetrieveResponse.records`` the main
     headings (``recordData['v:mainHeadings']['data']``, either one dict or
     a list of dicts) are compared with ``self.name`` using
     ``fuzz.token_sort_ratio``. A heading counts when its score is greater
     than ``self.th`` and one of its ``sources.sid`` values (one string or a
     list) contains ``"LC"``; with several such values the last one is used.

     Each counting heading is recorded as
     ``self.scores['VFP']['VIAFID'][<viaf id>] = [heading, score]`` and
     ``self.scores['VFP']['lcid'][<sid>] = [heading, score]``.
     Failures are handed to ``PrintException`` rather than raised.

     Returns:
         ``self.scores`` when at least one heading counted, else ``None``.
     """
     # dictionary for storing scores for this API
     self.scores = {}
     # The API call url
     viafP = "https://viaf.org/viaf/search?query=local.personalNames+all+%22" + urllib.parse.quote(
         self.name.encode('utf8')
     ) + "%22&sortKeys=holdingscount&recordSchema=BriefVIAF&httpAccept=application/json"
     try:
         # decode the JSON
         viafP_result = requests.get(viafP).json()
         response = viafP_result['searchRetrieveResponse']
         # if there are records in the result dict, iterate over them and get the VIAF_ID
         if 'records' in response.keys():
             for records in response['records']:
                 record_data = records['record']['recordData']
                 viaf_id = record_data['viafID']['#text']
                 # a single heading arrives as a dict, several as a list;
                 # normalise so both shapes share one code path
                 headings = record_data['v:mainHeadings']['data']
                 if not isinstance(headings, list):
                     headings = [headings]
                 for heading in headings:
                     # compare the "name" from the API to the name from the BIBFRAME
                     candidateVP = heading['text']
                     scoreVP = fuzz.token_sort_ratio(candidateVP, self.name)
                     if scoreVP <= self.th:
                         continue
                     # same normalisation for the source ids
                     sids = heading['sources']['sid']
                     if not isinstance(sids, list):
                         sids = [sids]
                     lcid = ''
                     for sid in sids:
                         if "LC" in sid:
                             lcid = sid
                     if lcid == '':
                         continue
                     # setdefault keeps earlier matches; re-creating the
                     # nested dicts here would leave only the last match
                     vfp = self.scores.setdefault('VFP', {})
                     # add the VIAF_ID, the "name" from VIAF and matching score
                     vfp.setdefault('VIAFID', {})[viaf_id] = [
                         candidateVP, scoreVP
                     ]
                     # add the LCID
                     vfp.setdefault('lcid', {})[lcid] = [
                         candidateVP, scoreVP
                     ]
     except Exception:
         PrintException(self.log_file, self.name)
     if len(self.scores) > 0:
         return self.scores
     return None
Exemplo n.º 10
0
def write(final_names, final_titles, file, output, log_file, filename):
    """Write matched identifiers into the BIBFRAME XML and a TSV report.

    The XML in ``file`` is parsed, updated in memory and written to
    ``results/<filename>/enhanced-files/<output>``. One row per ingest key
    is appended to ``results/<filename>/TSVs/URIs-<filename>.tsv``.

    Names: for every entry of ``final_names`` the LC URI
    (``scores['LC'][0]``) and the VIAF URI (``scores['VIAF'][0]``) are built
    when present. Every ``bf:Agent`` element having an attribute equal to
    one of the entry's keys gets ``rdf:about`` set to the LC URI (if any)
    and a ``bf:identifiedBy/bf:IdentifiedBy/rdf:value`` child chain carrying
    the VIAF URI (if any).

    Titles: for every entry of ``final_titles`` every ``bf:Work`` element
    having an attribute that contains one of the entry's keys gets
    ``rdf:about`` set to ``scores['work_id'][0]`` (if any).

    An identifier that is missing is written as an empty TSV column and is
    not applied to the XML. Errors for a single entry are printed, handed to
    ``PrintException`` and do not stop the remaining entries.

    Args:
        final_names: dict of ``{name key: {'scores': {...}, 'keys': [...]}}``.
        final_titles: dict of ``{title: {'scores': {...}, 'keys': [...]}}``.
        file: source passed to ``ETree.parse``.
        output: file name of the enriched XML.
        log_file: passed through to ``PrintException``.
        filename: name used for the result folders and the TSV file.
    """
    bf_ns = '{http://id.loc.gov/ontologies/bibframe/}'
    rdf_ns = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
    rdf_about = rdf_ns + 'about'

    folder = 'results'
    os.makedirs(folder, exist_ok=True)
    clear_files(filename)
    print('writing ' + filename)
    #adding BIBFRAME namespaces
    ETree.register_namespace('bf', 'http://id.loc.gov/ontologies/bibframe/')
    ETree.register_namespace('bflc', 'http://id.loc.gov/ontologies/bflc/')
    ETree.register_namespace('rdfs', 'http://www.w3.org/2000/01/rdf-schema#')
    ETree.register_namespace('rdf',
                             'http://www.w3.org/1999/02/22-rdf-syntax-ns#')
    ETree.register_namespace('madsrdf', 'http://www.loc.gov/mads/rdf/v1#')
    enhanced = ETree.parse(file)
    clear_TSV(filename)
    root = enhanced.getroot()
    # Collected once: the loops below add no Agent or Work elements, so the
    # lists stay valid for the whole run.
    agents = list(root.iter(bf_ns + 'Agent'))
    works = list(root.iter(bf_ns + 'Work'))
    #writing extract names matching URIs into a TSV file
    tsv_file = 'results/%s/TSVs/URIs-%s.tsv' % (filename, filename)
    with open(tsv_file, "a") as tsv:
        tsv.write("ingest key" + "\t" + "viaf ID" + "\t" + "LC ID" + "\n")
        print("writing enriched names")
        for key in final_names.keys():
            name = key.split('-_-_-')[0]
            try:
                scores = final_names[key]['scores']
                # Start empty for every name so an identifier found for a
                # previous name is never attached to this one.
                LC = ''
                VF = ''
                #if the key (i.e. LCID or VIAF ID) exists for a certain name, create the URI
                if "LC" in scores:
                    LC = 'http://id.loc.gov/authorities/names/' + scores['LC'][0]
                if "VIAF" in scores:
                    VF = 'http://viaf.org/viaf/' + scores['VIAF'][0]
                #iterate over the extracted keys (names)
                for uri_key in final_names[key]['keys']:
                    tsv.write(uri_key + "\t" + VF + "\t" + LC + "\n")
                    #search the XML for "bf:Agent", if the 'example.org' key matches, insert the VF/LC URI
                    for element in agents:
                        # test first, change afterwards: the attributes must
                        # not be modified while they are being iterated
                        if not any(value == uri_key
                                   for value in element.attrib.values()):
                            continue
                        if LC != '':
                            element.set(rdf_about, LC)
                        if VF != '':
                            #add the VF element; namespace-qualified tags
                            #serialise with the prefixes registered above
                            a = ETree.SubElement(element,
                                                 bf_ns + 'identifiedBy')
                            b = ETree.SubElement(a, bf_ns + 'IdentifiedBy')
                            c = ETree.SubElement(b, rdf_ns + 'value')
                            c.set(rdf_about, VF)
            except Exception:
                print("could not find identfier for " + key)
                PrintException(log_file, name)
        print("writing enriched titles")
        #iterate over the extracted keys (titles)
        for title in final_titles.keys():
            try:
                scores = final_titles[title]['scores']
                # reset per title for the same reason as LC/VF above
                OCLC_ID = ''
                work_ID = ''
                if "oclcid" in scores:
                    OCLC_ID = scores['oclcid'][0]
                if "work_id" in scores:
                    work_ID = scores['work_id'][0]
                for uri_key in final_titles[title]['keys']:
                    tsv.write(uri_key + "\t" + OCLC_ID + "\t" + work_ID + "\n")
                    if work_ID == '':
                        continue
                    #search the XML for "bf:Work", if the 'example.org' key matches, insert the OCLC URI
                    for element in works:
                        if any(uri_key in value
                               for value in element.attrib.values()):
                            element.set(rdf_about, work_ID)
            except Exception:
                print("could not find identfier for " + title)
                # log the title itself, not the last name of the loop above
                PrintException(log_file, title)
    out = "results/%s/enhanced-files/%s" % (filename, output)
    enhanced.write(out)