forked from cmharlow/lc-reconcile
-
Notifications
You must be signed in to change notification settings - Fork 0
/
reconcile.py
executable file
·147 lines (135 loc) · 4.38 KB
/
reconcile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""
An OpenRefine reconciliation service for the id.loc.gov LCNAF/LCSH suggest API.
"""
from flask import Flask, request, jsonify
from fuzzywuzzy import fuzz
import json
from operator import itemgetter
import rdflib
from rdflib.namespace import SKOS
import requests
from sys import version_info
import urllib
#Help text processing
import text
# Flask application object; the route and the loggers below hang off it.
app = Flask(__name__)

# True when running under Python 3; handed to the text helpers so they can
# make their unicode/str decisions.
PY3 = version_info[0] >= 3

# requests_cache is optional. When it is installed, a cache named
# 'fast_cache' is set up for outgoing HTTP requests; when it is missing the
# service simply runs without one.
try:
    import requests_cache
    requests_cache.install_cache('fast_cache')
except ImportError:
    app.logger.debug("No request cache found.")
# Combined service type: its index covers both authority files.
default_query = {
    "id": "/lc",
    "name": "LCNAF & LCSH",
    "index": "/authorities",
}

# Maps each reconciliation service type ("id"/"name") to the id.loc.gov
# index path it queries. The combined type is kept last and is the very
# same object as default_query.
refine_to_lc = [
    {
        "id": "/lc/names",
        "name": "Library of Congress Name Authority File",
        "index": "/authorities/names",
    },
    {
        "id": "/lc/subjects",
        "name": "Library of Congress Subject Headings",
        "index": "/authorities/subjects",
    },
    default_query,
]

# Client-facing view of the mappings: only "id" and "name", without the
# internal "index" path.
query_types = [
    {'id': mapping['id'], 'name': mapping['name']}
    for mapping in refine_to_lc
]

# Basic service metadata returned to reconciliation clients.
metadata = {
    "name": "LoC Reconciliation Service",
    "defaultTypes": query_types,
    "view": {"url": "{{id}}"},
}
def jsonpify(obj):
    """
    Build the HTTP response for obj, with JSONP support.

    When the current request has a 'callback' query argument, obj is dumped
    to JSON, wrapped as ``callback(...)`` and served as text/javascript.
    Without that argument the plain jsonify() response is returned.
    """
    callback = request.args.get('callback')
    if callback is None:
        return jsonify(obj)
    wrapped = "%s(%s)" % (callback, json.dumps(obj))
    response = app.make_response(wrapped)
    response.mimetype = "text/javascript"
    return response
def search(raw_query, query_type='/lc'):
    """
    Query the id.loc.gov suggest API and return scored candidates.

    Parameters:
        raw_query: the label to look up. It is normalized with
            text.normalize() and stripped before being sent and scored.
        query_type: the "id" of an entry in refine_to_lc. An id that is
            not listed there falls back to default_query.

    Returns:
        A list of at most three dicts with the keys "id" (the URI),
        "name" (the label), "score" (fuzzy token-sort ratio between the
        normalized query and label), "match" (True when score > 95) and
        "type", ordered by descending score. The list is empty when the
        HTTP request fails or the response cannot be read.
    """
    # Local import so the right quote() is picked on both Python 2 and 3
    # without relying on some other module having loaded urllib.parse.
    try:
        from urllib.parse import quote
    except ImportError:
        from urllib import quote

    out = []
    query = text.normalize(raw_query, PY3).strip()
    query_type_meta = [i for i in refine_to_lc if i['id'] == query_type]
    if not query_type_meta:
        # Must stay a one-element list: it is indexed with [0] just below
        # and is also returned as the "type" of every candidate.
        query_type_meta = [default_query]
    query_index = query_type_meta[0]['index']

    url = "http://id.loc.gov" + query_index + '/suggest/?q=' + quote(query)
    app.logger.debug("LC Authorities API url is " + url)
    try:
        # The timeout keeps a stalled upstream from hanging the worker.
        resp = requests.get(url, timeout=30)
        results = resp.json()
        # The code reads labels from position 1 and their URIs from
        # position 3 of the decoded response.
        names = results[1]
        uris = results[3]
    except (requests.exceptions.RequestException, ValueError,
            LookupError, TypeError) as e:
        # Network failure, non-JSON body, or a body of unexpected shape:
        # report it and answer with no candidates.
        app.logger.warning(e)
        return out

    for name, lc_uri in zip(names, uris):
        # Get score for label found
        score = fuzz.token_sort_ratio(query, text.normalize(name, PY3))
        # TODO: also score against the altLabels from the URI's SKOS
        # N-Triples once that part works (git branch "altlabel").
        match = score > 95
        app.logger.debug("Label is " + name + " Score is " + str(score) + " URI is " + lc_uri)
        out.append({
            "id": lc_uri,
            "name": name,
            "score": score,
            "match": match,
            "type": query_type_meta
        })

    # Sort by score, best first; Refine only will handle top three matches.
    out.sort(key=itemgetter('score'), reverse=True)
    return out[:3]
@app.route("/", methods=['POST', 'GET'])
def reconcile():
    """
    Single endpoint of the reconciliation service.

    A 'queries' form parameter is a JSON dictionary of (key, query) pairs
    representing a batch of queries; the response is a dictionary of
    (key, {"result": [...]}) pairs. When 'queries' is absent or empty, or
    when any query in the batch has no 'type', the service metadata is
    returned instead.
    """
    raw_batch = request.form.get('queries')
    if not raw_batch:
        # No batch supplied: answer with the service metadata.
        return jsonpify(metadata)

    batch = json.loads(raw_batch)
    answers = {}
    for key, query in batch.items():
        qtype = query.get('type')
        if qtype is None:
            # A query without a type ends the batch with the metadata.
            return jsonpify(metadata)
        candidates = search(query['query'], query_type=qtype)
        answers[key] = {"result": candidates}
    return jsonpify(answers)
if __name__ == '__main__':
    # Script entry point: parse the optional -d/--debug flag, then serve on
    # all network interfaces.
    from optparse import OptionParser

    cli = OptionParser()
    cli.add_option('-d', '--debug', action='store_true', default=False)
    options, _positional = cli.parse_args()
    app.debug = options.debug
    app.run(host='0.0.0.0')