-
Notifications
You must be signed in to change notification settings - Fork 0
/
learner.py
260 lines (238 loc) · 8.86 KB
/
learner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
import requests
import json
from py2neo import neo4j, cypher
import os
from util import Util
from reldatastore import RelDataStore
from fastdatastore import FastDataStore
from constants import *
from urlparse import urlparse
from math import *
from extractor import Extractor
from parser import Parser
class Learner(Util):
    '''
    Keyword disambiguation and interest scoring backed by a Neo4j graph.

    Nodes are looked up through two legacy indices ('NODE' and the one named
    by self.DISAMBIGUATION); distances are weighted shortest paths over
    "sibling" relationships, obtained from the graph database REST API.

    Debug output is written with single-argument print(...) calls so that the
    emitted text is unchanged while the syntax stays valid on Python 2 and 3.
    '''

    def __init__(self):
        '''
        initializes:
        1. graph database connection
        2. datastore connection
        3. graph database indices required
        4. base url and credentials for interaction with the graph database REST API
        '''
        Util.__init__(self)
        # Credentials for the raw REST calls made through `requests`;
        # py2neo receives its own copy via neo4j.authenticate() below.
        self._graph_auth = None
        neo4j_url = os.environ.get('NEO4J_URL')
        if neo4j_url:
            graph_db_url = urlparse(neo4j_url)
            host_port = "{host}:{port}".format(host=graph_db_url.hostname, port=graph_db_url.port)
            neo4j.authenticate(host_port, graph_db_url.username, graph_db_url.password)
            self._graph_base_url = 'http://{0}/db/data'.format(host_port)
            self.graphdb = neo4j.GraphDatabaseService(self._graph_base_url)
            if graph_db_url.username:
                self._graph_auth = (graph_db_url.username, graph_db_url.password or '')
        else:
            # Same location the REST helpers previously hard-coded.
            self._graph_base_url = 'http://localhost:7474/db/data'
            self.graphdb = neo4j.GraphDatabaseService()
        self.node_index = self.graphdb.get_or_create_index(neo4j.Node, 'NODE')
        self.disambiguation_index = self.graphdb.get_or_create_index(neo4j.Node, self.DISAMBIGUATION)
        self.DataM = RelDataStore()

    def _url(self, present_node):
        ''' REST url of the node with id <present_node> on the configured server '''
        return '{0}/node/{1}'.format(self._graph_base_url, present_node)

    def _template(self, target_node):
        ''' request body asking for the least-"weight" dijkstra path over sibling relationships to <target_node> '''
        return {
            "to": self._url(target_node),
            "relationships": {
                "type": "sibling"
            },
            "cost_property": "weight",
            "algorithm": "dijkstra"
        }

    def _get_node(self, name):
        ''' node registered under <name> in the 'NODE' index (falsy when absent) '''
        return self.graphdb.get_indexed_node('NODE', 'name', name)

    def get_all_meanings(self, word):
        '''
        Return the candidate nodes for <word>, trying in order:
        1. neighbours of the disambiguation index entry named exactly <word>
        2. disambiguation index entries fuzzily matching <word> (lucene "<word>~")
        3. disambiguation index entries prefixed by <word> (lucene "<word>*")
        4. any node whose name contains <word>
        Returns an empty list when nothing matches.
        word => string keyword to get all possible neo4j objects for
        '''
        # NOTE(review): <word> is interpolated into the queries verbatim; quotes or
        # lucene/regex metacharacters in it will alter the query. Consider parameters.
        print("WORD: %s" % (word,))
        data, metadata = cypher.execute(
            self.graphdb,
            'start n=node:%s(name="%s") match n-[]-m return m' % (self.DISAMBIGUATION, word)
        )
        if data:
            print(data)
            return [row[0] for row in data]
        for lucene_query in ("name:%s~" % word, "name:%s*" % word):
            hits = self.disambiguation_index.query(lucene_query)
            if hits:
                print(hits)
                # Previously this returned rows of the (empty) cypher result above,
                # so index hits were always discarded.
                # NOTE(review): these are the index entries themselves; confirm whether
                # their neighbours should be returned instead, as in step 1.
                return list(hits)
        data, metadata = cypher.execute(
            self.graphdb,
            'START root=node(*) WHERE root.name=~".*%s.*" RETURN root' % word
        )
        if data:
            print(data)
            return [row[0] for row in data]
        return []

    def get_syn_root(self, word):
        ''' return the root node corresponding to the synonym word, or None when the synonym is unknown '''
        print(word)
        normalized = word.upper().replace(' ', '_')
        fdb = FastDataStore()
        word_id = fdb.get(SYN + normalized)
        if not word_id:
            return None
        return fdb.get(ROOT + word_id)

    def _score_meanings(self, meaning_nodes, reference_nodes):
        ''' pair every candidate meaning with its graph distance to <reference_nodes> '''
        return [
            {'node': meaning_node, 'dist': self.graph_distance([meaning_node], reference_nodes)}
            for meaning_node in meaning_nodes
        ]

    def _pick_closest(self, scores):
        ''' among the candidates at minimum distance, return the one with the highest degree '''
        min_dist = min(entry['dist'] for entry in scores)
        closest = [entry['node'] for entry in scores if entry['dist'] == min_dist]
        return max(closest, key=self.get_degree)

    def disambiguate(self, precise, vague, history = None, return_format = neo4j.Node):
        '''
        disambiguate will try to disambiguate the keywords in vague based on the following information in order
        1. the values in precise
        2. if that fails, the interests listed in history
        3. if even that fails, the first candidate meaning
        precise => list of neo4j.Node objects (names are accepted too and looked up)
        vague => list of string objects to be converted into neo4j.Node objects
        history => optional list of dicts carrying an 'interest' name
        return_format => neo4j.Node for node objects, str for node names
        Returns the list of nodes (or names) resolved for the vague keywords;
        keywords with no candidate meaning are left out.
        '''
        new_vague = []
        resolved = []
        for word in vague:
            #try trivial synsets, then a direct lookup by name
            root = self.get_syn_root(word)
            node = self._get_node(root) if root else None
            if not node:
                # also covers a synonym root that has no indexed node
                node = self._get_node(word)
            if node:
                resolved.append(node)
            else:
                new_vague.append(word)
        if not new_vague:
            if return_format == str and resolved:
                resolved = [x['name'] for x in resolved]
            return resolved
        #first make list of precise node objects (by default already there)
        precise_nodes = []
        for p in precise:
            if isinstance(p, neo4j.Node):
                precise_nodes.append(p)
                continue
            n = self._get_node(p)
            if n:
                precise_nodes.append(n)
            else:
                new_vague.append(p)
        # History is only consulted when there is no precise context, so look it
        # up once, and drop interests that have no node in the graph.
        history_nodes = []
        if history and not precise_nodes:
            history_nodes = [n for n in (self._get_node(x['interest']) for x in history) if n]
        #then start the steps of disambiguation
        for word in new_vague:
            #try all possible meanings via disambiguation index
            meaning_nodes = self.get_all_meanings(word)
            if not meaning_nodes:
                # nothing to choose from for this word
                continue
            if precise_nodes:
                print("CHECKING CONTEXT")
                scores = self._score_meanings(meaning_nodes, precise_nodes)
                print([(x['node']['name'], x['dist']) for x in scores])
                node = self._pick_closest(scores)
                print("resolved:  %s" % (node['name'],))
            elif history_nodes:
                print("CHECKING HISTORY")
                node = self._pick_closest(self._score_meanings(meaning_nodes, history_nodes))
            else:
                node = meaning_nodes[0]
            resolved.append(node)
        if return_format == str and resolved:
            resolved = [x['name'] for x in resolved]
        print("resolved: %s" % (resolved,))
        return resolved

    def get_degree(self, node):
        ''' number of nodes matched as neighbours of the 'NODE' index entry named like <node> '''
        data, metadata = cypher.execute(
            self.graphdb,
            'start n=node:%s(name="%s") match n-[]-m return count(m)' % ('NODE', node['name'])
        )
        return data[0][0]

    def get_shortest_path(self, n1, n2):
        '''
        least cost path from n1 to n2. Type of n1, n2 = neo4j.Node
        Returns the path length plus one, or self.INF when the server does not answer 200.
        '''
        res = requests.post(
            self._url(n1._id) + '/path',
            data = json.dumps(self._template(n2._id)),
            auth = self._graph_auth
        )
        # Check the status before parsing: an error response need not carry JSON.
        if res.status_code != requests.codes.ok:
            return self.INF
        return res.json()['length'] + 1

    def graph_distance(self, target_nodes, present_nodes):
        '''
        target_nodes is a vector (lists) of node objects whose cumulative score is to be determined
        present_nodes is a vector of <Node Object>s
        Returns the average of two means: nearest-present distance per target node
        and nearest-target distance per present node. Returns self.INF when either
        list is empty.
        '''
        if not target_nodes or not present_nodes:
            return self.INF
        # One REST call per (target, present) pair; both directions reuse the result.
        dist = [
            [self.get_shortest_path(tnode, pnode) for pnode in present_nodes]
            for tnode in target_nodes
        ]
        # NOTE(review): a node with no reachable counterpart contributes 0 to its
        # mean (behaviour kept as-is), which makes it look close rather than far.
        score_1 = self._mean_nearest(dist)
        score_2 = self._mean_nearest(zip(*dist))
        return (score_1 + score_2) / float(2)

    def _mean_nearest(self, rows):
        ''' mean over <rows> of each row's smallest distance, counting unreachable rows as 0 '''
        total = 0
        count = 0
        for row in rows:
            count += 1
            nearest = min(row)
            if nearest < self.INF:
                total += nearest
        return total / float(count)

    def score_all(self, key, uid, kwlist):
        '''
        score all keywords together
        Returns the inverse of the graph distance between the keywords' nodes and
        the user's stored interests (0.0 when that distance is 0, i.e. nothing
        was reachable).
        '''
        ret, length = self.DataM.get_interests_for_user_for_key(key, uid, interest_types = (SUPPLIED, GENERATED))
        print(ret)
        precise = []
        vague = []
        #precise => list of neo4j.Node objects
        #vague => list of string objects to be converted into a neo4j.Node
        print(kwlist)
        for kw in kwlist:
            if self.graphdb.get_indexed_node(self.DISAMBIGUATION, 'name', kw):
                # ambiguous by definition, even if a node of that name exists
                vague.append(kw)
                continue
            m = self._get_node(kw)
            if m:
                precise.append(m)
            else:
                vague.append(kw)
        print("vague:  %s" % (vague,))
        print("precise:  %s" % (precise,))
        if vague:
            # disambiguate() returns only the nodes resolved from <vague>, so keep
            # the already-precise ones instead of overwriting them.
            precise = precise + self.disambiguate(precise, vague, ret)
        #precise now contains all nodes corresponding to the given keywords
        print("FINAL PRECISE: %s" % (precise,))
        interest_nodes = [n for n in (self._get_node(x['interest']) for x in ret) if n]
        s = self.graph_distance(precise, interest_nodes)
        if not s:
            return 0.0
        return 1 / float(s)

    def get_related(self, word, limit = 10, return_format = str):
        '''
        returns top <limit> nodes most closely related to <word> as sorted
        (name, weight) or (node, weight) tuples depending on return_format;
        None when there are no results or the format is not str / neo4j.Node
        '''
        data, metadata = cypher.execute(
            self.graphdb,
            'start n=node:NODE(name="%s") match n-[r:sibling]-m return m, r.weight order by r.weight desc limit %d' % (word, limit)
        )
        if not data:
            return None
        if return_format == str:
            return sorted([(self._encode_str(d[0]['name']), d[1]) for d in data])
        if return_format == neo4j.Node:
            return sorted([(d[0], d[1]) for d in data], key = lambda x: x[0])
        return None

    def getSemanticDistance(self, a, b):
        ''' shortest path distance between the nodes named <a> and <b>, or -1 when either is unknown '''
        na = self._get_node(a)
        nb = self._get_node(b)
        if na and nb:
            return self.get_shortest_path(na, nb)
        return -1

    def extract_keyphrases(self, data, type = "text"):
        '''
        list of the first element returned by the Parser for <data>;
        type => "text" to parse <data> itself, "url" to parse the text behind it
        (any other value yields None)
        '''
        p = Parser()
        if type == "text":
            return list(p.parseText(data)[0])
        if type == "url":
            return list(p.parseURLText(data)[0])
        return None

    def getURLText(self, url):
        ''' text of <url> as returned by Parser.getURLText '''
        return Parser().getURLText(url)