-
Notifications
You must be signed in to change notification settings - Fork 1
/
rank_dbpedia_properties.py
501 lines (393 loc) · 15.3 KB
/
rank_dbpedia_properties.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
from rdflib import *
from SPARQLWrapper import *
import sys
import json
import re
import shlex
import subprocess
import urllib2
# Handle to the interpreter's stdout as it was when this module was loaded.
original_out = sys.stdout

# Per-property counter of accepted values, keyed by property URI.
prop_val_count = {}

# Per-property record of ranking features and collected values, keyed by
# property URI.
ans_dict = {}

# Keys that every per-property record is created with.
parameter_list = [
    'value',
    'total_values',
    'frequency',
    'blacklisted',
    'is_onto',
    'special_char',
    'no_of_words',
    'has_range',
    'has_comment',
    'value_relevant',
    'google_keypress',
    'google_location',
    'special_datatype',
    'is_of_relation',
    'score',
    'label',
]
def get_label(prop):
    '''
    Convert the last path segment of a property URI from camelCase into
    space-separated words.
    Example: "http://dbpedia.org/ontology/wikiPageID" is returned as
    "wiki Page ID".
    '''
    local_name = prop.rsplit('/', 1)[-1]
    # A space goes in front of an upper-case letter that follows a lower-case
    # letter, or that precedes a lower-case letter and is not the first
    # character of the name.
    word_boundary = re.compile(
        r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))')
    return word_boundary.sub(r' \1', local_name)
def get_resource_type(results):
    '''
    Pick the DBpedia ontology class of a resource out of its SPARQL bindings.
    A binding counts when its property contains "#type" and its value contains
    "dbpedia.org/ontology"; if several bindings qualify, the last one wins.
    Falls back to "http://www.w3.org/2002/07/owl#Thing" when none qualifies.
    Example: for the resource "Iron_Maiden" this may yield
    "http://dbpedia.org/ontology/Organisation".
    '''
    ontology_types = []
    for binding in results["results"]["bindings"]:
        predicate = binding["prop"]["value"]
        obj = binding["value"]["value"]
        if "#type" in predicate and "dbpedia.org/ontology" in obj:
            ontology_types.append(obj)
    if ontology_types:
        return ontology_types[-1]
    return "http://www.w3.org/2002/07/owl#Thing"
def blacklisted(prop):
    '''
    Tell whether a property label is on the blacklist.
    Returns 1 when the label is shorter than three characters or contains any
    of " id", "PrimaryTopicOf", "wiki", "image", "same As" or "Photo";
    returns 0 otherwise.
    '''
    if len(prop) < 3:
        return 1
    markers = (" id", "PrimaryTopicOf", "wiki", "image", "same As", "Photo")
    for marker in markers:
        if marker in prop:
            return 1
    return 0
def is_onto(prop):
    '''
    Returns 1 when the property URI contains the text "ontology", else 0.
    '''
    return 1 if "ontology" in prop else 0
def doesnt_contain_special_chars(prop):
    '''
    Check that a property label is made up of letters and whitespace only.
    Returns 1 when every character is a-z, A-Z or whitespace (an empty label
    also qualifies) and 0 as soon as any other character is present.
    '''
    # Raw string literal: "\s" is not a valid escape sequence in a plain
    # string and newer interpreters warn about it.
    letters_and_spaces = re.compile(r"^[a-zA-Z\s]*$")
    return 1 if letters_and_spaces.match(prop) else 0
def no_of_words(prop):
    '''
    Returns the inverse of the number of words in the property label, where
    the word count is taken as the number of spaces plus one.
    '''
    word_count = prop.count(' ') + 1
    return 1.0 / float(word_count)
def prop_has_range_or_comment(prop_value):
    '''
    Ask the DBpedia SPARQL endpoint which predicates are attached to a
    property and report whether any of them mentions "range" or "comment".
    prop_value is a dict whose 'prop' entry holds the property URI.
    Returns a tuple (has_range, has_comment) of 0/1 flags.
    '''
    property_uri = prop_value['prop']
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setQuery(
        "\nselect distinct ?prop ?value where {\n<"
        + property_uri
        + "> ?prop ?value }\n")
    endpoint.setReturnFormat(JSON)
    bindings = endpoint.query().convert()["results"]["bindings"]
    predicates = [binding["prop"]["value"] for binding in bindings]
    has_range = int(any("range" in predicate for predicate in predicates))
    has_comment = int(any("comment" in predicate for predicate in predicates))
    return (has_range, has_comment)
def value_relevant(prop_value):
    '''
    Inspect the 'value' entry of prop_value for signs that it points at a file
    or image. Returns 0 when the value contains "File", "wordnet", ".ogg",
    ".svg", ".jpg" or ".png", and 1 otherwise.
    '''
    value = prop_value['value']
    file_markers = ("File", "wordnet", ".ogg", ".svg", ".jpg", ".png")
    return 0 if any(marker in value for marker in file_markers) else 1
def google_autocomplete_ranker(resource, prop):
    '''
    Measure how readily Google auto-complete suggests a property for a
    resource.

    The query "<resource>_<prefix>" is requested with the first 0, 1, ... up to
    4 characters of the lower-cased property label (fewer when the label is
    shorter), spaces replaced by underscores. The search stops at the first
    response in which some suggestion contains the whole label.

    Returns a tuple (keypress_score, location_score):
    keypress_score is 1.0 / (characters typed + 1); when no query succeeds the
    count used is one more than the longest prefix tried.
    location_score is 1.0 / (position + 1), position being the 1-based rank of
    the matching suggestion in that response, or 0 when nothing matched.
    '''
    resource = resource.replace(" ", "_")
    prop = prop.lower()
    # base_url to request
    request_url = 'http://google.com/complete/search?client=chrome&q='
    longest_prefix = min(len(prop), 4)

    google_keypresses = 0
    match_slot = 0  # stays 0 until a suggestion contains the label
    while google_keypresses <= longest_prefix:
        typed = prop[:google_keypresses].replace(" ", "_")
        response = urllib2.urlopen(request_url + resource + '_' + typed)
        try:
            content = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        # Undecodable bytes are replaced instead of aborting the lookup.
        content = unicode(content, errors='replace')
        # The suggestions are the second element of the response list.
        suggestions = json.loads(content)[1]
        # Slots are numbered from 2 so the first suggestion scores 1/2.
        for slot, suggestion in enumerate(suggestions, 2):
            if prop in suggestion:
                match_slot = slot
                break
        if match_slot:
            break
        google_keypresses += 1

    suggest_location = 1.0 / match_slot if match_slot else 0
    return (1.0 / (google_keypresses + 1), suggest_location)
def handle_is_of_relations(resource, resource_type, total_pages):
    '''
    Collect the properties for which the resource is the object ("is ... of"
    relations) and record their ranking features in ans_dict.

    resource is the name appended to "http://dbpedia.org/resource/",
    resource_type is the type URI handed to count_freq, and total_pages is the
    number of resources of that type, used as the frequency denominator.

    Blacklisted labels and properties whose URI contains none of "ontology",
    "property" or "subject" are skipped. A property already present in
    ans_dict only gets the new value appended, its counter bumped and its
    is_of_relation flag set. For a new property, values tagged with a
    non-English language are ignored; otherwise a full feature record is
    built and stored. Nothing is returned.
    '''
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setQuery(
        "\nselect ?prop ?value where {\n"
        + "?value ?prop <http://dbpedia.org/resource/" + resource + "> }\n")
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    for result in results["results"]["bindings"]:
        prop = result["prop"]["value"]
        value = result["value"]["value"]
        cleaned_property_label = get_label(prop)
        if blacklisted(cleaned_property_label):
            continue
        if "ontology" not in prop and "property" not in prop and "subject" not in prop:
            continue

        if prop in ans_dict:
            known = ans_dict[prop]
            known.setdefault('value', []).append(value)
            prop_val_count[prop] += 1
            known['is_of_relation'] = 1
            continue

        # Only untagged or English values start a new record.
        binding_value = result["value"]
        if "xml:lang" in binding_value and 'en' not in binding_value["xml:lang"]:
            continue

        prop_val_count[prop] = prop_val_count.get(prop, 0) + 1
        prop_value = {'prop': prop, 'value': value}

        google_autosuggest = google_autocomplete_ranker(
            resource, cleaned_property_label)
        range_comment = prop_has_range_or_comment(prop_value)

        prop_info = dict.fromkeys(parameter_list, 0)
        prop_info['score'] = 0
        prop_info['value'] = [value]
        prop_info['label'] = cleaned_property_label
        # The blacklist feature is fixed at 0 for every stored record.
        prop_info['blacklisted'] = 0
        prop_info['is_onto'] = is_onto(prop)
        prop_info['special_char'] = doesnt_contain_special_chars(
            cleaned_property_label)
        prop_info['no_of_words'] = no_of_words(cleaned_property_label)
        prop_info['has_range'] = range_comment[0]
        prop_info['has_comment'] = range_comment[1]
        prop_info['value_relevant'] = value_relevant(prop_value)
        prop_info['special_datatype'] = is_special_datatype(result)
        prop_info['google_keypress'] = google_autosuggest[0]
        prop_info['google_location'] = google_autosuggest[1]
        prop_info['is_of_relation'] = 1
        # A type with no counted pages gives frequency 0 instead of a
        # division by zero.
        if total_pages:
            prop_info['frequency'] = count_freq(
                resource_type, prop) / float(total_pages)
        else:
            prop_info['frequency'] = 0.0
        ans_dict[prop] = prop_info
def is_special_datatype(result):
    '''
    Check whether the value of a SPARQL binding carries a special data type.
    Returns 1 when result['value'] has a "datatype" entry containing "date",
    and 0 in every other case.
    TODO: Determine more special relevant data types.
    '''
    binding_value = result['value']
    if "datatype" not in binding_value:
        return 0
    return 1 if 'date' in binding_value['datatype'] else 0
def count_freq(resource_type, prop):
    '''
    Ask the DBpedia SPARQL endpoint how many distinct entities of the given
    resource type have the given property.
    Returns the count as an int, or 0 when the response has no bindings.
    '''
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setQuery(
        "\nSELECT COUNT(DISTINCT ?entity)\n"
        + "WHERE { ?entity <" + prop + "> ?value.\n"
        + "?entity <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <"
        + resource_type + "> }\n")
    endpoint.setReturnFormat(JSON)
    bindings = endpoint.query().convert()["results"]["bindings"]
    # The unnamed COUNT column is read back under the key "callret-0".
    counts = [int(binding["callret-0"]["value"]) for binding in bindings]
    return counts[-1] if counts else 0
def total_pages_for_type(resource_type):
    '''
    Ask the DBpedia SPARQL endpoint how many distinct entities of the given
    resource type carry a http://dbpedia.org/ontology/wikiPageID value.
    Returns the count as an int, or 0 when the response has no bindings.
    '''
    query_text = (
        "\nSELECT COUNT(DISTINCT ?entity)\n"
        + "WHERE { ?entity <http://dbpedia.org/ontology/wikiPageID> ?value.\n"
        + "?entity <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <"
        + resource_type + "> }\n")
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setQuery(query_text)
    endpoint.setReturnFormat(JSON)
    response = endpoint.query().convert()
    total = 0
    # The unnamed COUNT column is read back under the key "callret-0".
    for binding in response["results"]["bindings"]:
        total = int(binding["callret-0"]["value"])
    return total
# Lower-cased, space-stripped labels of the properties accepted so far; a new
# property whose normalized label is already listed here is skipped.
normalized_labels = []
def start(resource):
    '''
    Fetch every property/value pair of a DBpedia resource and fill the
    module-level ans_dict and prop_val_count with one feature record per
    property.

    resource is the name appended to "http://dbpedia.org/resource/".

    Properties whose URI contains none of "ontology", "property" or "subject"
    are skipped, as are values tagged with a non-English language. The first
    accepted value of a property creates its record (unless another property
    with the same normalized label was already accepted); later values are
    appended to that record. Finally each record gets
    total_values = 1 - 1/number_of_values and a score of 0.
    Nothing is returned.
    '''
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setQuery(
        "\nselect distinct ?prop ?value\nwhere {\n"
        + "<http://dbpedia.org/resource/" + resource + "> ?prop ?value }\n")
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    resource_type = get_resource_type(results)
    total_pages = total_pages_for_type(resource_type)

    for result in results["results"]["bindings"]:
        prop = result["prop"]["value"]
        value = result["value"]["value"]
        if "ontology" not in prop and "property" not in prop and "subject" not in prop:
            continue
        # Only untagged or English values are considered.
        binding_value = result["value"]
        if "xml:lang" in binding_value and 'en' not in binding_value["xml:lang"]:
            continue

        cleaned_property_label = get_label(prop)
        if prop in prop_val_count:
            prop_val_count[prop] += 1
        else:
            # Drop a property whose label collides with one already accepted.
            normalized_label = cleaned_property_label.lower().replace(' ', '')
            if normalized_label in normalized_labels:
                continue
            normalized_labels.append(normalized_label)
            prop_val_count[prop] = 1

        if prop in ans_dict:
            ans_dict[prop].setdefault('value', []).append(value)
            continue

        prop_value = {'prop': prop, 'value': value}
        google_autosuggest = google_autocomplete_ranker(
            resource, cleaned_property_label)
        range_comment = prop_has_range_or_comment(prop_value)

        prop_info = dict.fromkeys(parameter_list, 0)
        prop_info['score'] = 0
        prop_info['value'] = [value]
        prop_info['label'] = cleaned_property_label
        # The blacklist feature is fixed at 0 for every stored record.
        prop_info['blacklisted'] = 0
        prop_info['is_onto'] = is_onto(prop)
        prop_info['special_char'] = doesnt_contain_special_chars(
            cleaned_property_label)
        prop_info['no_of_words'] = no_of_words(cleaned_property_label)
        prop_info['has_range'] = range_comment[0]
        prop_info['has_comment'] = range_comment[1]
        prop_info['value_relevant'] = value_relevant(prop_value)
        prop_info['special_datatype'] = is_special_datatype(result)
        prop_info['google_keypress'] = google_autosuggest[0]
        prop_info['google_location'] = google_autosuggest[1]
        prop_info['is_of_relation'] = 0
        # A type with no counted pages gives frequency 0 instead of a
        # division by zero.
        if total_pages:
            prop_info['frequency'] = count_freq(
                resource_type, prop) / float(total_pages)
        else:
            prop_info['frequency'] = 0.0
        ans_dict[prop] = prop_info

    # "is ... of" relations are not collected at present:
    # handle_is_of_relations(resource, resource_type, total_pages)

    for prop, count in prop_val_count.items():
        ans_dict[prop]['total_values'] = (1.0 - 1.0 / count)
        ans_dict[prop]['score'] = 0
# Feature names in the order they are numbered (1..13) in the RankLib input.
_FEATURE_ORDER = (
    'frequency',
    'is_of_relation',
    'is_onto',
    'has_range',
    'has_comment',
    'total_values',
    'google_keypress',
    'google_location',
    'value_relevant',
    'blacklisted',
    'special_char',
    'no_of_words',
    'special_datatype',
)

# A signed decimal or integer with an optional exponent, e.g. "-0.25", "3"
# or "1.5E-4".
_SCORE_PATTERN = re.compile(r"[-+]?(?:\d*\.\d+|\d+)(?:[eE][-+]?\d+)?")


def _feature_line(prop, info):
    '''Format one property record as a "score qid:1 1:.. 13:.. # prop" line.'''
    parts = [str(info['score']), "qid:1"]
    for number, name in enumerate(_FEATURE_ORDER, 1):
        parts.append(str(number) + ":" + str(info[name]))
    return " ".join(parts) + " # " + prop


def rank_properties(resource):
    '''
    Rank the properties of a DBpedia resource and return the ranking as JSON.

    The module-level state is reset, start(resource) gathers the feature
    records, and the features are written to "<resource>_test.txt". RankLib
    (RankLib-2.1-patched.jar with the model new_model.txt) is then run to write
    "<resource>_score.txt", whose n-th non-blank line is taken as the score of
    the n-th feature line. Properties are ordered by descending numeric score,
    ties broken by descending property URI.

    Returns a JSON string (indent 4) of the form
    {"total": "<n>", "error": "0", "resources": [{"rank", "property", "score",
    "label", "is_of", "values": [{"value": ...}, ...]}, ...]} in which rank,
    score and is_of are strings.
    '''
    # Reset state left over from a previous call.
    prop_val_count.clear()
    ans_dict.clear()
    del normalized_labels[:]

    start(resource)

    prop_file = resource + '_test.txt'
    score_file = resource + '_score.txt'

    # Fix the property order once so feature lines and scores line up.
    ordered_props = list(prop_val_count)
    # The file is closed, and therefore flushed, before RankLib reads it.
    with open(prop_file, 'w') as feature_file:
        for prop in ordered_props:
            feature_file.write(_feature_line(prop, ans_dict[prop]) + "\n")

    # Arguments are passed as a list so file names need no shell quoting.
    subprocess.call([
        "java", "-jar", "RankLib-2.1-patched.jar",
        "-load", "new_model.txt",
        "-rank", prop_file,
        "-score", score_file,
    ])

    with open(score_file) as scores:
        # The score is the last number on each line.
        score_texts = [_SCORE_PATTERN.findall(line)[-1]
                       for line in scores if line.strip()]

    # A property without a score line keeps the default score 0.
    ans_list = [[0, prop] for prop in ordered_props]
    for entry, score_text in zip(ans_list, score_texts):
        entry[0] = score_text
    # Compare scores as numbers; as text "9.5" would sort above "10.2".
    ans_list.sort(key=lambda entry: (float(entry[0]), entry[1]), reverse=True)

    resources = []
    for rank, (score, prop) in enumerate(ans_list, 1):
        info = ans_dict[prop]
        resources.append({
            "rank": str(rank),
            "property": prop,
            "score": str(score),
            "label": info['label'],
            "is_of": str(info['is_of_relation']),
            "values": [{"value": v} for v in info['value']],
        })

    # Serialising the structure directly keeps quotes and backslashes in the
    # values intact and also works when there is nothing to rank.
    payload = {
        "total": str(len(ans_list)),
        "error": "0",
        "resources": resources,
    }
    return json.dumps(payload, indent=4)
# Script entry: rank the properties of the "Facebook" resource and print the
# resulting JSON.
print(rank_properties("Facebook"))