query_ontology.py
import re
from collections import OrderedDict

import pandas
from langdetect import detect
from nltk.tokenize import word_tokenize
from py2neo import Graph
from SPARQLWrapper import SPARQLWrapper, JSON

def tokenize_topics(topics):
    # strip term weights such as "0.045*" and surrounding punctuation,
    # then tokenize and remove duplicates
    topics = re.sub(r"([0-9]*\.[0-9]*)\*", "", topics)
    topics = re.sub(r"[0-9], u|\"|'|\+|,|\[|\(|\]|\)", "", topics)
    tokens_list = word_tokenize(topics)
    return list(set(tokens_list))
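
# Illustrative example (the input format is an assumption; it resembles a weighted
# topic string as produced e.g. by gensim's LdaModel.show_topics()):
#   tokenize_topics('0.045*"python" + 0.032*"graph database"')
#   -> ['python', 'graph', 'database']   (order may vary, duplicates removed)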

def query_wikidata_service(searchterm, language_code):
    # Look up one entity on Wikidata via the EntitySearch mwapi service and return its
    # subclass-of (P279), topic's-main-category (P910) and instance-of (P31) statements.
    # Note: searchterm and language_code are interpolated directly into the query string,
    # so quotes in the search term will break the query.
    query = """SELECT ?item ?itemLabel ?subclass_of ?subclass_ofLabel ?category_of ?category_ofLabel
    ?instance_of ?instance_ofLabel WHERE { SERVICE wikibase:mwapi { bd:serviceParam wikibase:api "EntitySearch" .
    bd:serviceParam wikibase:endpoint "www.wikidata.org" . bd:serviceParam mwapi:search '""" + searchterm + """' .
    bd:serviceParam mwapi:language '""" + language_code + """' . bd:serviceParam wikibase:limit 1 .
    ?item wikibase:apiOutputItem mwapi:item .} SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    OPTIONAL { ?item (wdt:P279) ?subclass_of.} OPTIONAL { ?item (wdt:P910) ?category_of.}
    OPTIONAL { ?item (wdt:P31) ?instance_of.}}"""
    url = "https://query.wikidata.org/sparql"
    sparql = SPARQLWrapper(url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()
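
# Minimal usage sketch (the names below are illustrative, not part of this script):
#   resp = query_wikidata_service("machine learning", "en")
#   df = pandas.json_normalize(resp["results"]["bindings"])
# The SPARQL JSON bindings are flattened into columns such as 'item.value',
# 'subclass_of.value' and 'subclass_ofLabel.value', which is the layout that
# add_and_link_ontology_nodes() below relies on.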

def add_and_link_ontology_nodes(df, skill):
    # Build Cypher statements that upsert the Skill node, the related Ontology_Item
    # nodes and the SUBCLASS_OF / CATEGORY_OF / INSTANCE_OF relationships.
    cypher_list = []
    keys = df.keys()
    for key in keys:
        switch_case = lookup_dict.get(key)
        for b in range(len(df)):
            cypher_list.append("""MERGE (s:Skill {name: '""" + skill + """'}) ON MATCH SET
                s.wikidata_url = '""" + str(df['item.value'][b]) + """' RETURN s""")
            if switch_case == 1:
                cypher_list.append("""MERGE (o:Ontology_Item {wikidata_url: '""" + str(df['subclass_of.value'][b]) + """',
                    name: '""" + str(df['subclass_ofLabel.value'][b]) + """'})""")
                cypher_list.append("""MATCH (s:Skill), (o:Ontology_Item) WHERE s.wikidata_url = '""" + str(df['item.value'][b]) + """' AND
                    o.wikidata_url = '""" + str(df['subclass_of.value'][b]) + """' MERGE (s)-[rel:SUBCLASS_OF]->(o)""")
            if switch_case == 2:
                cypher_list.append("""MERGE (o:Ontology_Item {wikidata_url: '""" + str(df['category_of.value'][b]) + """',
                    name: '""" + str(df['category_ofLabel.value'][b]) + """'})""")
                cypher_list.append("""MATCH (s:Skill), (o:Ontology_Item) WHERE s.wikidata_url = '""" + str(df['item.value'][b]) + """' AND
                    o.wikidata_url = '""" + str(df['category_of.value'][b]) + """' MERGE (s)-[rel:CATEGORY_OF]->(o)""")
            if switch_case == 3:
                cypher_list.append("""MERGE (o:Ontology_Item {wikidata_url: '""" + str(df['instance_of.value'][b]) + """',
                    name: '""" + str(df['instance_ofLabel.value'][b]) + """'})""")
                cypher_list.append("""MATCH (s:Skill), (o:Ontology_Item) WHERE s.wikidata_url = '""" + str(df['item.value'][b]) + """' AND
                    o.wikidata_url = '""" + str(df['instance_of.value'][b]) + """' MERGE (s)-[rel:INSTANCE_OF]->(o)""")
    # de-duplicate the statements while preserving insertion order
    return OrderedDict((x, True) for x in cypher_list).keys()


# connect to the local Neo4j instance
graph = Graph("http://127.0.0.1:7474", auth=("neo4j", "root"))

# lookup dict used by add_and_link_ontology_nodes to decide which relationship a
# result column maps to; the values must line up with the switch_case checks above
# (1 = subclass_of, 2 = category_of, 3 = instance_of, 0 = Skill merge only)
lookup_dict = {
    "item.value": 0,
    "itemLabel.value": 0,
    "subclass_of.value": 1,
    "subclass_ofLabel.value": 1,
    "category_of.value": 2,
    "category_ofLabel.value": 2,
    "instance_of.value": 3,
    "instance_ofLabel.value": 3
}
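
# slides_df is assumed to be defined before this point; it is not created in this file.
# A purely illustrative sketch of the expected shape -- a DataFrame whose column at
# position 5 holds the topic/term string per consulting profile and whose row index
# lines up with Consulting_Profile.uuid:
#
#   slides_df = pandas.read_csv("slides_topics.csv")   # hypothetical file name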
# instead of compute-intensive pandas operations, create the nodes and relationships directly
for i in range(len(slides_df)):
    tokens_list = tokenize_topics(slides_df.iloc[i, 5])
    # detect the language to annotate in the SPARQL query, because [AUTO_LANGUAGE] has performance issues
    language_code = detect(str(tokens_list))
    for token in tokens_list:
        try:
            cypher_list = ["""MERGE (s:Skill {name: '""" + token + """'})""",
                           """MATCH (p:Consulting_Profile), (s:Skill) WHERE
                           p.uuid = '""" + str(i) + """' AND s.name = '""" + token + """' MERGE (p)-[rel:HAS_SKILL]->(s)"""]
            for stmnt in cypher_list:
                graph.run(stmnt)
            service_resp = query_wikidata_service(token, language_code)
            # flatten the SPARQL JSON bindings into columns such as 'item.value'
            df = pandas.json_normalize(service_resp['results']['bindings'])
            if len(df) > 0:
                onto_items_list = add_and_link_ontology_nodes(df, token)
                for onto_item in onto_items_list:
                    graph.run(onto_item)
        except Exception:
            # skip tokens that break the Cypher/SPARQL string building or fail against Wikidata
            pass