/
prep_node.py
127 lines (90 loc) · 3.23 KB
/
prep_node.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import os,sys
import zipfile
import extract_refs, hashish, get_arxiv_meta, sub_mapping
from py2neo import authenticate, Graph, Node, Relationship
_INITIAL_Q_SCORE = 0.01  # q_score given to every Paper node created below


def _as_text(value, fallback=None):
    """Return ``value`` as text, decoding it from UTF-8 if it is a byte string.

    Values that are not byte strings are returned unchanged. When UTF-8
    decoding fails and ``fallback`` names an encoding, that encoding is tried
    instead; without a fallback the ``UnicodeDecodeError`` propagates.
    """
    if not isinstance(value, bytes):
        return value
    try:
        return value.decode('utf-8')
    except UnicodeDecodeError:
        if fallback is None:
            raise
        return value.decode(fallback)


def _find_or_create(graph, label, key, value, **properties):
    """Look up a ``label`` node by ``key == value``; create it when missing.

    Returns a ``(node, created)`` pair, where ``created`` is True only when a
    new node built from ``properties`` had to be written to the graph.
    """
    node = graph.find_one(label, key, value)
    if node:
        return node, False
    node = Node(label, **properties)
    graph.create(node)
    return node, True


def _read_matching(folder, suffix, first_only=False):
    """Read the files in ``folder`` whose lower-cased name ends with ``suffix``.

    Returns ``(found, text)``. ``found`` is True as soon as one matching file
    name is seen, even if reading it fails. ``text`` is every successfully
    read file's content, each preceded by a newline. With ``first_only`` the
    scan stops after the first matching file.
    """
    found = False
    chunks = []
    for name in os.listdir(folder):
        if not name.lower().endswith(suffix):
            continue
        found = True
        with open(os.path.join(folder, name)) as handle:
            try:
                chunks.append('\n' + handle.read())
            except (IOError, OSError, UnicodeDecodeError):
                # Best effort: an unreadable file contributes no text.
                pass
        if first_only:
            break
    return found, ''.join(chunks)


def prep_node(graph, f3, in2, ppr_id, meta_res, update = False):
    """Store a paper, its authors, categories and references in the graph.

    The paper is looked up by the hash of its compressed title and marked
    ``complete = "T"`` (created if missing). Author and Category nodes are
    found or created and linked to it. The paper's archive is then unpacked
    (when needed) and its ``.bbl`` file, or failing that its ``.tex`` files,
    are scanned for references, each of which becomes a ``Refers`` link.

    Args:
        graph: py2neo graph handle; ``find_one``, ``create`` and ``match``
            are called on it.
        f3: File name of the zip archive inside ``in2``.
        in2: Directory that holds the archive.
        ppr_id: Name of the sub-directory of ``in2`` the archive is
            extracted into and read from.
        meta_res: Mapping with keys ``'tit'`` (title), ``'aut'`` (iterable
            of author names) and ``'cat'`` (iterable of category ids).
            Names and ids may be byte strings or text.
        update: When true, the archive is always extracted; otherwise it is
            extracted only if ``in2`` contains exactly one entry.

    Returns:
        A list whose first element is the paper node's ``q_score`` and whose
        remaining elements are the Paper nodes that were newly linked to it
        by a ``Refers`` relationship during this call.
    """
    # NOTE(review): the nesting of the loops below was reconstructed from a
    # copy of this file whose indentation had been lost -- confirm it against
    # the repository version.

    # Node creation
    if update:
        item_count = 1
    else:
        item_count = len(os.listdir(in2))

    c_title = hashish.compress(meta_res['tit'])
    paper_id = hashish.get_hash(c_title)
    rp, created = _find_or_create(
        graph, "Paper", "id", paper_id,
        name=c_title, id=paper_id, title=meta_res['tit'],
        q_score=_INITIAL_Q_SCORE, complete="T")
    if not created:
        # The paper was already in the graph: flag it as complete.
        rp['complete'] = "T"
        rp.push()

    for raw_author in meta_res['aut']:
        # Author names that are not valid UTF-8 are decoded as Latin-1.
        author = _as_text(raw_author, fallback='latin-1')
        aut, _ = _find_or_create(graph, "Author", "name", author, name=author)
        graph.create(Relationship(aut, "Published", rp))

    for raw_category in meta_res['cat']:
        category = _as_text(raw_category)
        subject_name = sub_mapping.map_id_to_name(category)
        if not subject_name:
            # Categories without a subject mapping are skipped.
            continue
        cat, _ = _find_or_create(
            graph, "Category", "name", category,
            name=category, subject=subject_name)
        graph.create(Relationship(rp, "BelongsTo", cat))

    # Extract References
    in3 = os.path.join(in2, ppr_id)
    if item_count == 1:
        with zipfile.ZipFile(os.path.join(in2, f3), "r") as zeep:
            # With update=True the target directory may already exist.
            if not os.path.isdir(in3):
                os.makedirs(in3)
            zeep.extractall(in3)

    # Use the first .bbl file; fall back to all .tex files only if there is none.
    available, text = _read_matching(in3, ".bbl", first_only=True)
    if not available:
        available, text = _read_matching(in3, ".tex")

    # Create Reference Relationship
    refs = extract_refs.lets_hit_it(text) if available else []
    ref_list = [rp['q_score']]
    for ref in refs:
        comp_title = hashish.compress(ref)
        if not comp_title:
            continue
        h_title = hashish.get_hash(comp_title)
        ppr, _ = _find_or_create(
            graph, "Paper", "id", h_title,
            name=comp_title, id=h_title, title=ref.strip(),
            q_score=_INITIAL_Q_SCORE, complete="F")
        # Skip self-references first so they cost no query, then skip
        # references that are already linked.
        if rp != ppr and not list(graph.match(start_node=rp, end_node=ppr, rel_type="Refers")):
            graph.create(Relationship(rp, "Refers", ppr))
            ref_list.append(ppr)
    return ref_list