-
Notifications
You must be signed in to change notification settings - Fork 0
/
edge_weight.py
549 lines (422 loc) · 19 KB
/
edge_weight.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
#Copyright (C) Systems & Technology Research
#Use of this software is subject to the restrictions in license.txt
#Distribution A: Approved for Public Release, Distribution Unlimited
import json
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import numpy as np
import os
from Bio import Entrez
from Bio.Entrez import efetch, read
from Bio import Medline
import networkx as nx
import h_index_finder.h_index_finder as h_index_finder
import holoviews as hv
from holoviews import opts
import matplotlib.pyplot as plt
import datashader as ds
import datashader.transfer_functions as tf
from datashader.layout import random_layout, circular_layout, forceatlas2_layout
from datashader.bundling import connect_edges, hammer_bundle
from fa2 import ForceAtlas2
import seaborn as sns
#from sqlalchemy import create_engine
hv.extension('bokeh')
from bokeh.models import HoverTool
# Global Variables
# All of these are (re)initialised by calculate_edge_weights() and then read
# by the other functions in this module through `global` declarations.

# Maps a node id to the node's 'name' field from the subgraph JSON.
id_2_name_dict = None
# Maps a node id to the node's 'category' field from the subgraph JSON.
id_2_category_dict = None
# List of the distinct PMID strings found in the edges' 'edgetype' fields.
all_pmids = None
# The parsed subgraph JSON (a dict with 'nodes' and 'links').
json_data = None
# Maps a PMID to the metadata dict built by collect_NCBI()
# (authors, affiliations, grants, pub_date, citations, ...).
pmid_dict = None
# Name of the relationship / JSON file; used as the output folder and file prefix.
rel_name = None
# Optional API key; forwarded to h_index_finder.find_h_index().
api_key = None
# Boltzmann (logistic sigmoid) function
def boltzman(x, xmid, tau):
    """Evaluate the Boltzmann (logistic sigmoid) curve at ``x``.

    Args:
        x: Value(s) at which the curve is evaluated.
        xmid: Midpoint of the curve; the result is 0.5 when ``x == xmid``.
        tau: Divisor applied to ``x - xmid`` inside the exponential.

    Returns:
        ``1 / (1 + exp(-(x - xmid) / tau))``.
    """
    exponent = -(x - xmid) / tau
    denominator = 1. + np.exp(exponent)
    return 1. / denominator
# Collect the information required from the NCBI database to calculate the different kinds of edge weights
'''
The following information is collected:
Author names
Authors' affiliations
Publication date of the paper (pmid/pmcid)
All pmids citing this paper
Grants used to publish this paper
Note: If a pmid refers to a pmcid, use the pmcid to collect the information since it has more details (e.g., full author
names rather than first initial and last name)
'''
def _pmc_entry(pmid, record):
    """Build the pmid_dict entry via the record's PMC id; raise if anything is missing."""
    # Strip the leading 'PMC' prefix; raises KeyError when the record has no PMC field.
    id2 = record['PMC'][3:]
    # NOTE(review): the PMC number is looked up in db="pubmed", exactly as the
    # original code did. A PMC number is presumably not a PMID, so this may
    # fetch an unrelated article -- confirm the intended database.
    fetch_records_handle2 = efetch(db="pubmed", id=str(id2),
                                   rettype="medline", retmode="text")
    entry = None
    try:
        for record2 in Medline.parse(fetch_records_handle2):
            # Read every required field first so that a missing field aborts
            # before the (network) citation lookup is made.
            authors = record2['FAU']
            affiliations = record2['AD']
            pub_date = record2['DCOM']
            grants = record2['GR']
            citations = get_links_id(pmid)
            entry = {'pmcid_number': id2, 'pmcid': True, 'authors': authors,
                     'affiliations': affiliations, 'grants': grants,
                     'pub_date': pub_date, 'citations': citations}
    finally:
        fetch_records_handle2.close()
    if entry is None:
        # An empty response would otherwise leave the pmid without any entry.
        raise LookupError(f'no record returned for PMC id {id2}')
    return entry


def _pubmed_only_entry(pmid, record):
    """Build the pmid_dict entry from the PubMed record alone, blanking missing fields."""
    try:
        citations = get_links_id(pmid)
    except Exception:
        # Best effort: get_links_id raises when no link set comes back (and on
        # request errors); an empty value is stored, as before.
        citations = ''
    return {'pmcid_number': '', 'pmcid': False,
            'authors': record.get('FAU', []),
            'affiliations': record.get('AD', ''),
            'grants': record.get('GR', ''),
            'pub_date': record.get('DCOM', ''),
            'citations': citations}


def collect_NCBI():
    """Fill the global ``pmid_dict`` with NCBI metadata for every PMID in ``all_pmids``.

    If ``./{rel_name}/{rel_name}_pmid_dict.json`` already exists it is loaded
    into ``pmid_dict`` and no request is made. Otherwise each PMID is fetched
    from PubMed in MEDLINE format. When the record carries a PMC id and the
    follow-up lookup yields authors, affiliations, completion date, grants and
    citations, those are stored with ``'pmcid': True``; on any failure the
    entry is built from the PubMed record itself with missing fields stored as
    ``''`` (``[]`` for authors) and ``'pmcid': False``. The result is written
    to the JSON cache file.

    Returns:
        The global ``pmid_dict`` (PMID -> metadata dict).
    """
    global all_pmids
    global pmid_dict

    cache_path = f'./{rel_name}/{rel_name}_pmid_dict.json'
    if os.path.exists(cache_path):
        with open(cache_path, 'r') as f:
            pmid_dict.update(json.loads(f.read()))
        return pmid_dict

    for pmid in tqdm(all_pmids):
        # get records for each pmid
        fetch_records_handle1 = efetch(db="pubmed", id=str(pmid),
                                       rettype="medline", retmode="text")
        try:
            # Medline.parse is lazy, so the handle must stay open while iterating.
            for record1 in Medline.parse(fetch_records_handle1):
                try:
                    entry = _pmc_entry(pmid, record1)
                except Exception:
                    # Deliberate best-effort fallback (no PMC id, missing
                    # field, failed lookup). `Exception` rather than a bare
                    # `except` so Ctrl-C still interrupts the download loop.
                    entry = _pubmed_only_entry(pmid, record1)
                pmid_dict[pmid] = entry
        finally:
            fetch_records_handle1.close()

    with open(cache_path, 'w') as output:
        output.write(json.dumps(pmid_dict))
    return pmid_dict
# Let's get citations from pmid
'''
This function gets all pmids citing a given pmid
'''
def get_links_id(pmid):
    """Return the ids linked to ``pmid`` through the ``pubmed_pmc_refs`` link name.

    Args:
        pmid: PubMed id to query with ``Entrez.elink``.

    Returns:
        A list with the ``'Id'`` value of every link in the first link set of
        the response.

    Raises:
        IndexError: when the response contains no link set (presumably because
            nothing cites the paper). Callers in this module catch this and
            store an empty citation value, so it is intentionally not handled
            here.
    """
    links = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pmc_refs")
    try:
        record = Entrez.read(links)
    finally:
        # Release the HTTP handle; this is called once per PMID.
        links.close()
    return [link[u'Id'] for link in record[0][u'LinkSetDb'][0][u'Link']]
# Edge Weight Based on Citations
'''
Get all citation information from pmid_dict
'''
def get_citation_edge_weights():
    """Compute an edge weight from the citation counts of the edge's PMIDs.

    For every link in ``json_data['links']`` the citation counts (length of
    ``pmid_dict[pmid]['citations']``) of its PMIDs are summed and divided by
    ``max_citations + 1``, where ``max_citations`` is the largest citation
    count of any single paper in ``pmid_dict``. Edges without PMIDs get 0.
    PMIDs that have no entry in ``pmid_dict`` contribute 0 instead of raising.

    Returns:
        dict mapping ``'<source name>--><target name>'`` to the weight.
    """
    global pmid_dict
    global json_data

    # Largest citation count of any single paper. Papers with exactly one
    # citation are included, so the normaliser is not left at 0 when no paper
    # has more than one citation.
    max_length_citations = max(
        (len(entry['citations']) for entry in pmid_dict.values()),
        default=0,
    )

    citation_weights = dict()
    nodes = json_data['nodes']
    for link in json_data['links']:
        cnt = 0  # TODO, maybe this should be 1? No PMIDS = 0 edgeweight right now
        # the PMIDs follow the first ':' of the edge type, separated by commas
        for pmid in link['edgetype'].split(':')[1].split(','):
            if pmid == '':
                continue
            entry = pmid_dict.get(pmid)
            if entry is None:
                # e.g. a PMID that was filtered out when all_pmids was built
                continue
            cnt += len(entry['citations'])
        # total citations of the edge normalised by (max citations + 1)
        w = cnt / (max_length_citations + 1)
        t_str = nodes[link['source']]['name'] + '-->' + nodes[link['target']]['name']
        citation_weights[t_str] = w
    return citation_weights
# Edge Weight Based on Paper Publication Date
def get_publication_edge_weights(reference_year=2020):
    """Compute an edge weight from the publication year of the edge's PMIDs.

    For every link the most recent publication year among its PMIDs is taken
    (the original code called this the "oldest" year, but it keeps the
    maximum). The weight is
    ``(1 - boltzman(reference_year - year, 10, 3)) / (1 - boltzman(0, 10, 3))``,
    i.e. a curve with midpoint 10 and tau 3. PMIDs with an empty ``pub_date``
    or without an entry in ``pmid_dict`` count as year ``-inf``. Edges without
    PMIDs get 0.

    Args:
        reference_year: Year that publication ages are measured from.
            Defaults to 2020, the value previously hard-coded.

    Returns:
        dict mapping ``'<source name>--><target name>'`` to the weight.
    """
    global pmid_dict
    global json_data

    # np.inf instead of np.infty: the latter alias was removed in NumPy 2.0.
    unknown_year = -np.inf

    # Parse the 'YYYYMMDD' publication date of every paper into a year.
    pmid_dict_year = dict()
    for pmid, entry in pmid_dict.items():
        pub_date = entry['pub_date']
        if pub_date != '':
            pmid_dict_year[pmid] = datetime.strptime(pub_date, '%Y%m%d').year
        else:
            pmid_dict_year[pmid] = unknown_year

    edge_weights_from_pub_year = dict()
    nodes = json_data['nodes']
    for link in json_data['links']:
        pmid_field = link['edgetype'].split(':')[1]
        if pmid_field != '':
            # most recent publication year among the edge's PMIDs
            newest_pub_year = unknown_year
            for pmid in pmid_field.split(','):
                if pmid == '':
                    continue
                year = pmid_dict_year.get(pmid, unknown_year)
                if year > newest_pub_year:
                    newest_pub_year = year
            w = (1 - boltzman(reference_year - newest_pub_year, 10, 3)) / (1 - boltzman(0, 10, 3))
        else:
            w = 0  # there are no pmids so the edge weight is 0
        t_str = nodes[link['source']]['name'] + '-->' + nodes[link['target']]['name']
        edge_weights_from_pub_year[t_str] = w
    return edge_weights_from_pub_year
# Let's create an edge weight by fusing the publication-year and number-of-citations edge weights (equally weighted)
'''
w = 0.5 * weight from pub_year + 0.5 * weight from # of citations
It might be a good idea to provide user-adjustable weights
'''
def get_publication_citation_weights(reference_year=2020, pub_weight=0.5, citation_weight=0.5):
    """Compute an edge weight that mixes publication year and citation count.

    For every link with PMIDs:

    * the publication term is
      ``(1 - boltzman(reference_year - year, 10, 3)) / (1 - boltzman(0, 10, 3))``
      where ``year`` is the most recent publication year among the PMIDs;
    * the citation term is ``1 - boltzman(cnt, 15, 3)`` where ``cnt`` is
      1 plus the summed citation counts of the PMIDs;
    * the weight is ``pub_weight * publication + citation_weight * citation``.

    PMIDs with an empty ``pub_date`` or without an entry in ``pmid_dict``
    count as year ``-inf`` and add no citations. Edges without PMIDs get 0.

    Args:
        reference_year: Year that publication ages are measured from
            (default 2020, the value previously hard-coded).
        pub_weight: Factor applied to the publication term (default 0.5).
        citation_weight: Factor applied to the citation term (default 0.5).

    Returns:
        dict mapping ``'<source name>--><target name>'`` to the weight.
    """
    global pmid_dict
    global json_data

    # np.inf instead of np.infty: the latter alias was removed in NumPy 2.0.
    unknown_year = -np.inf

    # Parse the 'YYYYMMDD' publication date of every paper into a year.
    pmid_dict_year = dict()
    for pmid, entry in pmid_dict.items():
        pub_date = entry['pub_date']
        if pub_date != '':
            pmid_dict_year[pmid] = datetime.strptime(pub_date, '%Y%m%d').year
        else:
            pmid_dict_year[pmid] = unknown_year

    edge_weights_from_pub_year_and_citation = dict()
    nodes = json_data['nodes']
    for link in json_data['links']:
        pmid_field = link['edgetype'].split(':')[1]
        if pmid_field != '':
            newest_pub_year = unknown_year  # most recent year among the PMIDs
            cnt = 1  # citation total starts at 1, as in the original code
            # one pass collects both the year and the citation total
            for pmid in pmid_field.split(','):
                if pmid == '':
                    continue
                year = pmid_dict_year.get(pmid, unknown_year)
                if year > newest_pub_year:
                    newest_pub_year = year
                entry = pmid_dict.get(pmid)
                if entry is not None:
                    cnt += len(entry['citations'])
            # NOTE(review): 1 - boltzman(cnt, 15, 3) gets smaller as cnt grows,
            # so more citations lower the weight -- confirm this is intended.
            w = (1 - boltzman(reference_year - newest_pub_year, 10, 3)) / (1 - boltzman(0, 10, 3)) * pub_weight + (1 - boltzman(cnt, 15, 3)) * citation_weight
        else:
            w = 0  # no pmids = edge weight of 0
        t_str = nodes[link['source']]['name'] + '-->' + nodes[link['target']]['name']
        edge_weights_from_pub_year_and_citation[t_str] = w
    return edge_weights_from_pub_year_and_citation
def get_h_index_weights(email):
    """Compute an edge weight from the highest author h-index on each edge.

    All author names stored in ``pmid_dict`` are passed to
    ``h_index_finder.find_h_index`` together with ``email`` and the global
    ``api_key``. For every link with PMIDs, the highest h-index among the
    authors of all of its papers is mapped through ``boltzman(h, 20, 2)``.
    Edges without PMIDs get 0.

    Changes from the earlier implementation:

    * the maximum is now taken across every PMID of the edge; it used to be
      reset for each PMID, so only the last paper counted;
    * authors or PMIDs without a known value are skipped (treated as -1)
      instead of raising ``KeyError``;
    * the networkx graph that was built and then discarded is gone.

    Args:
        email: Email address forwarded to ``h_index_finder.find_h_index``.

    Returns:
        dict mapping ``'<source name>--><target name>'`` to the weight.
    """
    global json_data
    global pmid_dict
    global api_key

    # Collect H-Index for all authors for a given subgraph
    print('Loading all Authors into List...')
    authors = list()
    for key in tqdm(pmid_dict.keys()):
        authors.extend(pmid_dict[key]['authors'])
    print('Collecting H-Index for All Authors using h_index_finder.py')
    author_2_h_index = dict()
    # presumably find_h_index returns an author -> h-index mapping; verify
    author_2_h_index.update(h_index_finder.find_h_index(email, authors, api_key))

    edge_weights_from_h_index = dict()
    nodes = json_data['nodes']
    for link in json_data['links']:
        pmid_field = link['edgetype'].split(':')[1]
        if pmid_field != '':
            # highest h-index over every author of every paper on this edge
            max_h_index = -1
            for pmid in pmid_field.split(','):
                if pmid == '' or pmid not in pmid_dict:
                    continue
                for author in pmid_dict[pmid]['authors']:
                    h_index = author_2_h_index.get(author, -1)
                    if h_index > max_h_index:
                        max_h_index = h_index
            # map the maximum h-index through the Boltzmann function
            w = boltzman(max_h_index, 20, 2)
        else:
            w = 0  # no pmids = edge weight of 0
        t_str = nodes[link['source']]['name'] + '-->' + nodes[link['target']]['name']
        edge_weights_from_h_index[t_str] = w
    return edge_weights_from_h_index
def get_original_edge_weights():
    """Compute the original edge weight: the edge's share of all PMIDs.

    For every link in ``json_data['links']`` the weight is the number of
    PMIDs listed on the edge divided by ``len(all_pmids)``. Edges without
    PMIDs get 0.

    Changes from the earlier implementation:

    * the networkx graph, the 10000-iteration ForceAtlas2 layout and the
      holoviews plot are no longer built; their results were discarded and
      never affected the returned weights;
    * empty PMID tokens (e.g. from a trailing comma) are not counted, matching
      how the other weight functions skip them;
    * an empty ``all_pmids`` yields weight 0 instead of ``ZeroDivisionError``.

    Returns:
        dict mapping ``'<source name>--><target name>'`` to the weight.
    """
    global json_data
    global all_pmids

    total_pmids = len(all_pmids)
    edge_weights_from_original = dict()
    nodes = json_data['nodes']
    for link in json_data['links']:
        w = 0
        pmid_field = link['edgetype'].split(':')[1]
        if pmid_field != '' and total_pmids > 0:
            edge_pmids = [pmid for pmid in pmid_field.split(',') if pmid != '']
            w = len(edge_pmids) / total_pmids
        t_str = nodes[link['source']]['name'] + '-->' + nodes[link['target']]['name']
        edge_weights_from_original[t_str] = w
    # return original edge weights
    return edge_weights_from_original
# START HERE
'''
This is the public function to calculate edge weights.
Args:
sub_graph_json: JSON text of the subgraph (a string; it is parsed with json.loads)
email: a valid email for API calls
json_name: name of the JSON file or relationship
Returns:
dataframe: dataframe containing each relationship and the various edge weights
Also writes the dataframe as a CSV file into a folder called {json_name}
'''
def calculate_edge_weights(sub_graph_json, email, json_name, apikey=None):
    """Calculate every kind of edge weight for a subgraph and save them as CSV.

    Resets the module-level state, parses the subgraph, gathers the PMID
    metadata with ``collect_NCBI`` and then runs each weight function in turn.

    Args:
        sub_graph_json: JSON text of the subgraph (parsed with ``json.loads``).
            It must contain ``'nodes'`` (each with ``'id'``, ``'name'`` and
            ``'category'``) and ``'links'`` (each with ``'source'``,
            ``'target'`` and an ``'edgetype'`` whose part after the first
            ``':'`` is a comma-separated PMID list).
        email: Email address set on ``Entrez.email`` and forwarded to the
            h-index lookup.
        json_name: Name of the JSON file or relationship; used as the output
            folder name and file prefix.
        apikey: Optional API key, stored in the global ``api_key``.

    Returns:
        pandas.DataFrame with one row per edge and one column per weight.
        The same table is written to ``./{json_name}/{json_name}.csv``.
    """
    global id_2_name_dict
    global id_2_category_dict
    global all_pmids
    global json_data
    global pmid_dict
    global rel_name
    global api_key

    rel_name = json_name
    api_key = apikey
    id_2_name_dict = dict()
    id_2_category_dict = dict()
    all_pmids = []
    pmid_dict = dict()
    json_data = json.loads(sub_graph_json)
    Entrez.email = email

    # exist_ok avoids the check-then-create race of exists() + mkdir()
    os.makedirs(f'./{rel_name}', exist_ok=True)

    # Book-Keeping of Node Attributes
    for node in json_data['nodes']:
        id_2_name_dict[node['id']] = node['name']
        id_2_category_dict[node['id']] = node['category']

    # List all pmids from json file
    print('Creating a List of All PMIDS in Subgraph')
    seen_pmids = set()  # O(1) membership test; all_pmids keeps first-seen order
    for link in json_data['links']:
        pmid_field = link['edgetype'].split(':')[1]
        if pmid_field == '':
            continue
        for pmid in pmid_field.split(','):
            # tokens of length 0 or 1 are ignored, as before
            if len(pmid) > 1 and pmid not in seen_pmids:
                seen_pmids.add(pmid)
                all_pmids.append(pmid)

    # Collect required information from NCBI database
    print('Collecting PMID Metadata from NCBI')
    collect_NCBI()

    # Edge weight based on original calculation
    print('-'*20)
    print('Calculating Original Edge Weights...')
    edge_weight_original = get_original_edge_weights()

    # Edge weight based on # of citations (Boltzmann Citation Weight)
    print('-'*20)
    print('Calculating Citation Edge Weights...')
    edge_weight_citations = get_citation_edge_weights()

    # Edge weight based on paper publication date (Publication Year Weight)
    print('-'*20)
    print('Calculating Publication-Data Edge Weights...')
    edge_weight_publications = get_publication_edge_weights()

    # Combine above methods (Publication Year and Citation Weight)
    print('-'*20)
    print('Calculating Combo Citation/Publication Edge Weights...')
    edge_weight_citation_and_pub = get_publication_citation_weights()

    # Edge weight based on h-index of author (H-Index Weight)
    print('-'*20)
    print('Calculating Author H-Index Edge Weights...')
    edge_weight_h_index = get_h_index_weights(email)

    # Dump edge weights into CSV file
    print('-'*20)
    print('Dumping Edge Weights to CSV File...')
    # every weight dict is keyed by the same '<source>--><target>' strings
    edge = list(edge_weight_citation_and_pub.keys())
    df = pd.DataFrame({
        'Edge': edge,
        'Original_Weight': [float(edge_weight_original[key]) for key in edge],
        'Boltzmann_Citation_Weight': [float(edge_weight_citations[key]) for key in edge],
        'Publication_Year_Weight': [float(edge_weight_publications[key]) for key in edge],
        'Publication_Year_and_Citation_Weight': [float(edge_weight_citation_and_pub[key]) for key in edge],
        'H_Index_Weight': [float(edge_weight_h_index[key]) for key in edge],
    })
    df.to_csv(f'./{rel_name}/{rel_name}.csv', index=False)
    return df