-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractUKdblp.py
325 lines (247 loc) · 12.5 KB
/
extractUKdblp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
from itertools import groupby
from collections import defaultdict, Counter
from msgpack import unpackb, packb
from matchauthors import parse_institutions
import numpy as np
import networkx as nx
# Open the UK authors and papers
def load_all_data():
    """Read the msgpack data files and index everything the analysis needs.

    Returns the tuple (authors_list, authors_map, papers_list,
    inst_papers_selected, institutions, author_papers, inst_papers,
    baseline_venue_count) where:
      authors_list          raw (inst, ref_author, dblp_author) triples
      authors_map           dblp author name -> institution
      papers_list           sorted raw (inst, ref_title, dblp_entry) triples
      inst_papers_selected  inst -> {author -> [REF-selected papers]}
      institutions          institution id -> pretty name
      author_papers         author -> [all their 2009-2014 papers]
      inst_papers           inst -> [all 2009-2014 papers by its staff]
      baseline_venue_count  Counter over venues, one count per matched
                            UK-author occurrence on a paper
    """
    # (inst, ref_author, dblp_author)
    authors_list = unpackb(file("data/author_list.dat", "rb").read())
    authors_map = dict((dblp_name, inst) for inst, _, dblp_name in authors_list)
    # (inst, ref_title, dblpentry) -- sorted so groupby sees one run per inst
    papers_list = sorted(unpackb(file("data/paper_list.dat", "rb").read()))
    inst_papers_selected = {}
    for inst, group in groupby(papers_list, key=lambda entry: entry[0]):
        per_author = {}
        for _, _, (authors, title, booktitle, year) in group:
            paper = (authors, title, booktitle, year)
            for name in authors:
                # Keep the paper only under authors matched to this institution.
                if name in authors_map and authors_map[name] == inst:
                    per_author.setdefault(name, []).append(paper)
        inst_papers_selected[inst] = per_author
    # Extract all other papers from UK authors.
    dblp_entries = unpackb(file("data/allfiles.dat", "rb").read())
    # Pretty names for institutions
    institutions = parse_institutions("data/Institution.csv")
    author_papers = defaultdict(list)
    inst_papers = defaultdict(list)
    baseline_venue_count = Counter()
    for authors, title, booktitle, year in dblp_entries:
        # Restrict to the REF 2014 assessment window.
        if not (2009 <= int(year) <= 2014):
            continue
        for name in authors:
            if name not in authors_map:
                continue
            paper = (tuple(authors), title, booktitle, year)
            inst_papers[authors_map[name]].append(paper)
            author_papers[name].append(paper)
            baseline_venue_count[booktitle] += 1
    return (authors_list, authors_map, papers_list, inst_papers_selected,
            institutions, author_papers, inst_papers, baseline_venue_count)
# Out of institution statistics
def out_of_institution(papers_list, authors_map):
    """Count cross-institution co-authorship on REF-submitted papers.

    papers_list: iterable of (inst, ref_title, (authors, title, venue, year)).
    authors_map: dblp author name -> institution.

    Returns (count_authors, count_inst, count_venues):
      count_authors  authors credited on a paper submitted by a *different*
                     institution than their own
      count_inst     the home institutions of those authors
      count_venues   venue frequency over every submitted paper
    """
    authors_seen = Counter()
    insts_seen = Counter()
    venues_seen = Counter()
    for inst, _, (authors, _, venue, _) in papers_list:
        venues_seen[venue] += 1
        for name in authors:
            # Only matched authors whose home institution differs count.
            if name not in authors_map:
                continue
            if authors_map[name] == inst:
                continue
            authors_seen[name] += 1
            insts_seen[authors_map[name]] += 1
    return (authors_seen, insts_seen, venues_seen)
def xcode(name):
    """Decode a (possibly missing) UTF-8 byte-string venue name to unicode.

    Returns the literal u'None' when name is None, so graph nodes are
    always hashable unicode strings.
    """
    # 'is None' rather than '== None': identity is the correct (and
    # conventional) test for the None singleton.
    if name is None:
        return u'None'
    return unicode(name, encoding='utf-8')
def rank_digraph(authors_map, inst_papers_selected, author_papers, weighting=True):
    """Build the venue-to-venue "selection" digraph.

    For every matched author, an edge source -> destination is added from
    each venue of a paper the author did NOT submit to the REF towards each
    venue of a paper they DID submit (edges point from passed-over venues to
    selected venues).

    authors_map:          dblp author name -> institution.
    inst_papers_selected: inst -> {author -> [(authors, title, venue, year)]}.
    author_papers:        author -> [(authors, title, venue, year)] of all
                          their papers.
    weighting:            if True, each author contributes a total weight of
                          1.0 spread over all their source x destination
                          pairs; otherwise every pair gets weight 1.0.

    Returns a networkx.DiGraph restricted to venues that appear among the
    REF selections.
    """
    G = nx.DiGraph()
    ref_venues = set()
    # The directed graph approach
    for author, inst in authors_map.iteritems():
        # No papers from this author found :-(
        if author not in inst_papers_selected[inst]:
            continue
        # Otherwise list the venues selected
        included_venues = []
        included_papers = set()
        for (_, title, venue, _) in inst_papers_selected[inst][author]:
            included_papers.add(title)
            # NOTE(review): ref_venues stores the *raw* venue string while
            # graph nodes are xcode()-decoded unicode; the filter at the
            # bottom relies on py2 str/unicode comparison succeeding —
            # confirm venue names are ASCII-safe.
            ref_venues.add(venue)
            included_venues += [venue]
            G.add_node(xcode(venue))
        not_included_venues = []
        for (_, title, venue, _) in author_papers[author]:
            # Skip papers that were themselves selected (matched by title).
            if title in included_papers:
                continue
            # if venue not in included_venues:
            not_included_venues += [venue]
            G.add_node(xcode(venue))
        # Nothing to connect when either side is empty.
        if len(not_included_venues) * len(included_venues) == 0:
            continue
        if weighting:
            # One unit of weight per author, split over all pairs.
            w = 1.0 / (len(not_included_venues) * len(included_venues))
        else:
            w = 1.0
        for source in not_included_venues:
            source = xcode(source)
            for destination in included_venues:
                destination = xcode(destination)
                if G.has_edge(source, destination):
                    G[source][destination]['weight'] += w # 1.0
                else:
                    G.add_edge(source, destination)
                    G[source][destination]['weight'] = w # 1.0
    # Drop venues that never appear in a REF selection.
    # NOTE(review): removing nodes while iterating assumes networkx 1.x,
    # where nodes() returns a list; under 2.x (NodeView) this raises —
    # confirm the pinned networkx version.
    for n in G.nodes():
        if n not in ref_venues:
            G.remove_node(n)
    return G
def rank_paper_digraph(authors_map, inst_papers_selected, author_papers, papers_list):
    """Build a venue digraph by first linking individual papers.

    For every author, each of their papers published in a REF venue gets an
    edge towards each of their REF-selected papers (selected papers filtered
    to 2009-2014).  Paper-level edge weights are counts, then out-normalised
    per source paper, and finally collapsed into a venue -> venue digraph G2
    by summing the weights of all paper edges between each venue pair.

    Returns the venue-level networkx.DiGraph G2.
    """
    G = nx.DiGraph()
    ref_venues = set( venue for _, _, (authors, title, venue, year) in papers_list)
    # The directed graph approach
    for author, inst in authors_map.iteritems():
        # No papers from this author found :-(
        if author not in inst_papers_selected[inst]:
            continue
        # Otherwise list the venues selected
        for (authorsSel, titleSel, venueSel, yearSel) in inst_papers_selected[inst][author]:
            # Only selected papers inside the REF window act as targets.
            if not (2009 <= int(yearSel) <= 2014):
                continue
            for (authors, title, venue, year) in author_papers[author]:
                # Do not care about non REF venues
                if venue not in ref_venues:
                    continue
                # Add the paper edge
                G.add_node((titleSel, venueSel))
                G.add_node((title, venue))
                if G.has_edge((title, venue), (titleSel, venueSel)):
                    G[(title, venue)][(titleSel, venueSel)]["weight"] += 1.0
                else:
                    G.add_edge((title, venue), (titleSel, venueSel), weight=1.0)
    # Normalize the graph of papers
    # Nodes with no outgoing edges leave W at 0, but the inner loop then has
    # nothing to rescale, so no division by zero occurs.
    for n in G.nodes():
        edges = G.out_edges(n)
        W = sum((G[u][v]['weight'] for u,v in edges), 0)
        for u,v in edges:
            G[u][v]['weight'] = G[u][v]['weight'] / W
    # Statistics about the venue imbalance
    G2 = nx.DiGraph()
    for u,v in G.edges():
        _, venue = u
        _, venueSel = v
        # New weight
        try:
            w = G[u][v]["weight"] # / global_venues_numbers[venueSel] # float(global_venues_numbers[venueSel]) / uk_freq[venueSel]
            if G2.has_edge(venue, venueSel):
                G2[venue][venueSel]["weight"] += w
            else:
                G2.add_edge(venue, venueSel, weight=w)
        except Exception as e:
            # Best-effort: report and keep collapsing the remaining edges.
            print e
    return G2
def get_stationary_distribution(G):
    """Approximate the stationary distribution of a random walk on G.

    Builds a dense row-normalised transition matrix from the weighted
    digraph G and runs 50 steps of power iteration, starting from a
    distribution proportional to each node's normalised out-mass.

    Returns (nodes, dist): the node list and the matching probability
    vector, so dist[i] belongs to nodes[i].
    """
    nodes = G.nodes()
    index_of = {node: idx for idx, node in enumerate(nodes)}
    size = len(nodes)
    # Dense transition matrix from the edge weights.
    trans = np.zeros((size, size))
    for src, dst in G.edges():
        trans[index_of[src], index_of[dst]] = G[src][dst]['weight']
    # Row-normalize; a divisor of 1 is substituted for all-zero rows so
    # sinks stay all-zero instead of dividing by zero.
    row_sums = np.sum(trans, axis=1)
    row_sums[row_sums == 0.0] = 1.0
    trans = trans.astype(float) / row_sums[:, np.newaxis]
    # Initial distribution from the normalised out-mass of each node.
    start = np.sum(trans, axis=1)
    start[np.isnan(start)] = 0.0
    dist = start / np.sum(start)
    for _ in range(50):
        dist = dist.dot(trans)
    return (nodes, dist)
def compare_venue_ratios(author_papers, papers_list):
    """Write results/venue_popularity.txt.

    For each venue (with at least 20 papers globally) report its total
    paper count and the percentages of those papers that are (a) by UK
    authors in 2009-2014 and (b) REF-submitted, sorted by the REF fraction.
    """
    # Global per-venue paper counts, precomputed and msgpack-encoded.
    global_venues_numbers = unpackb(file("data/venuestats.dat", "rb").read())
    # Flatten and de-duplicate papers shared by several UK authors.
    uk_papers = set(sum(author_papers.values(), []))
    uk_freq = Counter()
    uk_freq.update(venue for _,_,venue,year in uk_papers if 2009 <= int(year) <= 2014)
    # How many REF papers?
    ref_papers = set((tuple(authors), title, venue, year) for _, _, (authors, title, venue, year) in papers_list)
    ref_counts = Counter(venue for _,_,venue,_ in ref_papers)
    # Venues ordered by the fraction of their output that was REF-submitted.
    venues_by_size = sorted(global_venues_numbers, key=lambda x: float(ref_counts[x])/global_venues_numbers[x], reverse=True)
    fvp = file("results/venue_popularity.txt","w")
    print >>fvp, "Number of papers, fraction of UK papers, fraction of REF papers by venue"
    for v in venues_by_size:
        # Skip tiny venues where the percentages are too noisy.
        if global_venues_numbers[v] < 20:
            continue
        uk_frequency = uk_freq[v] * 100.0 / global_venues_numbers[v]
        ref_frequency = ref_counts[v] * 100.0 / global_venues_numbers[v]
        print >>fvp, "%4d\t%02.2f%%\t%02.2f%%\t%s" % (global_venues_numbers[v], uk_frequency , ref_frequency, v)
def main():
    """Run the full pipeline: load the data, build the selection graphs,
    and write the ranking reports under results/."""
    (authors_list, authors_map, papers_list, inst_papers_selected, institutions, author_papers, inst_papers, baseline_venue_count) = load_all_data()
    (count_authors, count_inst, count_venues) = out_of_institution(papers_list, authors_map)
    G = rank_digraph(authors_map, inst_papers_selected, author_papers, True)
    # Test new method:
    G2 = rank_paper_digraph(authors_map, inst_papers_selected, author_papers, papers_list)
    (all_nodes, dist) = get_stationary_distribution(G)
    compare_venue_ratios(author_papers, papers_list)
    # Institutions lists by papers used by other institutions
    frankothers = file("results/rank_institution_by_others.txt", "w")
    print >>frankothers, "Rank Institutions by number of papers used by *others* in the REF"
    for inst, cnt in count_inst.most_common():
        # if inst is not None:
        print >>frankothers, "%3d | %s" % (cnt, institutions[inst])
    print
    # List venues by a ratio of ref / accepted
    lst = []
    for venue, cnt in count_venues.most_common():
        # if inst is not None:
        if baseline_venue_count[venue] > 0:
            lst += [((float(cnt) * 100 / baseline_venue_count[venue], venue))]
            # print "%2.2f %s" % (float(cnt) * 100 / baseline_venue_count[venue], venue)
    frankvenratio = file("results/rank_venue_by_ref_paper_ratio.txt", "w")
    print >>frankvenratio, "Rank venues by ratio of REF submitted papers vs. available papers"
    for cnt, venue in sorted(lst, reverse=True):
        # Require a handful of baseline papers so ratios are not pure noise.
        if baseline_venue_count[venue] > 4:
            print >>frankvenratio, "%2.2f\t%3d\t%s" % (cnt, count_venues[venue], venue)
    frankvenratio = file("results/rank_venue_stationary.txt", "w")
    print >>frankvenratio, "Rank venues by the rank of the stationary distribution in the selection graph"
    # Pair each venue with its stationary mass, best first.
    venues = sorted([(ni, dist[i]) for i, ni in enumerate(all_nodes)], reverse=True, key=lambda x:x[1])
    for venue, cnt in venues:
        if cnt > 0.0:
            print >>frankvenratio, "%2.2f | %s" % (1000 * cnt, venue)
    # Score institutions by quality-research mass
    venues_juice = dict(venues)
    inst_juice_by_author12 = defaultdict(float)
    inst_juice_by_author4 = defaultdict(float)
    inst_juice_by_author_all = defaultdict(float)
    for a in author_papers:
        list_of_juices = []
        for authors, title, booktitle, year in author_papers[a]:
            if booktitle in venues_juice:
                list_of_juices += [ venues_juice[booktitle] ]
        # Include only 4 outputs as in the REF
        # NOTE(review): 'inst' below is the stale loop variable left over
        # from the count_inst.most_common() loop above, so every author is
        # divided by the same institution's staff count — this looks like a
        # bug; presumably it should read len(inst_papers_selected[authors_map[a]]).
        inst_juice_by_author12[authors_map[a]] += sum(sorted(list_of_juices, reverse=True)[:12]) / len(inst_papers_selected[inst])
        inst_juice_by_author4[authors_map[a]] += sum(sorted(list_of_juices, reverse=True)[:4]) / len(inst_papers_selected[inst])
        inst_juice_by_author_all[authors_map[a]] += sum(sorted(list_of_juices, reverse=True)) / len(inst_papers_selected[inst])
    from parse_outputs import parse_outputs
    outputs = parse_outputs("data/Outcomes.csv")
    # Collapse the REF star profile (4*, 3*, 2*, 1*) into one weighted score.
    ref_rank_f = lambda x:outputs[x][0]*4 + outputs[x][1]*3 + outputs[x][2]*2 + outputs[x][3]*1
    ref_sorted_inst = sorted([inst for inst in outputs], reverse=True, key=ref_rank_f)
    ref_rank = dict([(inst, i) for i, inst in enumerate(ref_sorted_inst)])
    sel4 = sorted(institutions, reverse=True, key=lambda inst: inst_juice_by_author4[inst])
    sel4_rank = dict([(inst, i) for i, inst in enumerate(sel4)])
    selall = sorted(institutions, reverse=True, key=lambda inst: inst_juice_by_author_all[inst])
    selall_rank = dict([(inst, i) for i, inst in enumerate(selall)])
    frankvenratio = file("results/rank_institution_stationary.txt", "w")
    print >>frankvenratio, "Rank institutions by the rank of the stationary distribution in the selection graph of the venues their staff publish"
    for i, inst in enumerate(sorted(institutions, reverse=True, key=lambda inst: inst_juice_by_author12[inst])):
        if inst_juice_by_author12[inst] > 0.005:
            inst_stars = outputs[inst][0]
            #print >>frankvenratio,"**%d** (%2.2f) | **%+d**\t(%3d) | **%+d**\t(%2.2f) | **%+d**\t(%2.2f) | **%s**" % (i, inst_juice_by_author12[inst], ref_rank[inst] - i, ref_rank_f(inst),
            # sel4_rank[inst]-i, inst_juice_by_author4[inst], selall_rank[inst]-i, inst_juice_by_author_all[inst], institutions[inst])
            print >>frankvenratio,"%d (%2.2f) | %d (%+d) | %s" % (i + 1, inst_juice_by_author12[inst], ref_rank[inst] + 1, ref_rank[inst] - i, institutions[inst])
# Script entry point: run the whole extraction/analysis pipeline.
if __name__ == "__main__":
    main()