def AuthorsToPubs(adata, flds):
    """Store each author in the DB with an assigned id and a list of paper
    records; return a pubmed -> [assigned ids] map and an
    assigned_id -> papers map."""
    a_to_pubs = {}
    print(len(adata))
    # put author data in the DB with assigned id and list of papers
    authors_w_p = {}
    for num, au in enumerate(adata.keys()):
        adata_nums = {}
        # assigned ids start from 1 billion
        a_id = 1000000000 + num + 1
        adata_nums["assigned_id"] = a_id
        # the author key is a ";"-joined string of the metadata fields
        for f, fld in enumerate(flds):
            adata_nums[fld] = au.split(";")[f]
        p_list = []
        for pps in adata[au]:
            pubmed, position, coauthors = pps
            p_list.append({
                "pubmed": pubmed,
                "position": position,
                "coauthors": coauthors
            })
            a_to_pubs.setdefault(pubmed, []).append(a_id)
        adata_nums["papers"] = p_list
        authors_w_p[a_id] = adata[au]
        # to link all co-authors with assigned id in the paper:
        DBcall("authors", num).loadData([adata_nums])
    return a_to_pubs, authors_w_p
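# A minimal usage sketch (hypothetical data; it assumes the DBcall layer is
# importable, and nothing below is a project fixture). It shows the shapes
# the function expects and returns: each adata key is a ";"-joined string of
# the metadata fields, each value a list of (pubmed, position, coauthors)
# tuples.
def _demo_AuthorsToPubs():
    flds = ["first_name", "last_name", "initials", "af_place", "af_country"]
    adata = {
        "Ada;Lovelace;AL;London;UK": [(101, 1, 2), (102, 2, 3)],
        "Alan;Turing;AT;Cambridge;UK": [(101, 2, 2)],
    }
    a_to_pubs, authors_w_p = AuthorsToPubs(adata, flds)
    # a_to_pubs maps each pubmed id to the assigned ids on that paper, e.g.
    # {101: [1000000001, 1000000002], 102: [1000000001]} (dict order may vary)
    return a_to_pubs, authors_w_p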
def AuthorsToPubs(adata, flds):
    a_to_pubs = {}
    print len(adata)
    # put author data in the DB with assigned id and list of papers
    authors_w_p = {}
    for num, au in enumerate(adata.keys()):
        adata_nums = {"assigned_id": num + 1}
        # split the ";"-joined author key back into named metadata fields
        meta = {}
        for f, fld in enumerate(flds):
            meta[fld] = au.split(";")[f]
        adata_nums["meta"] = meta
        adata_nums["papers"] = adata[au]
        authors_w_p[num + 1] = adata[au]
        # to link all co-authors with assigned id in the paper:
        for p in adata[au]:
            a_to_pubs.setdefault(p, []).append(adata_nums["assigned_id"])
        DBcall("authors", num).loadData([adata_nums])
        print num + 1, adata[au]
    return a_to_pubs, authors_w_p
def MakeGraphData(a_to_pubs):
    """Count co-authorships: for every paper an author appears on, each
    other author of that paper contributes +1 to the pair's edge weight."""
    keys = {"assigned_id": {'$gt': 0}}
    authors = DBcall("authors", 0).findDatalist(keys)
    e_tuples = {}
    for au in authors:
        for p in au["papers"]:
            all_a = a_to_pubs[p]
            for a in all_a:
                if a != au["assigned_id"]:
                    edge = (a, au["assigned_id"])
                    e_tuples[edge] = e_tuples.get(edge, 0) + 1
    return e_tuples
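# A self-contained dry-run of the same pair counting (no DB; the author
# records below are hypothetical stand-ins for what findDatalist returns):
def _demo_MakeGraphData():
    a_to_pubs = {101: [1, 2], 102: [1]}
    authors = [
        {"assigned_id": 1, "papers": [101, 102]},
        {"assigned_id": 2, "papers": [101]},
    ]
    e_tuples = {}
    for au in authors:
        for p in au["papers"]:
            for a in a_to_pubs[p]:
                if a != au["assigned_id"]:
                    edge = (a, au["assigned_id"])
                    e_tuples[edge] = e_tuples.get(edge, 0) + 1
    # e_tuples == {(2, 1): 1, (1, 2): 1}: each co-authorship is counted once
    # in each direction, so the weights come out symmetric
    return e_tuples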
data["num_of_authors"] = len(ca_list) for k, v in names_p.iteritems(): if k == elem.tag: data[v] = elem.text if elem.tag == "Keyword": kw_list.append(elem.text) # print event, elem.tag, elem.attrib, '>', elem.text, '<', date #, json if elem.tag == 'Author' and event == 'end': a_data["position"] = len(ca_list) + 1 ca_list.append(a_data) a_data = {} if elem.tag == 'PubmedArticle' and event == 'end': data["authors"] = ca_list data["keywords"] = kw_list keys = {"pubmed": data["pubmed"]} paper = DBcall("papers", 0).findData(keys) if paper: data["xml_source_update"] = gz DBcall("papers", n).updateData(keys, data) updts += 1 else: data["xml_source"] = gz DBcall("papers", n).loadData([data]) insrts += 1 p_data = {} data = {} ca_list = [] kw_list = [] n += 1 nn += 1 print str(datetime.now())[:-7] + " : finished updating " + str(nn) + "th " + xmL + " with " + str(updts) + " updates and " + str(insrts) + " inserts"
def main(): flds = ["first_name", "last_name", "initials", "af_place", "af_country"] print str(datetime.now())[:-7] + " : extract data" keys = {"pubmed": {'$gt': 0}} # or findDatalistSorted(keys, "pubmed", 1).limit(1000) papers = DBcall("papers", 0).findDatalist(keys).limit(10) sp = {} pps = [] for p in papers: pps.append(p["pubmed"]) if len(p["authors"]) > 0: sp[p["pubmed"]] = p["authors"] # authors with their papers (more than 1 paper): print str(datetime.now())[:-7] + " : start data linking" pubs_to_author = idAuthor(sp, flds) """ we might have got here a list of authors where some has different fields, but the same firs_name, last_name, initials: merge authors by longest metadata if they are met under the same name (first, last name, initials) in the same paper list (name + affiliation): the same metadata also can be RW differently being in different encoding """ edges = [] adata = dict(pubs_to_author) print len(pubs_to_author.keys()) for nm, pbs in pubs_to_author.iteritems(): for p in pbs: for c_nm, c_pbs in pubs_to_author.iteritems(): # if in the same p in a list of pprs assigned to another author... if p in c_pbs and nm != c_nm: # if metadata are different but name is the same: nm_lst = nm.split(";")[:2] c_nm_lst = c_nm.split(";")[:2] if nm_lst == c_nm_lst: if len(nm) >= len(c_nm): try: del adata[c_nm] except: continue else: try: del adata[nm] except: continue a_to_pubs, authors_w_p = AuthorsToPubs(adata, flds) print list(set(pubs_to_author.keys()) - set(adata.keys())) e_tuples = MakeGraphData(a_to_pubs) aus = len(authors_w_p.keys()) print str(datetime.now())[:-7] + " : " + str(aus) + " authors" print str(datetime.now())[:-7] + " : end linking" count_n = len(e_tuples.keys()) G = nx.Graph() seed = [(x, y) for x in range(count_n) for y in range(1, count_n)] nodes = {} for node, weight in e_tuples.iteritems(): n1, n2 = node for nn in node: try: nodes[nn] except: G.add_node(nn) [G.node[nn]['pos']] = random.sample(seed, 1) nodes[nn] = 1 G.add_edge(n1, n2, weight=e_tuples[node]) pos = nx.get_node_attributes(G, 'pos') dmin = 1 ncenter = 0 for n in pos: x, y = pos[n] d = (x-0.5)**2+(y-0.5)**2 if d < dmin: ncenter = n dmin = d # p = nx.single_source_shortest_path_length(G, ncenter) print G.nodes() # print G.edges() edge_trace = Scatter( x=[], y=[], line=Line(width=0.5, color='#888'), hoverinfo='none', mode='lines') for edge in G.edges(): # print edge x0, y0 = G.node[edge[0]]['pos'] x1, y1 = G.node[edge[1]]['pos'] edge_trace['x'] += [x0, x1, None] edge_trace['y'] += [y0, y1, None] node_trace = Scatter( x=[], y=[], text=[], mode='markers', hoverinfo='text', marker=Marker( showscale=True, # colorscale other options # 'Greys' | 'Greens' | 'Bluered' | 'Hot' | 'Picnic' | 'Portland' | # Jet' | 'RdBu' | 'Blackbody' | 'Earth' | 'Electric' | 'YIOrRd' colorscale='YIGnBu', reversescale=True, color=[], size=10, colorbar=dict( thickness=15, title='Node Connections', xanchor='left', titleside='right' ), line=dict(width=2))) for node in G.nodes(): x, y = G.node[node]['pos'] node_trace['x'].append(x) node_trace['y'].append(y) print "Xs:" + str(len(node_trace['x'])) + " coord: " + str(node_trace['x'][4]) print "Ys:" + str(len(node_trace['y'])) + " coord: " + str(node_trace['y'][4]) # for gn in G.nodes(): # print gn, G.neighbors(gn) # >>> G.add_path([0,1,2,3]) # >>> [(n,nbrdict) for n,nbrdict in G.adjacency_iter()] # [(0, {1: {}}), (1, {0: {}, 2: {}}), (2, {1: {}, 3: {}}), (3, {2: {}})] # print enumerate(G.adjacency_list()) for node, adjacencies in G.adjacency_iter(): 
node_trace['marker']['color'].append(len(adjacencies)) node_info = str(node) + ' # of connections: '+str(len(adjacencies)) node_trace['text'].append(node_info) fig = Figure(data=Data([edge_trace, node_trace]), layout=Layout( title='<br>Network graph made with Python', titlefont=dict(size=16), showlegend=False, hovermode='closest', margin=dict(b=20, l=5, r=5, t=40), annotations=[dict( text="Thanks to PlotLy", showarrow=False, xref="paper", yref="paper", x=0.005, y=-0.002)], xaxis=XAxis(showgrid=False, zeroline=False, showticklabels=False), yaxis=YAxis(showgrid=False, zeroline=False, showticklabels=False))) plotly.offline.plot(fig, filename='networkx')
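# Design note: the grid positions above are sampled at random, so the
# "closest to center" search is mostly decorative. A common alternative with
# the same NetworkX 1.x API (a sketch, not what this script does) is a
# force-directed layout:
#
#   pos = nx.spring_layout(G)
#   for nn in G.nodes():
#       G.node[nn]['pos'] = pos[nn]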
            # normalize ORCID: keep full URLs, prefix bare hyphenated ids,
            # and hyphenate 16-character raw ids before prefixing
            if re.search('http://orcid.org', elem.text):
                a_data["orcid"] = elem.text
            elif re.search('-', elem.text):
                a_data["orcid"] = 'http://orcid.org/' + elem.text
            else:
                a_data["orcid"] = ('http://orcid.org/' + elem.text[0:4] +
                                   "-" + elem.text[4:8] + "-" +
                                   elem.text[8:12] + "-" + elem.text[12:16])
        data["num_of_authors"] = len(ca_list)
        if elem.tag == "Keyword":
            kw_list.append(elem.text)
        # print event, elem.tag, elem.attrib, '>', elem.text, '<', date
        if elem.tag == 'Author' and event == 'end':
            a_data["position"] = len(ca_list) + 1
            ca_list.append(a_data)
            a_data = {}
        if elem.tag == 'PubmedArticle' and event == 'end':
            data["authors"] = ca_list
            data["keywords"] = kw_list
            data["xml_source"] = gz
            DBcall("papers", n).loadData([data])
            p_data = {}
            data = {}
            ca_list = []
            kw_list = []
            n += 1
    nn += 1
    print str(datetime.now())[:-7] + " : finished loading " + str(nn) + "th " + xmL
    # with open(xmL, 'rb') as f_in, gzip.open(xmL + '.gz', 'wb') as f_out:
    #     shutil.copyfileobj(f_in, f_out)
    os.unlink(xmL)
print str(datetime.now())[:-7] + " : done"
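# Dry-run of the ORCID normalization above (hypothetical inputs):
#
#   "http://orcid.org/0000-0002-1825-0097" -> kept as-is
#   "0000-0002-1825-0097" -> "http://orcid.org/0000-0002-1825-0097"
#   "0000000218250097"    -> "http://orcid.org/0000-0002-1825-0097"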
def main(): flds = [ "first_name", "last_name", "initials", "af_place", "af_country", "orcid", "email" ] print(str(datetime.now())[:-7] + " : extract data") keys = {"pubmed": {'$gt': 0}} # or findDatalistSorted(keys, "pubmed", 1).limit(1000) papers = DBcall("papers", 0).findDatalist(keys).limit(10000) authors_to_pubmed = {} pps = [] for p in papers: pps.append(p["pubmed"]) if len(p["authors"]) > 0: authors_to_pubmed[p["pubmed"]] = p["authors"] # positions = [] # for p_a in p["authors"]: # positions.append(p_a["positions"]) # authors with their papers (more than 1 paper): print(str(datetime.now())[:-7] + " : start data linking") pubs_to_author = idAuthor(authors_to_pubmed, flds) """ we might have got here a list of authors where some has different fields, but the same firs_name, last_name, initials: so let's merge authors by longest metadata if they are met under the same name (first, last, initials) in the same paper list (name+affiliation): the same metadata also can be RW differently being in different encoding """ edges = [] adata = dict(pubs_to_author) print(len(pubs_to_author.keys())) for nm, pbs in pubs_to_author.iteritems(): for p in pbs: for c_nm, c_pbs in pubs_to_author.iteritems(): # if the same p in this list of pprs assigned to # another author... if p in c_pbs and nm != c_nm: # if metadata are different but name is the same: nm_lst = nm.split(";")[:2] c_nm_lst = c_nm.split(";")[:2] if nm_lst == c_nm_lst: if len(nm) >= len(c_nm): try: del adata[c_nm] except: continue else: try: del adata[nm] except: continue a_to_pubs, authors_w_p = AuthorsToPubs(adata, flds) print(list(set(pubs_to_author.keys()) - set(adata.keys()))) e_tuples = MakeGraphData(a_to_pubs) count_n = len(e_tuples.keys()) aus = len(authors_w_p.keys()) print(str(datetime.now())[:-7] + " : " + str(aus) + " authors") print(str(datetime.now())[:-7] + " : end linking") G = nx.Graph() nodes = {} for node, weight in e_tuples.iteritems(): n1, n2 = node for nn in node: try: nodes[nn] except: G.add_node(nn) nodes[nn] = 1 G.add_edge(n1, n2, weight=e_tuples[node])
def main():
    G = nx.Graph()
    flds = ["first_name", "last_name", "initials", "af_place", "af_country"]
    # papers = DBcall("papers", 0).findDatalistSorted(keys, "pubmed", 1).limit(1000)
    print str(datetime.now())[:-7] + " : extract data"
    keys = {"pubmed": {'$gt': 0}}
    papers = DBcall("papers", 0).findDatalist(keys).limit(100)
    sp = {}
    pps = []
    for p in papers:
        pps.append(p["pubmed"])
        if len(p["authors"]) > 0:
            sp[p["pubmed"]] = p["authors"]
    # authors with their papers (more than 1 paper):
    print str(datetime.now())[:-7] + " : start data linking"
    pubs_to_author = idAuthor(sp, flds)
    """
    We may get a list of authors where some have different fields but the
    same first_name, last_name, initials: merge authors by longest metadata
    when they appear under the same name (first name, last name, initials)
    in the same paper list (name + affiliation); the same metadata can also
    be written differently in different encodings.
    """
    edges = []
    adata = dict(pubs_to_author)
    print len(pubs_to_author)
    for nm, pbs in pubs_to_author.iteritems():
        for p in pbs:
            for c_nm, c_pbs in pubs_to_author.iteritems():
                # if the same paper appears in the list of papers assigned
                # to another author...
                if p in c_pbs and nm != c_nm:
                    # ...and metadata differ but the name is the same:
                    nm_lst = nm.split(";")[:2]
                    c_nm_lst = c_nm.split(";")[:2]
                    if nm_lst == c_nm_lst:
                        if len(nm) >= len(c_nm):
                            try:
                                del adata[c_nm]
                            except KeyError:
                                continue
                        else:
                            try:
                                del adata[nm]
                            except KeyError:
                                continue
    a_to_pubs = {}
    print len(adata)
    # put author data in the DB with assigned id and list of papers
    for num, au in enumerate(adata.keys()):
        adata_nums = {"assigned_id": num + 1}
        meta = {}
        for f, fld in enumerate(flds):
            meta[fld] = au.split(";")[f]
        adata_nums["meta"] = meta
        adata_nums["papers"] = adata[au]
        # to link all co-authors with assigned id in the paper:
        for p in adata[au]:
            a_to_pubs.setdefault(p, []).append(adata_nums["assigned_id"])
        DBcall("authors", num).loadData([adata_nums])
    print list(set(pubs_to_author.keys()) - set(adata.keys()))
    keys = {"assigned_id": {'$gt': 0}}
    authors = DBcall("authors", 0).findDatalist(keys)
    e_tuples = {}
    for au in authors:
        for p in au["papers"]:
            all_a = a_to_pubs[p]
            for a in all_a:
                if a != au["assigned_id"]:
                    edge = (a, au["assigned_id"])
                    e_tuples[edge] = e_tuples.get(edge, 0) + 1
    for node, weight in e_tuples.iteritems():
        n1, n2 = node
        G.add_nodes_from(node)
        G.add_edge(n1, n2, weight=e_tuples[node])
    print "components: "
    print nx.connected_components(G)
    # print "degree: "
    # print nx.degree(G)
    # draw with node ids:
    nx.draw_networkx(G)
    # pos = nx.spring_layout(G)
    # nx.draw_networkx_nodes(G, pos=pos, nodelist=G.nodes())
    # nx.draw_networkx_edges(G, pos=pos, edgelist=G.edges())
    plt.show()
    # print list(set(sp.keys()) - set(a_to_pubs.keys()))
    print str(datetime.now())[:-7] + " : " + str(len(a_to_pubs)) + " papers linked to authors"
    print str(datetime.now())[:-7] + " : end linking"
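# Usage note (a sketch, not part of the original script): depending on the
# NetworkX 1.x minor version, connected_components returns a list of lists
# or a generator of sets; either way the giant component is the largest:
#
#   comps = nx.connected_components(G)
#   giant = max(comps, key=len)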
                try:
                    date = datetime.strptime(
                        childs["Year"] + "-" + childs["Month"] + "-" +
                        childs["Day"], "%Y-%m-%d")
                except ValueError:
                    # invalid calendar date: step the day back by 2 and retry
                    day = int(childs["Day"]) - 2
                    date = datetime.strptime(
                        childs["Year"] + "-" + childs["Month"] + "-" +
                        str(day), "%Y-%m-%d")
                    putLog(
                        "Date format was wrong for pid (pid = records "
                        "number), corrected for -2 days",
                        n, "XML parsing " + gz, "format")
                if v == "received":
                    data["received_date"] = date
                if v == "accepted":
                    data["accepted_date"] = date
            if k == "IdType" and v == "pubmed" and elem.tag == "ArticleId":
                data["pubmed"] = 0
                if elem.text is not None:
                    pubmed = int(elem.text)
        if elem.tag == 'PubmedArticle' and event == 'end':
            keys = {"pubmed": pubmed}
            DBcall("papers", n).updateData(keys, data)
            data = {}
            pubmed = 0
            n += 1
    nn += 1
    print str(datetime.now())[:-7] + " : finished updating " + str(nn) + "th " + xmL
    os.unlink(xmL)
print str(datetime.now())[:-7] + " : done"
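# Example of the fallback (hypothetical values): "2014-02-30" raises
# ValueError in strptime, and stepping the day back by 2 gives a valid date:
#
#   >>> datetime.strptime("2014-02-28", "%Y-%m-%d")
#   datetime.datetime(2014, 2, 28, 0, 0)
#
# Note this is a heuristic: day 31 in a 28-day month would still fail.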
nn = 0
for gz in sorted(gzips)[:110]:
    values = {}
    keys = {}
    xmL = xmLpath + gz
    # unpack the .gz archive next to it, dropping the ".gz" suffix
    with gzip.open(xmL, 'rb') as f:
        xmL = xmL[:-3]
        with open(xmL, "w") as ff:
            try:
                ff.write(f.read())
            except Exception as e:
                print "failed to unpack " + gz + ", exception: " + str(e)
                continue
    values["xml_source"] = gz
    print str(datetime.now())[:-7] + " : starting to update data from " + xmL
    for event, elem in etree.iterparse(
            xmL, events=('start', 'end', 'start-ns', 'end-ns')):
        childs = {"Year": "1000", "Month": "1", "Day": "1"}
        if event == 'start':
            for k, v in elem.attrib.iteritems():
                if k == "IdType" and v == "pubmed":
                    keys["paper.pubmed"] = elem.text
        if elem.tag == 'PubmedArticle' and event == 'end':
            DBcall("papers", nn).updateData(keys, values)
    nn += 1
    print str(datetime.now())[:-7] + " : finished updating from " + str(nn) + "th " + xmL
    os.unlink(xmL)
print str(datetime.now())[:-7] + " : done"
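# A minimal sketch of how iterparse feeds this handler (toy XML, not a real
# PubMed record; assumes the same etree import used above): 'start' events
# expose attributes, and the 'end' event on PubmedArticle marks a complete
# record.
def _demo_iterparse():
    from io import BytesIO
    toy = (b"<PubmedArticle>"
           b"<ArticleId IdType='pubmed'>101</ArticleId>"
           b"</PubmedArticle>")
    for event, elem in etree.iterparse(BytesIO(toy), events=('start', 'end')):
        print event, elem.tag, elem.attrib
    # start PubmedArticle {}
    # start ArticleId {'IdType': 'pubmed'}
    # end ArticleId {'IdType': 'pubmed'}
    # end PubmedArticle {}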