Example #1
def AuthorsToPubs(adata, flds):
    a_to_pubs = {}
    print(len(adata))
    # put author data in the DB with an assigned id and a list of papers
    authors_w_p = {}
    for num, au in enumerate(adata):
        adata_nums = {}
        # assigned ids start from 1 billion
        a_id = 1000000000 + num + 1
        adata_nums["assigned_id"] = a_id
        for f, fld in enumerate(flds):
            adata_nums[fld] = au.split(";")[f]
        p_list = []
        for pps in adata[au]:
            pubmed, position, coauthors = pps[0], pps[1], pps[2]
            p_list.append({
                "pubmed": pubmed,
                "position": position,
                "coauthors": coauthors
            })
            # link the paper back to this author's assigned id:
            a_to_pubs.setdefault(pubmed, []).append(a_id)
        adata_nums["papers"] = p_list
        authors_w_p[a_id] = adata[au]

        DBcall("authors", num).loadData([adata_nums])
    return a_to_pubs, authors_w_p
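These examples call into an undocumented DBcall wrapper with Mongo-style queries ({'$gt': 0}) and methods loadData, findData, findDatalist and updateData. Its real implementation is not shown anywhere on this page; purely as a reading aid, here is a hypothetical minimal sketch over pymongo (class shape, database name and method bodies are all assumptions):

from pymongo import MongoClient

class DBcall(object):
    # Hypothetical stand-in for the examples' DB wrapper; not the original.
    def __init__(self, collection, n):
        self.n = n  # record counter the examples pass in, kept for logging
        self.coll = MongoClient()["pubmed_db"][collection]  # assumed db name

    def loadData(self, docs):
        self.coll.insert_many(docs)

    def findData(self, keys):
        return self.coll.find_one(keys)

    def findDatalist(self, keys):
        return self.coll.find(keys)

    def updateData(self, keys, data):
        self.coll.update_one(keys, {"$set": data})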
Example #2
def AuthorsToPubs(adata, flds):
    a_to_pubs = {}
    print(len(adata))
    # put author data in the DB with an assigned id and a list of papers
    authors_w_p = {}
    for num, au in enumerate(adata):
        adata_nums = {"assigned_id": num + 1}
        meta = {}
        for f, fld in enumerate(flds):
            meta[fld] = au.split(";")[f]
        adata_nums["meta"] = meta
        adata_nums["papers"] = adata[au]
        authors_w_p[num + 1] = adata[au]
        # link all co-authors with their assigned ids in the paper:
        for p in adata[au]:
            a_to_pubs.setdefault(p, []).append(adata_nums["assigned_id"])

        DBcall("authors", num).loadData([adata_nums])
        print(num + 1, adata[au])
    return a_to_pubs, authors_w_p
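For comparison with Example #1: this variant nests the name/affiliation fields under a "meta" key and stores the raw paper list, while Example #1 flattens the fields and expands each paper into a dict. Roughly, with illustrative values:

# Example #1 document shape (flat fields, expanded papers):
{"assigned_id": 1000000001, "first_name": "Jane", "last_name": "Doe",
 "initials": "J", "af_place": "Somewhere", "af_country": "NL",
 "papers": [{"pubmed": 12345, "position": 1, "coauthors": ["..."]}]}

# Example #2 document shape (nested meta, raw paper ids):
{"assigned_id": 1, "meta": {"first_name": "Jane", "last_name": "Doe",
                            "initials": "J", "af_place": "Somewhere",
                            "af_country": "NL"},
 "papers": [12345, 67890]}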
Example #3
def MakeGraphData(a_to_pubs):
    keys = {"assigned_id": {'$gt': 0}}
    authors = DBcall("authors", 0).findDatalist(keys)
    e_tuples = {}
    for au in authors:
        for p in au["papers"]:
            # every other author on the same paper becomes an edge endpoint
            for a in a_to_pubs[p]:
                if a != au["assigned_id"]:
                    edge = (a, au["assigned_id"])
                    e_tuples[edge] = e_tuples.get(edge, 0) + 1
    return e_tuples
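Note that e_tuples is keyed by ordered pairs, so each co-authorship is accumulated once as (a, b) and once as (b, a). If a single undirected weight is wanted instead, one option (a sketch, not in the original) is to canonicalize the key:

edge = tuple(sorted((a, au["assigned_id"])))
e_tuples[edge] = e_tuples.get(edge, 0) + 1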
Example #4
             data["num_of_authors"] = len(ca_list)
         for k, v in names_p.iteritems():
             if k == elem.tag:
                 data[v] = elem.text
         if elem.tag == "Keyword":
             kw_list.append(elem.text)
             # print event, elem.tag, elem.attrib, '>', elem.text, '<', date  #, json
     if elem.tag == 'Author' and event == 'end':
         a_data["position"] = len(ca_list) + 1
         ca_list.append(a_data)
         a_data = {}
     if elem.tag == 'PubmedArticle' and event == 'end':
         data["authors"] = ca_list
         data["keywords"] = kw_list
         keys = {"pubmed": data["pubmed"]}
         paper = DBcall("papers", 0).findData(keys)
         if paper:
             data["xml_source_update"] = gz
             DBcall("papers", n).updateData(keys, data)
             updts += 1
         else:
             data["xml_source"] = gz
             DBcall("papers", n).loadData([data])
             insrts += 1
         p_data = {}
         data = {}
         ca_list = []
         kw_list = []
         n += 1
 nn += 1
 print str(datetime.now())[:-7] + " : finished updating " + str(nn) + "th " + xmL + " with " + str(updts) + " updates and " + str(insrts) + " inserts"
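The find-then-update-or-insert sequence above takes two round trips per article and can race under concurrent loaders. If the store behind DBcall is MongoDB, the same effect is available atomically as an upsert; a minimal pymongo sketch (client setup and collection name assumed):

from pymongo import MongoClient

papers = MongoClient()["pubmed_db"]["papers"]  # assumed names
papers.update_one({"pubmed": data["pubmed"]}, {"$set": data}, upsert=True)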
Example #5
def main():
    flds = ["first_name", "last_name", "initials", "af_place", "af_country"]
    print(str(datetime.now())[:-7] + " : extract data")
    keys = {"pubmed": {'$gt': 0}}
    # or findDatalistSorted(keys, "pubmed", 1).limit(1000)
    papers = DBcall("papers", 0).findDatalist(keys).limit(10)
    sp = {}
    pps = []
    for p in papers:
        pps.append(p["pubmed"])
        if len(p["authors"]) > 0:
            sp[p["pubmed"]] = p["authors"]
    # authors with their papers (more than 1 paper):
    print(str(datetime.now())[:-7] + " : start data linking")
    pubs_to_author = idAuthor(sp, flds)
    """
    At this point the author list may contain entries that differ in some
    fields but share the same first_name, last_name and initials:
    merge such authors, keeping the longest metadata, when they appear under
    the same name (first name, last name, initials) in the same paper list
    (name + affiliation): the same metadata can also be written differently
    when it comes from different encodings.
    """
    edges = []
    adata = dict(pubs_to_author)
    print(len(pubs_to_author))
    for nm, pbs in pubs_to_author.items():
        for p in pbs:
            for c_nm, c_pbs in pubs_to_author.items():
                # if the same paper p is also assigned to another author...
                if p in c_pbs and nm != c_nm:
                    # ...and the metadata differ but the name is the same:
                    nm_lst = nm.split(";")[:2]
                    c_nm_lst = c_nm.split(";")[:2]
                    if nm_lst == c_nm_lst:
                        # keep the entry with the longer metadata string
                        if len(nm) >= len(c_nm):
                            adata.pop(c_nm, None)
                        else:
                            adata.pop(nm, None)

    a_to_pubs, authors_w_p = AuthorsToPubs(adata, flds)

    print(list(set(pubs_to_author.keys()) - set(adata.keys())))

    e_tuples = MakeGraphData(a_to_pubs)

    aus = len(authors_w_p)
    print(str(datetime.now())[:-7] + " : " + str(aus) + " authors")
    print(str(datetime.now())[:-7] + " : end linking")

    count_n = len(e_tuples)
    G = nx.Graph()
    # candidate integer grid positions for node placement
    seed = [(x, y) for x in range(count_n) for y in range(1, count_n)]
    nodes = {}
    for node, weight in e_tuples.items():
        n1, n2 = node
        for nn in node:
            if nn not in nodes:
                G.add_node(nn)
                G.node[nn]['pos'] = random.choice(seed)
                nodes[nn] = 1
        G.add_edge(n1, n2, weight=weight)
    pos = nx.get_node_attributes(G, 'pos')

    dmin = 1
    ncenter = 0
    for n in pos:
        x, y = pos[n]
        d = (x - 0.5) ** 2 + (y - 0.5) ** 2
        if d < dmin:
            ncenter = n
            dmin = d
    # p = nx.single_source_shortest_path_length(G, ncenter)
    print(G.nodes())
    # print(G.edges())
    # print G.edges()

    edge_trace = Scatter(
        x=[],
        y=[],
        line=Line(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines')

    for edge in G.edges():
        # print edge
        x0, y0 = G.node[edge[0]]['pos']
        x1, y1 = G.node[edge[1]]['pos']
        edge_trace['x'] += [x0, x1, None]
        edge_trace['y'] += [y0, y1, None]

    node_trace = Scatter(
        x=[],
        y=[],
        text=[],
        mode='markers',
        hoverinfo='text',
        marker=Marker(
            showscale=True,
            # colorscale other options
            # 'Greys' | 'Greens' | 'Bluered' | 'Hot' | 'Picnic' | 'Portland' |
            # 'Jet' | 'RdBu' | 'Blackbody' | 'Earth' | 'Electric' | 'YIOrRd'
            colorscale='YIGnBu',
            reversescale=True,
            color=[],
            size=10,
            colorbar=dict(
                thickness=15,
                title='Node Connections',
                xanchor='left',
                titleside='right'
            ),
            line=dict(width=2)))

    for node in G.nodes():
        x, y = G.node[node]['pos']
        node_trace['x'].append(x)
        node_trace['y'].append(y)
    print "Xs:" + str(len(node_trace['x'])) + " coord: " + str(node_trace['x'][4])
    print "Ys:" + str(len(node_trace['y'])) + " coord: " + str(node_trace['y'][4])
    # for gn in G.nodes():
    #     print gn, G.neighbors(gn)
    # >>> G.add_path([0,1,2,3])
    # >>> [(n,nbrdict) for n,nbrdict in G.adjacency_iter()]
    # [(0, {1: {}}), (1, {0: {}, 2: {}}), (2, {1: {}, 3: {}}), (3, {2: {}})]
    # print enumerate(G.adjacency_list())
    for node, adjacencies in G.adjacency_iter():
        node_trace['marker']['color'].append(len(adjacencies))
        node_info = str(node) + ' # of connections: '+str(len(adjacencies))
        node_trace['text'].append(node_info)
    fig = Figure(data=Data([edge_trace, node_trace]),
                 layout=Layout(
                     title='<br>Network graph made with Python',
                     titlefont=dict(size=16),
                     showlegend=False,
                     hovermode='closest',
                     margin=dict(b=20, l=5, r=5, t=40),
                     annotations=[dict(
                         text="Thanks to PlotLy",
                         showarrow=False,
                         xref="paper", yref="paper",
                         x=0.005, y=-0.002)],
                     xaxis=XAxis(showgrid=False, zeroline=False, showticklabels=False),
                     yaxis=YAxis(showgrid=False, zeroline=False, showticklabels=False)))
    plotly.offline.plot(fig, filename='networkx')
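This example is written against the pre-3.0 plotly API (Scatter, Line, Marker, Data, XAxis, YAxis imported straight from plotly.graph_objs) and NetworkX 1.x (G.node, G.adjacency_iter()); both were removed in later major versions. The imports the fragment assumes look roughly like:

import random
from datetime import datetime

import networkx as nx
import plotly
from plotly.graph_objs import (Data, Figure, Layout, Line, Marker, Scatter,
                               XAxis, YAxis)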
Example #6
                    if re.search('http://orcid.org', elem.text):
                        a_data["orcid"] = elem.text
                    elif re.search('-', elem.text):
                        # already hyphenated, just prepend the URL prefix
                        a_data["orcid"] = 'http://orcid.org/' + elem.text
                    else:
                        # raw 16-character id: hyphenate into 4-char groups
                        a_data["orcid"] = ('http://orcid.org/' + elem.text[0:4] + "-" +
                                           elem.text[4:8] + "-" + elem.text[8:12] +
                                           "-" + elem.text[12:16])
                data["num_of_authors"] = len(ca_list)
            if elem.tag == "Keyword":
                kw_list.append(elem.text)
        if elem.tag == 'Author' and event == 'end':
            a_data["position"] = len(ca_list) + 1
            ca_list.append(a_data)
            a_data = {}
        if elem.tag == 'PubmedArticle' and event == 'end':
            data["authors"] = ca_list
            data["keywords"] = kw_list
            data["xml_source"] = gz
            DBcall("papers", n).loadData([data])
            p_data = {}
            data = {}
            ca_list = []
            kw_list = []
            n += 1
    nn += 1
    print(str(datetime.now())[:-7] + " : finished loading " + str(nn) + "th " + xmL)
    # with open(xmL, 'rb') as f_in, gzip.open(xmL + '.gz', 'wb') as f_out:
    #     shutil.copyfileobj(f_in, f_out)
    os.unlink(xmL)
print(str(datetime.now())[:-7] + " : done")
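The three ORCID branches above normalize an identifier into the canonical http://orcid.org/XXXX-XXXX-XXXX-XXXX URL. A more compact equivalent sketch (same behavior for the three input shapes the fragment handles):

import re

def normalize_orcid(text):
    # Return the ORCID as a full http://orcid.org/... URL.
    if text.startswith('http://orcid.org'):
        return text
    if '-' in text:
        return 'http://orcid.org/' + text
    # raw 16-character id: hyphenate into four 4-character groups
    return 'http://orcid.org/' + '-'.join(re.findall(r'.{4}', text[:16]))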
Example #7
def main():
    flds = [
        "first_name", "last_name", "initials", "af_place", "af_country",
        "orcid", "email"
    ]
    print(str(datetime.now())[:-7] + " : extract data")
    keys = {"pubmed": {'$gt': 0}}
    # or findDatalistSorted(keys, "pubmed", 1).limit(1000)
    papers = DBcall("papers", 0).findDatalist(keys).limit(10000)
    authors_to_pubmed = {}
    pps = []
    for p in papers:
        pps.append(p["pubmed"])
        if len(p["authors"]) > 0:
            authors_to_pubmed[p["pubmed"]] = p["authors"]
            # positions = []
            # for p_a in p["authors"]:
            #     positions.append(p_a["positions"])
    # authors with their papers (more than 1 paper):
    print(str(datetime.now())[:-7] + " : start data linking")
    pubs_to_author = idAuthor(authors_to_pubmed, flds)
    """
    we might have got here a list of authors where some has different fields,
    but the same firs_name, last_name, initials:
    so let's merge authors by longest metadata if they are met under the same
    name (first, last, initials) in the same paper list (name+affiliation):
    the same metadata also can be RW differently being in different encoding
    """
    edges = []
    adata = dict(pubs_to_author)
    print(len(pubs_to_author.keys()))
    for nm, pbs in pubs_to_author.iteritems():
        for p in pbs:
            for c_nm, c_pbs in pubs_to_author.iteritems():
                # if the same p in this list of pprs assigned to
                # another author...
                if p in c_pbs and nm != c_nm:
                    # if metadata are different but name is the same:
                    nm_lst = nm.split(";")[:2]
                    c_nm_lst = c_nm.split(";")[:2]
                    if nm_lst == c_nm_lst:
                        if len(nm) >= len(c_nm):
                            try:
                                del adata[c_nm]
                            except:
                                continue
                        else:
                            try:
                                del adata[nm]
                            except:
                                continue

    a_to_pubs, authors_w_p = AuthorsToPubs(adata, flds)

    print(list(set(pubs_to_author.keys()) - set(adata.keys())))

    e_tuples = MakeGraphData(a_to_pubs)
    count_n = len(e_tuples.keys())
    aus = len(authors_w_p.keys())
    print(str(datetime.now())[:-7] + " : " + str(aus) + " authors")
    print(str(datetime.now())[:-7] + " : end linking")

    G = nx.Graph()
    nodes = {}
    for node, weight in e_tuples.items():
        n1, n2 = node
        for nn in node:
            if nn not in nodes:
                G.add_node(nn)
                nodes[nn] = 1
        G.add_edge(n1, n2, weight=weight)
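Side note: G.add_edge creates missing endpoints automatically in networkx, so the nodes bookkeeping dict is only needed when extra per-node setup happens at creation time; otherwise the loop shrinks to:

for node, weight in e_tuples.items():
    G.add_edge(*node, weight=weight)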
Example #8
def main():
    G = nx.Graph()
    flds = ["first_name", "last_name", "initials", "af_place", "af_country"]
    # papers = DBcall("papers", 0).findDatalistSorted(keys, "pubmed", 1).limit(1000)
    print(str(datetime.now())[:-7] + " : extract data")
    keys = {"pubmed": {'$gt': 0}}
    papers = DBcall("papers", 0).findDatalist(keys).limit(100)
    sp = {}
    pps = []
    for p in papers:
        pps.append(p["pubmed"])
        if len(p["authors"]) > 0:
            sp[p["pubmed"]] = p["authors"]
    # authors with their papers (more than 1 paper):
    print(str(datetime.now())[:-7] + " : start data linking")
    pubs_to_author = idAuthor(sp, flds)
    """
    we might have got here a list of authors where some has different fields,
    but the same firs_name, last_name, initials:
    merge authors by longest metadata if they are met under the same name
    (first, last name, initials) in the same paper list (name + affiliation):
    the same metadata also can be RW differently being in different encoding
    """
    edges = []
    adata = dict(pubs_to_author)
    print len(pubs_to_author.keys())
    for nm, pbs in pubs_to_author.iteritems():
        for p in pbs:
            for c_nm, c_pbs in pubs_to_author.iteritems():
                # if in the same paper in a list of paper assigned to another author...
                if p in c_pbs and nm != c_nm:
                    # if metadata are different but name is the same:
                    nm_lst = nm.split(";")[:2]
                    c_nm_lst = c_nm.split(";")[:2]
                    if nm_lst == c_nm_lst:
                        if len(nm) >= len(c_nm):
                            try:
                                del adata[c_nm]
                            except:
                                continue
                        else:
                            try:
                                del adata[nm]
                            except:
                                continue
    a_to_pubs = {}
    print(len(adata))
    # put author data in the DB with an assigned id and a list of papers
    for num, au in enumerate(adata):
        adata_nums = {"assigned_id": num + 1}
        meta = {}
        for f, fld in enumerate(flds):
            meta[fld] = au.split(";")[f]
        adata_nums["meta"] = meta
        adata_nums["papers"] = adata[au]
        # link all co-authors with their assigned ids in the paper:
        for p in adata[au]:
            a_to_pubs.setdefault(p, []).append(adata_nums["assigned_id"])

        DBcall("authors", num).loadData([adata_nums])

    print(list(set(pubs_to_author.keys()) - set(adata.keys())))

    keys = {"assigned_id": {'$gt': 0}}
    authors = DBcall("authors", 0).findDatalist(keys)
    e_tuples = {}
    for au in authors:
        # every other author on one of this author's papers becomes an edge
        for p in au["papers"]:
            for a in a_to_pubs[p]:
                if a != au["assigned_id"]:
                    edge = (a, au["assigned_id"])
                    e_tuples[edge] = e_tuples.get(edge, 0) + 1

    for node, weight in e_tuples.items():
        n1, n2 = node
        G.add_nodes_from(node)
        G.add_edge(n1, n2, weight=weight)
    print("components: ")
    print(list(nx.connected_components(G)))
    # print("degree: ")
    # print(nx.degree(G))
    # draw with node ids:
    nx.draw_networkx(G)
    # pos = nx.spring_layout(G)
    # nx.draw_networkx_nodes(G, pos=pos, nodelist=G.nodes())
    # nx.draw_networkx_edges(G, pos=pos, edgelist=G.edges())
    plt.show()
    # print(list(set(sp.keys()) - set(a_to_pubs.keys())))
    # for pp, au in a_to_pubs.items():
    #     print(pp, au)
    print(str(datetime.now())[:-7] + " : " + str(len(a_to_pubs)) + " authors added")
    print(str(datetime.now())[:-7] + " : end linking")
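The get-or-initialize counting used for e_tuples here (and the try/except version in the other examples) is exactly what collections.Counter provides; an equivalent sketch of the edge-weight accumulation:

from collections import Counter

e_tuples = Counter()
for au in authors:
    for p in au["papers"]:
        for a in a_to_pubs[p]:
            if a != au["assigned_id"]:
                e_tuples[a, au["assigned_id"]] += 1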
Example #9
                        date = datetime.strptime(
                            childs["Year"] + "-" + childs["Month"] + "-" +
                            childs["Day"], "%Y-%m-%d")
                    except ValueError:
                        day = int(childs["Day"]) - 2
                        date = datetime.strptime(
                            childs["Year"] + "-" + str(month) + "-" + str(day),
                            "%Y-%m-%d")
                        putLog(
                            "Date format was wrong for pid (pid = record number), corrected by -2 days",
                            n, "XML parsing " + gz, "format")
                    if v == "received":
                        data["received_date"] = date
                    if v == "accepted":
                        data["accepted_date"] = date
                if k == "IdType" and v == "pubmed" and elem.tag == "ArticleId":
                    data["pubmed"] = 0
                    if elem.text is not None:
                        pubmed = int(elem.text)
        if elem.tag == 'PubmedArticle' and event == 'end':
            keys = {"pubmed": pubmed}
            DBcall("papers", n).updateData(keys, data)
            data = {}
            pubmed = 0
            n += 1
    nn += 1
    print(str(datetime.now())[:-7] + " : finished updating " + str(nn) + "th " + xmL)
    os.unlink(xmL)
print(str(datetime.now())[:-7] + " : done")
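Subtracting a flat 2 days is a blunt correction for out-of-range dates such as February 30. A safer sketch using the standard calendar module (reusing the fragment's childs dict) clamps the day to the month's real length:

import calendar
from datetime import datetime

year, month = int(childs["Year"]), int(childs["Month"])
last_day = calendar.monthrange(year, month)[1]  # days in that month
day = min(int(childs["Day"]), last_day)
date = datetime(year, month, day)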
Example #10
nn = 0
for gz in sorted(gzips)[:110]:
    values = {}
    keys = {}
    xmL = xmLpath + gz
    with gzip.open(xmL, 'rb') as f:
        xmL = xmL[:-3]
        with open(xmL, "wb") as ff:
            try:
                ff.write(f.read())
            except Exception as e:
                print("failed to unpack " + gz + ", exception: " + str(e))
                continue
    values["xml_source"] = gz
    print(str(datetime.now())[:-7] + " : starting to update data from " + xmL)
    for event, elem in etree.iterparse(xmL,
                                       events=('start', 'end', 'start-ns',
                                               'end-ns')):
        childs = {"Year": "1000", "Month": "1", "Day": "1"}
        if event == 'start':
            for k, v in elem.attrib.items():
                if k == "IdType" and v == "pubmed":
                    keys["paper.pubmed"] = elem.text
        if elem.tag == 'PubmedArticle' and event == 'end':
            DBcall("papers", nn).updateData(keys, values)
    nn += 1
    print(str(datetime.now())[:-7] + " : finished updating from " + str(nn) +
          "th " + xmL)
    os.unlink(xmL)
print(str(datetime.now())[:-7] + " : done")
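One caveat with iterparse over large PubMed dumps: parsed elements stay in memory until cleared, so a pass over a full file grows unbounded. The usual fix is to clear each element once its 'end' event has been handled, e.g.:

for event, elem in etree.iterparse(xmL, events=('start', 'end')):
    if elem.tag == 'PubmedArticle' and event == 'end':
        DBcall("papers", nn).updateData(keys, values)
        elem.clear()  # drop the finished subtree so memory stays flat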