Example #1
def centralities(g, user_map):
    """Use graph_tool to calculate 7 centralities."""
    # degrees
    # in degree
    ki = g.degree_property_map('in')
    # out degree
    ko = g.degree_property_map('out')
    # weighted in degree
    si = g.degree_property_map('in', weight=g.ep['weight'])
    # weighted out degree
    so = g.degree_property_map('out', weight=g.ep['weight'])
    # pagerank
    pr = gt.pagerank(g)
    # betweenness
    vb, eb = gt.betweenness(g)
    # eigenvector
    e, ev = gt.eigenvector(g)
    # screen_name
    screen_name = user_map.loc[g.vp['raw_id'].a.copy()].values
    df = pd.DataFrame(
        dict(screen_name=screen_name,
             in_degree=ki.a,
             out_degree=ko.a,
             weighted_in_degree=si.a,
             weighted_out_degree=so.a,
             page_rank=pr.a,
             betweenness=vb.a,
             eigenvector=ev.a))
    df.to_csv('centralities.raw.csv')
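The function writes its results to centralities.raw.csv rather than returning the DataFrame. A minimal driver sketch; the toy graph and the user_map layout are illustrative assumptions, while the raw_id and weight property names come from the function itself:

import graph_tool.all as gt
import pandas as pd

g = gt.Graph(directed=True)
g.add_vertex(3)
g.vp['raw_id'] = g.new_vertex_property('int', vals=[0, 1, 2])
g.ep['weight'] = g.new_edge_property('double')
for s, t, w in [(0, 1, 2.0), (1, 2, 1.0), (2, 0, 0.5)]:
    g.ep['weight'][g.add_edge(s, t)] = w

user_map = pd.Series(['alice', 'bob', 'carol'])  # index plays the role of raw_id
centralities(g, user_map)  # writes centralities.raw.csv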
Example #2
def graph_measures(graph: gt.Graph) -> pd.DataFrame:
    def get_attrs(attrs):
        # attrs is (vertex, (key, (values, prop))); reorder to (key, prop, vertex)
        return (attrs[1][0], attrs[1][1][1], attrs[0])

    def append_val(key, prop, v):
        # record the value of property `prop` at vertex v under `key`
        measures[key][0].append(prop[v])

    _, vp_authority, vp_hub = gt.hits(graph)

    measures = {
        key: ([], prop)
        for key, prop in {
            'tp_group': graph.vp.group_name,
            'tp_author': graph.vp.username,
            'tn_degree_in': graph.degree_property_map('in'),
            'tn_degree_out': graph.degree_property_map('out'),
            'tn_degree_total': graph.degree_property_map('total'),
            'tn_pagerank': gt.pagerank(graph),
            'tn_betweenness': gt.betweenness(graph)[0],
            'tn_closeness': gt.closeness(graph),
            'tn_eigenvector': gt.eigenvector(graph)[1],
            'tn_authority': vp_authority,
            'tn_hub': vp_hub,
            'tn_lcc': gt.local_clustering(graph)
        }.items()
    }

    # pair every vertex with every measure and collect its value
    for attrs in product(graph.vertices(), measures.items()):
        append_val(*get_attrs(attrs))

    return pd.DataFrame(
        dict(map(lambda item: (item[0], item[1][0]),
                 measures.items()))).fillna(0)
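The vertex-by-measure product above can be flattened: every numeric property map exposes its values as an ndarray through .a, so the same DataFrame can be assembled directly. A sketch of an equivalent construction, assuming an unfiltered graph with the same property names (this is not the author's code):

def graph_measures_flat(graph: gt.Graph) -> pd.DataFrame:
    _, vp_authority, vp_hub = gt.hits(graph)
    data = {
        'tp_group': [graph.vp.group_name[v] for v in graph.vertices()],
        'tp_author': [graph.vp.username[v] for v in graph.vertices()],
        'tn_degree_in': graph.degree_property_map('in').a,
        'tn_degree_out': graph.degree_property_map('out').a,
        'tn_degree_total': graph.degree_property_map('total').a,
        'tn_pagerank': gt.pagerank(graph).a,
        'tn_betweenness': gt.betweenness(graph)[0].a,
        'tn_closeness': gt.closeness(graph).a,
        'tn_eigenvector': gt.eigenvector(graph)[1].a,
        'tn_authority': vp_authority.a,
        'tn_hub': vp_hub.a,
        'tn_lcc': gt.local_clustering(graph).a,
    }
    return pd.DataFrame(data).fillna(0)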
Example #3
    def findBestChilds(self, nodes, k=4):
        stateGraph = gt.Graph()
        node_list = stateGraph.new_vertex_property("string")
        node_parents = stateGraph.new_edge_property("object")

        for node in nodes:
            self.addDirectedLink(node, nodes, stateGraph, node_list, node_parents)

        important_v = set()
        try:
            #self.logger.debug (len(stateGraph))
            h = gt.pagerank(stateGraph)

            vertices = dict()
            for vertex in stateGraph.vertices():
                vertices[stateGraph.vertex_index[vertex]] = h[vertex]

            res = list(sorted(vertices, key=vertices.__getitem__, reverse=True))

            # keep the k best-ranked vertices, skipping vertices 0 and 1
            # (the root nodes); list.remove would raise if they were absent
            important = [i for i in res[:k] if i > 1]
            for i in important:
                important_v.add(stateGraph.vertex(i))
        except Exception:
            self.logger.error('Graph is empty')
            self.logger.error(sys.exc_info())

        # node_list holds strings, so discarding the ints 0 and 1 was a no-op
        return list(set(node_list[i] for i in important_v))
Example #4
def calc_pagerank(g: gt.Graph) -> List[Tuple[int, str, float]]:
    """
    Return: sorted list of tuples, [(vertex_idx, wk_title, pagerank_value), ....]
    """
    vp_label = g.vp['_graphml_vertex_id']  # same as wktitle
    pr = gt.pagerank(g)
    ranks = [(g.vertex_index[v], vp_label[v], pr[v]) for v in g.vertices()]
    ranks = sorted(ranks, key=lambda e: -e[-1])
    return ranks
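A usage sketch; the file name is hypothetical, and _graphml_vertex_id is the property graph-tool fills in when loading GraphML, as the function above assumes:

g = gt.load_graph('wiki.graphml')  # hypothetical file
for idx, title, score in calc_pagerank(g)[:10]:
    print(f'{score:.5f}  {title}')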
Example #5
def add_pagerank():
    pr = gt.pagerank(g,damping=0.5)
    with open("all_AAN_with_fellows.csv","r") as old,\
        open("all_AAN_with_fellows_and_pagerank.csv", "w+") as new:
        header = next(old)
        new.write(header.strip() + ",pagerank\n")
        for line in old:
            old_data = line.strip().split(",")
            gt_index = int(old_data[0])
            new_data = line.strip() + "," + str(pr.a[gt_index]) + "\n"
            new.write(new_data)
Example #6
def add_pagerank():
    g = gt.load_graph(
        "/home/mrunelov/KTH/exjobb/SICS-cite/APS/data/APS.graphml")
    pr = gt.pagerank(g, damping=0.5)
    with open("all_APS_with_fellows.csv","r") as old,\
        open("all_APS_with_fellows_and_pagerank.csv", "w+") as new:
        header = next(old)
        new.write(header.strip() + ",pagerank\n")
        for line in old:
            old_data = line.strip().split(",")
            gt_index = int(old_data[0])
            new_data = line.strip() + "," + str(pr.a[gt_index]) + "\n"
            new.write(new_data)
Example #7
 def get_pagerank_values(self):
     start = time.time()
     logger.info('Started call to get_pagerank')
     g = Graph()
     vp = g.add_edge_list(self.__v.get_graph_edges(),
                          hashed=True,
                          hash_type='int')
     logger.info('Delta time to build graph: {}s'.format(
         timedelta(seconds=(time.time() - start))))
     start = time.time()
     ranks = pagerank(g)
     logger.info('Delta time to compute pagerank: {}s'.format(
         timedelta(seconds=(time.time() - start))))
     for vertex in g.vertices():
         qid = vp[vertex]
         r = ranks[vertex]
         yield qid, r
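A standalone sketch of the same pattern: build the graph from raw integer ids with hashed=True, then read the ids back through the property map that add_edge_list returns. The edge list here is illustrative:

edges = [(101, 202), (202, 303), (303, 101)]  # hypothetical raw ids
g = Graph(directed=True)
vp = g.add_edge_list(edges, hashed=True, hash_type='int')
ranks = pagerank(g)
for v in g.vertices():
    print(vp[v], ranks[v])  # raw id and its PageRank score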
Example #8
    def findBestChilds(self, nodes, k=4):
        stateGraph = gt.Graph()
        node_list = stateGraph.new_vertex_property("string")
        node_parents = stateGraph.new_edge_property("object")

        for node in nodes:
            self.addDirectedLink(node, nodes, stateGraph, node_list,
                                 node_parents)

        important_v = set()
        try:
            #self.logger.debug (len(stateGraph))
            h = gt.pagerank(stateGraph)

            vertices = dict()
            for vertex in stateGraph.vertices():
                vertices[stateGraph.vertex_index[vertex]] = h[vertex]

            res = list(sorted(vertices, key=vertices.__getitem__,
                              reverse=True))

            # keep the k best-ranked vertices, skipping vertices 0 and 1
            # (the root nodes); list.remove would raise if they were absent
            important = [i for i in res[:k] if i > 1]
            for i in important:
                important_v.add(stateGraph.vertex(i))
        except Exception:
            self.logger.error('Graph is empty')
            self.logger.error(sys.exc_info())

        # node_list holds strings, so discarding the ints 0 and 1 was a no-op
        return list(set(node_list[i] for i in important_v))
Example #9
def pr_curves():
    plt.figure().set_facecolor('white')

    lra = logit()

    in_degs_gt = g.degree_property_map("in")
    in_degs = in_degs_gt.a.astype("float")
    in_degs = in_degs / in_degs.max()

    with open("burst_list.pickle", "rb") as f:
        bla = np.asarray(pickle.load(f)).astype("float")
        bla /= bla.max()

    with open("vpa-between2.pickle", "rb") as f:
        ba = np.asarray(pickle.load(f))
        ba /= ba.max()

    # compute sqrt(bla * ba) on a fresh array; the original aliased bla and
    # then scaled it in place, corrupting the burst scores
    geometric_mean = np.sqrt(bla * ba)

    with open("Px_list.pickle", "rb") as f:
        pxa = np.asarray(pickle.load(f)).astype("float")
        pxa_n = pxa / pxa.max()

    with open("Px_list_weighted.pickle", "rb") as f:
        pxwa = np.asarray(pickle.load(f)).astype("float")
        pxwa_n = pxwa / pxwa.max()

    pr = gt.pagerank(g, damping=0.5)

    plots = [[] for _ in range(8)]
    print "Calculating for num_top = " + str(num_top)
    plots[0] = find_fellows_in_top_scores(in_degs,
                                          "indegree",
                                          num_top,
                                          printstuff=False)
    plots[1] = find_fellows_in_top_scores(pr.a,
                                          "PageRank alpha 0.5",
                                          num_top,
                                          printstuff=False)
    plots[2] = find_fellows_in_top_scores(ba,
                                          "betweenness",
                                          num_top,
                                          printstuff=False)
    plots[3] = find_fellows_in_top_scores(pxa_n,
                                          "progeny size",
                                          num_top,
                                          printstuff=False)
    plots[4] = find_fellows_in_top_scores(pxwa_n,
                                          "Weighted progeny size",
                                          num_top,
                                          printstuff=False)
    plots[5] = find_fellows_in_top_scores(lra,
                                          "All with logit coefficients",
                                          num_top,
                                          printstuff=False)
    plots[6] = find_fellows_in_top_scores(geometric_mean,
                                          "sqrt(between*burst)",
                                          num_top,
                                          printstuff=False)
    geometric_mean *= np.sqrt(in_degs)
    plots[7] = find_fellows_in_top_scores(geometric_mean,
                                          "sqrt(between*burst*indegs)",
                                          num_top,
                                          printstuff=False)

    #print "Plotting..."
    # Precision-Recall
    #for p in plots:
    #plt.plot(*zip(*p),linewidth=2.0)
    #ax = plt.subplot()
    #total_fellow_articles = len(fellow_indexes)
    ##total_fellow_articles = 139755 # manual total within 1980 filter
    #y_random = float(total_fellow_articles) / num_top #527130
    #ax.plot([0.0,1.0],[y_random,y_random],ls="--",c="0.5",linewidth=2.0)
    #leg = plt.legend([r'$\mathrm{Indegree}$', r'$\mathrm{Betweenness}$', r'$\mathrm{Backbone\/ progeny\/ size}$', r'$\mathrm{Logit}$', r'$\sqrt{\mathrm{betweenness}\times\/\mathrm{burstness}}$',r'$\sqrt{\mathrm{betweenness}\times\/\mathrm{burstness}\times\/\mathrm{indegree}}$',r'$\mathrm{PageRank,\/} \alpha=0.5$',r'$\mathrm{Random\/ retrieval}$'], loc='best',fontsize=18)
    #for obj in leg.legendHandles:
    #obj.set_linewidth(4.0)

    #plt.xlabel(r'$\mathrm{Recall}$',fontsize=24)
    #plt.ylabel(r'$\mathrm{Precision}$',fontsize=24)
    #plt.show()

    # Precision @ n
    lss = ["-"] * 8
    lss[0] = "--"
    lss[1] = "--"
    clr = ['b', 'k', 'g', 'r', 'r', 'c', 'm', 'y']
    if num_top <= 1000:
        xs = range(10, num_top + 1, 10)
    #plt.tick_params(labelsize=18)
    #plt.figure(2)
    #plt.title(r"$\mathrm{Precision\/ @\/ X}$")
    plt.title(r"$\mathrm{DCG\/ @\/ X}$")
    plt.figure().set_facecolor('white')
    if num_top <= 1000:
        plt.xlabel(r'$\mathrm{@}$', fontsize=32)
    else:
        plt.xlabel(r'$\mathrm{Recall}$', fontsize=32)
    plt.ylabel(r'$\mathrm{Precision}$', fontsize=32)
    #plt.ylabel(r'$\mathrm{DCG}$',fontsize=24)
    if use_cutoff:
        total_fellow_articles = 139755  # manual total within 1980 filter
        y_random = float(
            total_fellow_articles) / 427735  #527129 # 427735 is for cutoff
    else:
        total_fellow_articles = len(fellow_indexes)
        y_random = float(total_fellow_articles) / 527129

    i = 0
    for p in plots[:2]:
        if num_top > 1000:
            xs = [point[0] for point in p]
        #ys = [point[1] for point in p]
        ys = p  # for DCGs
        plt.plot(xs, ys, linewidth=2.0, ls=lss[i], color=clr[i])
        i += 1
    ax = plt.subplot()
    ax.tick_params(labelsize=18)
    if num_top <= 1000:
        ax.plot([0.0, num_top], [y_random, y_random],
                ls="--",
                c="0.5",
                linewidth=2.0)
    else:
        ax.plot([0.0, 1], [y_random, y_random],
                ls="--",
                c="0.5",
                linewidth=2.0)
    for p in plots[2:]:
        if num_top > 1000:
            xs = [point[0] for point in p]
        #ys = [point[1] for point in p]
        ys = p  # for DCGs
        if i == 4:
            plt.plot(xs,
                     ys,
                     linewidth=2.0,
                     ls=lss[i],
                     color=clr[i],
                     marker='D')
        else:
            plt.plot(xs, ys, linewidth=2.0, ls=lss[i], color=clr[i])
        i += 1
    leg = plt.legend([
        r'$\mathrm{Indegree}$', r'$\mathrm{PageRank,\/} \alpha=0.5$',
        r'$\mathrm{Random\/ retrieval}$', r'$\mathrm{Betweenness}$',
        r'$\mathrm{Backbone\/ progeny\/ size}$',
        r'$\mathrm{Weighted\/ backbone\/ progeny\/ size}$',
        r'$\mathrm{Logit}$',
        r'$\sqrt{\mathrm{betweenness}\times\/\mathrm{burstness}}$',
        r'$\sqrt{\mathrm{betweenness}\times\/\mathrm{burstness}\times\/\mathrm{indegree}}$'
    ],
                     loc='upper right',
                     fontsize=24)
    for obj in leg.legendHandles:
        obj.set_linewidth(2.0)
    plt.show()
Example #10
def process(name, g):
    # Properties
    vp_pos = gt.sfdp_layout(g)
    vp_deg = g.degree_property_map('total')
    vp_deg_log = g.new_vp('double')
    vp_deg_log.a = np.log10(vp_deg.a)
    vp_cls = gt.closeness(g)
    vp_page = gt.pagerank(g)
    vp_btw, ep_btw = gt.betweenness(g, norm=False)

    # Colormaps
    for cmap in [
            'viridis', 'plasma', 'inferno', 'YlGnBu', 'Blues', 'Greys',
            'Greens', 'Oranges'
    ]:
        draw_graph(g,
                   vp_pos,
                   f'{name}.prop=deg.color={cmap}.png',
                   vp_color=vp_deg,
                   vcmap=cmap)
        draw_graph(g,
                   vp_pos,
                   f'{name}.prop=deg_log.color={cmap}.png',
                   vp_color=vp_deg_log,
                   vcmap=cmap)
        draw_graph(g,
                   vp_pos,
                   f'{name}.prop=cls.color={cmap}.png',
                   vp_color=vp_cls,
                   vcmap=cmap)
        draw_graph(g,
                   vp_pos,
                   f'{name}.prop=page.color={cmap}.png',
                   vp_color=vp_page,
                   vcmap=cmap)
        draw_graph(g,
                   vp_pos,
                   f'{name}.prop=btw.color={cmap}.png',
                   vp_color=vp_btw,
                   vcmap=cmap)

    # Construct dicts for D3-style JSON
    nodes = []
    for u in g.vertices():
        p = vp_pos[u]
        nodes.append({
            'x': p[0],
            'y': p[1],
            'deg': vp_deg[u],
            'deg_log': vp_deg_log[u],
            'cls': vp_cls[u],
            'page': vp_page[u],
            'btw': vp_btw[u],
        })

    vp_idx = g.vertex_index
    links = [{
        'source': vp_idx[e.source()],
        'target': vp_idx[e.target()],
    } for e in g.edges()]

    # Save D3 style JSON
    d = {'nodes': nodes, 'links': links}
    with open(f'{name}.json', 'w') as f:
        json.dump(d, f)
Example #11
#import packages
import graph_tool.all as gt

#------------------------------------------------------
#Variables:
NETWORK_FILE = ""
NETWORK_FEATURE_FILE = ""
#------------------------------------------------------

#import the graph
g = gt.load_graph(NETWORK_FILE)
#calculate the features using inbuilt graph_tool functions

#Pagerank
rank = gt.pagerank(g)
print("pagerank has been calculated")

#HITS y-hubs and x-authorities
eigenvalue, xauthorities, yhubs = gt.hits(g)
print("HITS values have been calculated")

#betweenness centrality
between_vp, between_ep = gt.betweenness(g)
print("betweenness centrality has been calculated")

#save external to internal property maps
#this makes the features accessible in the future when loading the graph
g.vertex_properties["page_rank"] = rank
g.vertex_properties["x_authorities"] = xauthorities
g.vertex_properties["y_hubs"] = yhubs
g.vertex_properties["betweenness"] = between_vp
#persist the graph so the stored property maps can actually be reloaded later
#(NETWORK_FEATURE_FILE is declared above but was never used in the original)
g.save(NETWORK_FEATURE_FILE)
Example #12
    def iterateMatrix(self, blacklist=set(), additionalRes=set(), kp=75):
        """Iteration phase,
        During this phase the children of the current bottom level nodes are fetched and added to the hashed set.
        
        **Parameters**
    
        blacklist : set, optional (default = empty)
            set of resources predicates to exclude from the pathfinding algorithm
        
        additionalResources : set, optional (default = empty)
            set of resources to include anyway in the next iteration
    
        **Returns**
        
        response : stateGraph
            contains the updated stategraph after fetching new resources
        """
        self.logger.info('--- NEW ITERATION ---')
        #self.logger.info ('Existing resources {0}'.format(str(len(self.resources.property_list()))))
        #self.logger.info ('Indexed resources by parents {0}'.format(str(len(self.resources_by_parent))))
        self.logger.info('Grandmother: {0}'.format(
            self.resources[self.stateGraph.vertex(0)]))
        self.logger.info('Grandfather: {0}'.format(
            self.resources[self.stateGraph.vertex(1)]))
        self.logger.info('--- --- ---')

        start = time.perf_counter()
        prevResources = set()
        additionalResources = dict()
        i = 0
        for v in self.stateGraph.vertices():
            i += 1
            if v not in self.added and v not in self.unimportant:
                prevResources.add(self.resources[v])

        #print('unimportant')
        #print(len(self.unimportant))
        #print('previous')
        #print(len(prevResources))
        #print(prevResources)
        #print('new')
        #print(len(self.added - prevResources))
        #print(self.added)
        #print('added')
        #print(len(self.added))
        #print('total')
        #print(i)

        #self.worker.startQueue(self.resourceretriever.fetchResource, num_of_threads=32)

        #if len(additionalRes) == 0:

        #    for resource in prevResources:
        #        self.added.add(resource)
        #        item = [resource, additionalResources, blacklist]
        #        self.worker.queueFunction(self.resourceretriever.fetchResource, item)

        #    self.worker.waitforFunctionsFinish(self.resourceretriever.fetchResource)

        #else:
        #    self.logger.info('Special search iteration: Deep search')
        #    for resource in additionalRes:
        #        self.added.add(resource)
        #        item = [resource, additionalResources, blacklist]
        #        self.worker.queueFunction(self.resourceretriever.fetchResource, item)

        #    self.worker.waitforFunctionsFinish(self.resourceretriever.fetchResource)

        reqs = list()

        if len(additionalRes) == 0:

            for resource in prevResources:
                self.added.add(resource)
            for url in self.resourceretriever.genMultiUrls(prevResources):
                reqs.append(url)

        else:
            self.logger.info('Special search iteration: Deep search')
            for resource in additionalRes:
                self.added.add(resource)
            for url in self.resourceretriever.genMultiUrls(additionalRes):
                reqs.append(url)

        if len(reqs) > 0:
            resps = list()
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=2) as executor:
                for res in reqs:
                    # Start the load operations and mark each future with its URL
                    future_to_url = {
                        executor.submit(requests.get, url): url
                        for url in res['urls']
                    }
                    for future in concurrent.futures.as_completed(
                            future_to_url):
                        url = future_to_url[future]
                        try:
                            response = dict()
                            response['resources'] = res['resources']
                            response['results'] = future.result()
                            resps.append(response)
                        except Exception as exc:
                            self.logger.error('%r generated an exception: %s' %
                                              (url, exc))
                        else:
                            self.logger.debug('retrieved results for %r' %
                                              (url))
                #todo move http gets in threads vs async grequests

            self.worker.startQueue(self.resourceretriever.processMultiResource,
                                   num_of_threads=64)

            for rp in resps:
                #for rp in res['urls']:
                item = [
                    rp['resources'], rp['results'], self.resources_by_parent,
                    additionalResources, blacklist
                ]
                self.worker.queueFunction(
                    self.resourceretriever.processMultiResource, item)

            self.worker.waitforFunctionsFinish(
                self.resourceretriever.processMultiResource)

        #toAddResources = list(additionalResources.keys() - prevResources)
        #print('to add resources')
        #print(len(toAddResources))
        #toAddResources = filter(resourceretriever.isResource, toAddResources)

        gc.collect()

        #self.logger.info('Updated indexed resources with parents {0}'.format(str(len(self.resources_by_parent.list_properties()))))

        self.logger.info('Total resources: %s' % str(len(prevResources)))

        self.checked_resources += len(additionalResources)

        halt1 = time.perf_counter()
        self.logger.info('resource gathering: %s' % str(halt1 - start))
        #print ('resource gathering: %s' % str(halt1 - start))
        #self.stateGraph = gt.Graph()
        #vlist = self.stateGraph.add_vertex(len(toAddResources))
        #[self.buildGraph(ri, self.stateGraph) for ri in ris]
        #gt.graph_draw(self.stateGraph, vertex_text=self.stateGraph.vertex_index, vertex_font_size=10,
        #           output_size=(800,800), output="two-nodes.pdf")

        [
            self.addDirectedLink(res, additionalResources, self.stateGraph)
            for res in prevResources
        ]

        halt2 = time.perf_counter()
        self.logger.info('graph construction: %s' % str(halt2 - halt1))
        #print ('graph construction: %s' % str(halt2 - halt1))
        #For next iteration, e.g. if no path was found
        #Check for singular values to reduce dimensions of existing resources
        #gt.graph_draw(self.stateGraph, vertex_text=self.stateGraph.vertex_index, vertex_font_size=10,
        #           output_size=(800,800), output="two-nodes.pdf")
        #pathExists = self.graph.pathExists(self)
        #self.logger.debug('path exists: %s' % pathExists)
        self.logger.debug('current iteration: %s' % self.iteration)
        if self.iteration > 1:
            try:
                self.logger.info('reducing matrix')
                #print ('reducing matrix, max important nodes')
                #self.logger.debug (len(self.stateGraph))
                #k = np.int((1-np.divide(1,self.iteration))*500)
                #k = np.int((1-np.divide(1,self.iteration))*kp)
                k = int(kp * math.pow(1.2, self.iteration))
                #print (k)
                h = gt.pagerank(self.stateGraph)

                #h = (nx.pagerank_scipy(nx.Graph(self.stateGraph), max_iter=100, tol=1e-07))
                #h = (nx.hits_scipy(nx.Graph(self.stateGraph), max_iter=100, tol=1e-07))
                vertices = dict()
                for vertex in self.stateGraph.vertices():
                    vertices[self.stateGraph.vertex_index[vertex]] = h[vertex]
                #print(vertices)
                res = list(
                    sorted(vertices, key=vertices.__getitem__, reverse=True))
                #print (res)
                self.logger.debug(k)
                unimportant = res[k:]
                self.unimportant = set()
                for u in unimportant:
                    #Never delete grandmother and grandfather, even if they become insignificant
                    if u > 1:
                        self.unimportant.add(self.stateGraph.vertex(u))
                        #pass
                #print(self.unimportant)
                #self.stateGraph = resourceretriever_gt.removeUnimportantResources(unimportant, self.resources, self.stateGraph)
                halt3 = time.perf_counter()
                self.logger.info('rank reducing: %s' % str(halt3 - halt2))
                #self.logger.info('Updated resources amount: %s' % str(len(self.stateGraph.vertices())))
                #print('Updated resources amount: %s' % str(len(self.stateGraph.vertices())))
                #print(len(self.unimportant))
            except Exception:
                self.logger.error('Pathfinding reduction error')
                self.logger.error(sys.exc_info())
        else:
            self.logger.info('no rank reducing')

        self.logger.info('total %s' % str(time.perf_counter() - start))
        self.logger.info('=== === ===')
        self.iteration += 1
        return self.stateGraph
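The core of the rank-reduction step, distilled into a standalone sketch; the function and argument names are illustrative, not from the source:

def prune_low_rank(state_graph, keep, protected=(0, 1)):
    """Return all but the `keep` best-ranked vertices, sparing the protected roots."""
    ranks = gt.pagerank(state_graph)
    order = sorted(state_graph.vertices(), key=lambda v: ranks[v], reverse=True)
    return {v for v in order[keep:]
            if state_graph.vertex_index[v] not in protected}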
Example #13
def page_rank(g):
    # return gt_stats.vertex_hist(g, gt.pagerank(g))
    # get_array() returns the same ndarray view as the property map's .a attribute
    return gt.pagerank(g).get_array()
Example #14
for i in range(1000):
    g.add_edge(g.vertex_index[random.randint(50, 90)],
               g.vertex_index[random.randint(50, 90)])

# v_prop = g.new_vertex_property("string")
# v_prop[g.vertex_index[0]] = 'fooxxx'
# v_prop[g.vertex_index[1]] = 'bar'
# v_prop[g.vertex_index[2]] = 'bazxxx'

# e_prop = g.new_edge_property("double")
# e_prop[g.vertex_index[0]] = 200
# e_prop[g.vertex_index[2]] = 0.04

# e_len = g.new_edge_property("double")
# e_len[e1] = 10
# e_len[e2] = 20

pr = gt.pagerank(g)
# for i in pr:
#     print (i)
graph_draw(g,
           vertex_fill_color=pr,
           vertex_font_size=2,
           vorder=pr,
           output_size=(800, 800),
           output="two-nodes.png")

vp, ep = gt.betweenness(g)
print(gt.central_point_dominance(g, vp))
Example #15
            weight_map[e] = 1. * common_votes / len(dep1[5:])
            edges[(dep1[4],dep2[4])] = [weight_map[e], dep1, dep2]  # kept for debugging

        except Exception as e:
            print(str(e))


# conventional centrality analysis

# degree
degree = g.degree_property_map('total', weight=weight_map)

# vertex betweenness
betweenness = gt.betweenness(g, weight=weight_map)

# closeness
closeness = gt.closeness(g, weight=weight_map)

# Katz
katz = gt.katz(g, weight=weight_map)

# Pagerank
pagerank = gt.pagerank(g, weight=weight_map)


# column order now matches the data order (the original header mislabeled
# the degree/betweenness/closeness columns)
metrics = ['name', 'diap', 'degree', 'betweenness', 'closeness', 'katz', 'pagerank']
df = pd.DataFrame(list(zip(vertex_to_name.values(), diap, degree.a.tolist(),
                           betweenness[0].a.tolist(), closeness.a.tolist(),
                           katz.a.tolist(), pagerank.a.tolist())),
                  columns=metrics)

df.sort_values('pagerank', ascending=True)[:30]
Example #16
def plot_random_metrics(metric):
    """
    Random mean + std plotting for indegree and PageRank
    """
    global g
    plots = []
    for i in range(1, 11):
        num = str(i)
        first = get_first(num)
        second = get_second(num)
        g1, g2 = get_gt_graphs(g, first, second)
        if metric == "indeg":
            m1 = g1.degree_property_map("in")
            m2 = g2.degree_property_map("in")
        elif metric == "pagerank":
            m1 = gt.pagerank(g1, damping=0.5)
            m2 = gt.pagerank(g2, damping=0.5)
        ys1 = find_fellows_in_top_scores(m1.a,
                                         metric,
                                         num_top,
                                         printstuff=False)
        ys1 = [p[1] for p in ys1]
        plots.append(ys1)
        ys2 = find_fellows_in_top_scores(m2.a,
                                         metric,
                                         num_top,
                                         printstuff=False)
        ys2 = [p[1] for p in ys2]
        plots.append(ys2)

    if metric == "pagerank":
        original = gt.pagerank(g, damping=0.5)
    elif metric == "indeg":
        original = g.degree_property_map("in")
    g_plot = find_fellows_in_top_scores(original.a,
                                        metric,
                                        num_top,
                                        printstuff=False)
    g_plot = [p[1] for p in g_plot]

    fig = plt.figure()
    fig.set_facecolor('white')
    plt.title(r"$\mathrm{Precision\/ @\/ X}$")
    #plt.xlabel(r'$\mathrm{Recall}$',fontsize=24)
    plt.xlabel(r'$\mathrm{@}$', fontsize=24)
    plt.ylabel(r'$\mathrm{Precision}$', fontsize=24)
    xs = range(10, num_top + 1, 10)
    #for i,p in enumerate(plots):
    #ys = p
    #plt.plot(xs,ys,linewidth=1.0,color='b',ls=":")

    if metric == "indeg":
        clr = 'b'
    elif metric == "pagerank":
        clr = 'k'
    plt.plot(xs, g_plot, color=clr, linewidth=4.0)
    plot_mean_std(xs, plots, color=clr, ls=':')
    #leg = plt.legend([r'$\mathrm{\/ progeny\/ size}$', r'$\mathrm{Backbone\/ progeny\/ size}$', r'$\mathrm{Random\/ retrieval}$'],fontsize=18,loc='best')
    ax = plt.subplot()
    if use_cutoff:
        total_fellow_articles = 139755  # manual total within 1980 filter
        y_random = float(total_fellow_articles) / 427735
    else:
        total_fellow_articles = len(fellow_indexes)
        y_random = float(total_fellow_articles) / 527129
    ax.plot([0.0, num_top], [y_random, y_random],
            ls="--",
            c="0.5",
            linewidth=4.0)
    if metric == "pagerank":
        leg = plt.legend([
            r'$\mathrm{PageRank,\/} \alpha=0.5$',
            r'$\mathrm{PageRank\/ random\/ mean,\/} \alpha=0.5$',
            r'$\mathrm{Random\/ retrieval}$'
        ],
                         fontsize=32,
                         loc='lower right')
    elif metric == "indeg":
        leg = plt.legend([
            r'$\mathrm{Indegree}$', r'$\mathrm{Indegree\/ random\/ mean}$',
            r'$\mathrm{Random\/ retrieval}$'
        ],
                         fontsize=32,
                         loc='lower right')
    for obj in leg.legendHandles:
        obj.set_linewidth(4.0)
    plt.show()
Example #17
sns.histplot(data=bw_df,
             x='betweenness',
             hue='is_generic',
             element='step',
             stat='probability',
             common_norm=False,
             ax=axarr[0])
axarr[0].set_title('Betweenness centrality of generic/non-generic genes')
axarr[0].set_xscale('log')
sns.boxplot(data=bw_df, y='betweenness', x='is_generic', ax=axarr[1])
axarr[1].set_title('Betweenness centrality of generic/non-generic genes')
axarr[1].set_yscale('log')

# In[11]:

# analyze pagerank centrality for generic vs. other genes
# pagerank treats edge weights as "importance"/"confidence" rather than cost,
# so we can use the original correlations as edge weights here
pr = gt.pagerank(G, weight=G.ep['weight'])
pr_df = (betweenness_to_df(G, pr).rename(columns={'betweenness': 'pagerank'}))
pr_df.head()
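# Standalone sketch of the cost-vs-importance distinction drawn above
# (illustrative, assuming strictly positive correlation weights): shortest-path
# measures such as betweenness want weights that behave like costs, while
# PageRank wants weights that behave like importance.
cost = G.new_edge_property('double')
cost.a = 1.0 / G.ep['weight'].a                 # invert correlations -> costs
bw_cost, _ = gt.betweenness(G, weight=cost)     # weights treated as path costs
pr_imp = gt.pagerank(G, weight=G.ep['weight'])  # weights treated as importance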

# In[12]:

sns.set({'figure.figsize': (12, 4)})
sns.set_style('whitegrid')
fig, axarr = plt.subplots(1, 2)
sns.histplot(data=pr_df,
             x='pagerank',
             hue='is_generic',
             element='step',
             stat='probability',
             common_norm=False,
             ax=axarr[0])
Example #18
    def iterateMatrix(self, blacklist=set(), additionalRes = set(),kp=75):
        """Iteration phase,
        During this phase the children of the current bottom level nodes are fetched and added to the hashed set.
        
        **Parameters**
    
        blacklist : set, optional (default = empty)
            set of resources predicates to exclude from the pathfinding algorithm
        
        additionalResources : set, optional (default = empty)
            set of resources to include anyway in the next iteration
    
        **Returns**
        
        response : stateGraph
            contains the updated stategraph after fetching new resources
        """
        self.logger.info ('--- NEW ITERATION ---')
        #self.logger.info ('Existing resources {0}'.format(str(len(self.resources.property_list()))))
        #self.logger.info ('Indexed resources by parents {0}'.format(str(len(self.resources_by_parent))))
        self.logger.info ('Grandmother: {0}'.format(self.resources[self.stateGraph.vertex(0)]))
        self.logger.info ('Grandfather: {0}'.format(self.resources[self.stateGraph.vertex(1)]))
        self.logger.info ('--- --- ---')
        
        start = time.perf_counter()
        prevResources = set()
        additionalResources = dict()
        i = 0
        for v in self.stateGraph.vertices():
            i += 1
            if v not in self.added and v not in self.unimportant:
                prevResources.add(self.resources[v])
        
        #print('unimportant') 
        #print(len(self.unimportant))       
        #print('previous')
        #print(len(prevResources))
        #print(prevResources)
        #print('new')
        #print(len(self.added - prevResources))
        #print(self.added)
        #print('added')
        #print(len(self.added))
        #print('total')
        #print(i)
        
        #self.worker.startQueue(self.resourceretriever.fetchResource, num_of_threads=32)
        
        #if len(additionalRes) == 0: 
            
        #    for resource in prevResources:
        #        self.added.add(resource)
        #        item = [resource, additionalResources, blacklist]
        #        self.worker.queueFunction(self.resourceretriever.fetchResource, item)
            
        #    self.worker.waitforFunctionsFinish(self.resourceretriever.fetchResource)
        
        #else:
        #    self.logger.info('Special search iteration: Deep search')
        #    for resource in additionalRes:
        #        self.added.add(resource)
        #        item = [resource, additionalResources, blacklist]
        #        self.worker.queueFunction(self.resourceretriever.fetchResource, item)
                
        #    self.worker.waitforFunctionsFinish(self.resourceretriever.fetchResource)
            
        
        
        reqs = list()
        
        if len(additionalRes) == 0: 
            
            for resource in prevResources:
                self.added.add(resource)
            for url in self.resourceretriever.genMultiUrls(prevResources):
                reqs.append(url)
                        
        else:
            self.logger.info('Special search iteration: Deep search')
            for resource in additionalRes:
                self.added.add(resource)
            for url in self.resourceretriever.genMultiUrls(additionalRes):
                reqs.append(url)
        
        if len(reqs) > 0: 
            resps = list()
            with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
                for res in reqs:
                    # Start the load operations and mark each future with its URL
                    future_to_url = {executor.submit(requests.get, url): url for url in res['urls']}
                    for future in concurrent.futures.as_completed(future_to_url):
                        url = future_to_url[future]
                        try:
                            response = dict()
                            response['resources'] = res['resources']
                            response['results'] = future.result()
                            resps.append(response)
                        except Exception as exc:
                            self.logger.error('%r generated an exception: %s' % (url, exc))
                        else:
                            self.logger.debug('retrieved results for %r' % (url))
                #todo move http gets in threads vs async grequests
                
            self.worker.startQueue(self.resourceretriever.processMultiResource, num_of_threads=64)    
            
            for rp in resps:
            #for rp in res['urls']:
                item = [rp['resources'], rp['results'], self.resources_by_parent, additionalResources, blacklist]
                self.worker.queueFunction(self.resourceretriever.processMultiResource, item)    
            
            self.worker.waitforFunctionsFinish(self.resourceretriever.processMultiResource)
        
        #toAddResources = list(additionalResources.keys() - prevResources) 
        #print('to add resources')
        #print(len(toAddResources))
        #toAddResources = filter(resourceretriever.isResource, toAddResources)
        
        gc.collect()
        
        #self.logger.info('Updated indexed resources with parents {0}'.format(str(len(self.resources_by_parent.list_properties()))))    
            
        self.logger.info ('Total resources: %s' % str(len(prevResources)))

        self.checked_resources += len(additionalResources)
            
        halt1 = time.perf_counter()
        self.logger.info ('resource gathering: %s' % str(halt1 - start))
        #print ('resource gathering: %s' % str(halt1 - start))
        #self.stateGraph = gt.Graph()
        #vlist = self.stateGraph.add_vertex(len(toAddResources))
        #[self.buildGraph(ri, self.stateGraph) for ri in ris]
        #gt.graph_draw(self.stateGraph, vertex_text=self.stateGraph.vertex_index, vertex_font_size=10,
        #           output_size=(800,800), output="two-nodes.pdf")
        
        [self.addDirectedLink(res, additionalResources, self.stateGraph) for res in prevResources]
                    
        halt2 = time.perf_counter()
        self.logger.info ('graph construction: %s' % str(halt2 - halt1))
        #print ('graph construction: %s' % str(halt2 - halt1))
        #For next iteration, e.g. if no path was found
        #Check for singular values to reduce dimensions of existing resources
        #gt.graph_draw(self.stateGraph, vertex_text=self.stateGraph.vertex_index, vertex_font_size=10,
        #           output_size=(800,800), output="two-nodes.pdf")
        #pathExists = self.graph.pathExists(self)
        #self.logger.debug('path exists: %s' % pathExists)
        self.logger.debug('current iteration: %s' % self.iteration)
        if self.iteration > 1:
            try:
                self.logger.info ('reducing matrix')
                #print ('reducing matrix, max important nodes')
                #self.logger.debug (len(self.stateGraph))
                #k = np.int((1-np.divide(1,self.iteration))*500)
                #k = np.int((1-np.divide(1,self.iteration))*kp)
                k = int(kp*math.pow(1.2,self.iteration))
                #print (k)
                h = gt.pagerank(self.stateGraph)

                #h = (nx.pagerank_scipy(nx.Graph(self.stateGraph), max_iter=100, tol=1e-07))
                #h = (nx.hits_scipy(nx.Graph(self.stateGraph), max_iter=100, tol=1e-07))
                vertices = dict()
                for vertex in self.stateGraph.vertices():
                    vertices[self.stateGraph.vertex_index[vertex]] = h[vertex]
                #print(vertices)
                res = list(sorted(vertices, key=vertices.__getitem__, reverse=True))
                #print (res)
                self.logger.debug(k)
                unimportant = res[k:]
                self.unimportant = set()
                for u in unimportant:
                    #Never delete grandmother and grandfather, even if they become insignificant
                    if u > 1:
                        self.unimportant.add(self.stateGraph.vertex(u))
                        #pass
                #print(self.unimportant)
                #self.stateGraph = resourceretriever_gt.removeUnimportantResources(unimportant, self.resources, self.stateGraph)            
                halt3 = time.perf_counter()
                self.logger.info ('rank reducing: %s' % str(halt3 - halt2))
                #self.logger.info('Updated resources amount: %s' % str(len(self.stateGraph.vertices())))
                #print('Updated resources amount: %s' % str(len(self.stateGraph.vertices())))
                #print(len(self.unimportant))
            except Exception:
                self.logger.error ('Pathfinding reduction error')
                self.logger.error (sys.exc_info())
        else:
            self.logger.info ('no rank reducing')
        
        self.logger.info ('total %s' % str(time.perf_counter() - start))
        self.logger.info ('=== === ===')
        self.iteration+=1
        return self.stateGraph
Example #19
def compute_and_save_pagerank(g, filename):
    pr = gt.pagerank(g, epsilon=1e-12)
    g.vertex_properties["pagerank"] = pr
    g.save(filename)
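Here epsilon is graph-tool's convergence tolerance; tightening it from the default 1e-6 to 1e-12 trades extra iterations for precision. A hypothetical call:

g = gt.load_graph("citations.gt")  # hypothetical input file
compute_and_save_pagerank(g, "citations_with_pagerank.gt")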
Example #20
from graph_tool.all import *
from graph_tool.draw import graphviz_draw as gd
import graph_tool.all as gt
import gc
import math

G = load_graph("graphDumpFor25BlocksTopPointFivePercent_1.2_16122016.dot")
pageRank = gt.pagerank(G)
# register the computed ranks as an internal property map; registering an empty
# map and then rebinding the name (as the original did) leaves the stored
# "pageRank" property all zeros
G.vertex_properties["pageRank"] = pageRank
valuation = G.ep.TransactionValues
widthTrans = valuation.copy()
#gt.remove_parallel_edges(G)
gt.remove_self_loops(G)
pos = gt.sfdp_layout(G)

for e in G.edges():
    if (widthTrans[e] != 0):
        widthTrans[e] = int(math.log10(float(widthTrans[e]))/7)
graph_draw(G, pos = pos,vertex_size=pageRank,edge_pen_width=widthTrans,output="graphDumpFor25BlocksTopPointFivePercent_1.2_16122016.pdf")
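PageRank values sum to one, so passing them straight to vertex_size yields nearly invisible vertices; a common rescaling sketch uses gt.prop_to_size (the output name is hypothetical):

size = gt.prop_to_size(pageRank, mi=2, ma=20)  # map ranks into point sizes
graph_draw(G, pos=pos, vertex_size=size, edge_pen_width=widthTrans,
           output="ranked_by_pagerank.pdf")  # hypothetical output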
Example #21
## out-degree
degrees = g.get_out_degrees(g.get_vertices())

plt.title("out-degree distribution")
plt.ylabel('#Nodes')
plt.xlabel('#Connections')
plt.plot(distribution(degrees))
plt.savefig(f"img/out_degree_dist.png", format='png')
plt.close()

del degrees

########
# Rank #
########
rank = GT.pagerank(g).get_array()

plt.title("Rank distribution")
plt.ylabel('#Nodes')
plt.xlabel('Rank')
plt.bar(*float_distribution(rank, 40), width=(max(rank)-min(rank))/50)
plt.savefig(f"img/rank_dist.png", format='png')
plt.close()

print(f"top {TOP} rank nodes: {get_top(rank , TOP)}")
del rank

###############
# Betweenness #
###############
betweenness = GT.betweenness(g)[0].get_array()
Example #22
def central():
    g = gt.load_graph(filename)
    print('Graph loaded, now calculating centrality')
    pr = gt.pagerank(g)
    g.vp['rank'] = pr
    g.save(filename)