Example #1
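Collects the summary characteristics reported in the paper for a named graph (node and edge counts, edge density, clustering coefficient, pseudo-diameter, role taxonomy, and cluster count) and returns them with their table headers.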
def graph_characteristics(graphName):
    '''
    Return all the characteristics of a graph that we present in the paper.
    '''
    storedFolder = roles.graph_folder(graphName)

    inGraph = next(graph_analysis.IO.load_data(
        "../Data/Graphs/" + storedFolder + "/" + graphName + ".GT.graph"))
    groupTaxa, blackList = roles.graph_node_clusters(graphName,
                                                     inGraph,
                                                     metric="default")
    res = []

    headers = [
        "Graph Name", "\#Nodes", "\#Edges", "Edge density", "Clustering Coef.",
        "Diameter", "Role Taxonomy", "\#Clusters"
    ]
    res.append(paper_graph_name(graphName))
    res.append(inGraph.num_vertices())
    res.append(inGraph.num_edges())
    res.append(
        round(inGraph.num_edges() / (2 * float(inGraph.num_vertices())), 2))
    res.append(round(clustering.global_clustering(inGraph)[0], 3))
    res.append(topology.pseudo_diameter(inGraph)[0])
    res.append(graphRoles[graphName])
    res.append(len(set(groupTaxa)))
    return res, headers
Example #2
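Computes degree, clustering, pseudo-diameter, and power-law fit metrics for a graph file, caching the result as a JSON file next to the input.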
def metrics(file, use_cache=True):
    # use cache or recompute
    cache = os.path.splitext(file)[0] + ".json"
    if use_cache and os.path.isfile(cache):
        print('using cached metrics for', os.path.basename(file))
        with open(cache, "r") as fp:
            return json.load(fp)
    print('computing metrics for', os.path.basename(file))

    # read file
    g = load_graph(file)
    degrees = list(g.degree_property_map("out"))
    with open(file) as f:
        metalines = [next(f) for x in range(13)]

    # gather data
    metrics = {}
    metrics['file'] = os.path.basename(file)
    metrics['edges'] = int(metalines[5].split()[-1])
    metrics['rounds'] = int(metalines[1].split()[-1])
    metrics['max_degree'] = max(degrees)
    metrics['avg_degree'] = mean(degrees)
    metrics['min_degree'] = min(degrees)
    metrics['local_clustering'] = mean(local_clustering(g).get_array())
    metrics['global_clustering'] = global_clustering(g)[0]
    metrics['pseudo_diameter'] = int(pseudo_diameter(g)[0])
    fit = powerlaw.Fit(degrees, discrete=True, verbose=False)
    metrics['exponent'] = fit.alpha
    metrics['KS'] = fit.power_law.KS()
    metrics['x_min'] = fit.xmin

    with open(cache, "w") as fp:
        json.dump(metrics, fp)

    return metrics
Example #3
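Prints basic statistics for a graph and its largest connected component, including the pseudo-diameter, density, global clustering, and the out-degree distribution.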
def stats(g, name, rankfile=None):
	print('***** ' + name + ' *****')
	print('Directed: ', g.is_directed())
	print('v count entire graph: ', g.num_vertices())
	comp, hist = graph_tool.topology.label_components(g)
	print('Num  C.C.: ', len(hist))
	print('Largest C.C.: ', max(hist))
	g = get_largest_cc(g)
	print('>>>>> Only largest C.C.')

	print('v count: ', g.num_vertices())
	print('e count: ', g.num_edges())
	print('pseudo-diameter: ', gt.pseudo_diameter(g)[0])
	print('density: ', get_density(g))
	print('global clust: ', graph_tool.clustering.global_clustering(g)[0])
	deg = [x.out_degree() for x in g.vertices()]
	deg = sorted(deg)

	print('mean deg+: ', np.mean(deg))
	print('std deg+: ', np.std(deg))
	print('min deg+: ', deg[0])
	print('max deg+: ', deg[-1])
	plot_ccdf(deg)
	if rankfile is not None:
		rank(rankfile)
Example #4
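Stores the pseudo-diameter of a graph's largest component, falling back to the whole graph when that component is trivial, along with the names of the endpoint vertices.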
def f_pseudo_diameter(D, stats, options={'features': []}):
    """Store the pseudo-diameter of D's largest component in the stats dict."""

    LC = label_largest_component(D)
    LCD = GraphView( D, vfilt=LC )

    if 'diameter' in options['features']:
        if LCD.num_vertices() == 0 or LCD.num_vertices() == 1:
            # if the largest component is trivial (empty or a single vertex), use the whole graph
            dist, ends = pseudo_diameter(D)
        else:
            dist, ends = pseudo_diameter(LCD)

        stats['pseudo_diameter'] = dist
        # D can be indexed in both cases, since LCD is a view sharing D's vertices
        stats['pseudo_diameter_src_vertex'] = D.vertex_properties['name'][ends[0]]
        stats['pseudo_diameter_trg_vertex'] = D.vertex_properties['name'][ends[1]]
        log.debug('done pseudo_diameter')
Example #5
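Samples random graphs until a connected one appears (or the attempt budget runs out) and returns it together with its integer pseudo-diameter.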
def create_graph(N=100, nb_clusters=4):
    from graph_tool.topology import label_largest_component, pseudo_diameter
    is_connected = False
    nb_iter = 0
    while not is_connected and nb_iter < N:
        nb_iter += 1  # count attempts so the loop is bounded
        cexp.fast_random_graph(N, .05)
        g = cexp.to_graph_tool()
        is_connected = label_largest_component(g).a.sum() == N
    cexp.turn_into_signed_graph_by_propagation(nb_clusters, .8)
    return g, int(pseudo_diameter(g)[0])
Example #6
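Builds the Beta-CV and C-Index table data for a graph, sizing the spectral method's energy parameter by the graph's pseudo-diameter.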
def clustering_tables(graphName, strategy=None):
    '''
    Create the data for the two tables of Beta_CV and C-Index.
    '''
    inGraph, groupTaxa, blackList, xTickMarks = cmp.prepare_input_graph(
        graphName, metric="default")
    gDiameter = topology.pseudo_diameter(inGraph)[0]
    ranks = False

    if blackList is not None:
        print("BlackListed = " + str(blackList))
    allBetaCV = []
    allCIndex = []
    #     for methodName in ["roleSim", "simRank", "heatSim", "spectralSim"]:
    for methodName in ["spectralSim"]:
        print(methodName)
        if cmp.is_spectral(methodName):
            energies = int(gDiameter)
            methodParams = [energies, 10, strategy]
        else:  # non-spectral methods
            methodParams = "default"

        if graphName in ["E_Coli", "Carribean_FoodWeb", "Mapk"
                         ] and methodName in ["heatSim", "heatSim_PP"]:
            distMatrix = cmp.execute_method(methodName,
                                            inGraph,
                                            graphName,
                                            distances=True,
                                            ranks=ranks,
                                            distFunction="canberra",
                                            methodParams=methodParams)
        else:
            distMatrix = cmp.execute_method(methodName,
                                            inGraph,
                                            graphName,
                                            distances=True,
                                            ranks=ranks,
                                            distFunction="default",
                                            methodParams=methodParams)

        allBetaCV.append(
            myUtils.inner_intra_distances(distMatrix,
                                          groupTaxa,
                                          blackList,
                                          ranks=ranks))
        allCIndex.append(
            myUtils.clustering_c_index(distMatrix, groupTaxa, blackList))

    # transform values relative to the second-worst method
    secondWorst = sorted(allBetaCV)[-2]
    relativeBetas = [((secondWorst - i) / float(i)) * 100 for i in allBetaCV]

    secondWorst = sorted(allCIndex)[-2]
    relativeCs = [((secondWorst - i) / float(i)) * 100 for i in allCIndex]

    return allBetaCV, relativeBetas, allCIndex, relativeCs
Example #7
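Returns the integer pseudo-diameter of a graph's largest connected component.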
def graph_diameter(graph):
    largest_connected_component = graph_lcc(graph)
    return int(pseudo_diameter(largest_connected_component)[0])
Example #8
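NNGT's diameter wrapper: it resolves the directedness and weight options, returns infinity for disconnected graphs unless is_connected is set, and otherwise delegates to graph-tool's pseudo_diameter.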
def diameter(g,
             directed=None,
             weights=None,
             combine_weights="mean",
             is_connected=False):
    '''
    Returns the diameter of the graph.

    .. versionchanged:: 2.3
        Added `combine_weights` argument.

    .. versionchanged:: 2.0
        Added `directed` and `is_connected` arguments.

    It returns infinity if the graph is not connected (strongly connected for
    directed graphs) unless `is_connected` is True, in which case it returns
    the longest existing shortest distance.

    Parameters
    ----------
    g : :class:`~nngt.Graph`
        Graph to analyze.
    directed : bool, optional (default: ``g.is_directed()``)
        Whether to compute the directed diameter if the graph is directed.
        If False, then the graph is treated as undirected. The option switches
        to False automatically if `g` is undirected.
    weights : bool or str, optional (default: binary edges)
        Whether edge weights should be considered; if ``None`` or ``False``
        then use binary edges; if ``True``, uses the 'weight' edge attribute,
        otherwise uses any valid edge attribute required.
    combine_weights : str, optional (default: 'mean')
        How to combine the weights of reciprocal edges if the graph is directed
        but `directed` is set to False. It can be:

        * "sum": the sum of the edge attribute values will be used for the new
          edge.
        * "mean": the mean of the edge attribute values will be used for the
          new edge.
        * "min": the minimum of the edge attribute values will be used for the
          new edge.
        * "max": the maximum of the edge attribute values will be used for the
          new edge.
    is_connected : bool, optional (default: False)
        If False, check whether the graph is connected or not and return
        infinite diameter if graph is unconnected. If True, the graph is
        assumed to be connected.

    See also
    --------
    :func:`nngt.analysis.shortest_distance`

    References
    ----------
    .. [gt-diameter] :gtdoc:`topology.pseudo_diameter`
    '''
    g, graph, w = _get_gt_graph(g,
                                directed,
                                weights,
                                combine_weights,
                                return_all=True)

    w = _get_gt_weights(g, w)

    # first check whether the graph is fully connected
    ctype = "scc" if directed else "wcc"

    if not is_connected:
        cc, hist = connected_components(g, ctype)

        if len(hist) > 1:
            return np.inf

    return gtt.pseudo_diameter(graph, weights=w)[0]
Example #9
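Computes and pickles a dictionary of plot titles, filenames, and network descriptors, including degree statistics, edge weights, the pseudo-diameter with its endpoint path, clustering, shortest paths, and connected components.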
def get_descriptors(network, short_name, nx_network, already_calculated=False):
    def _prefixToTitle(prefix):
        if prefix == 'a':
            return "Artists"
        elif prefix == 't':
            return "Tags"
        elif prefix == 'u':
            return 'Users'

    filename = "cache/{}.pickle".format(short_name)
    if os.path.isfile(filename):
        with open(filename, 'rb') as fp:
            return pickle.load(fp)

    result = {}
    prefix1, prefix2 = short_name[0], short_name[1]
    t1 = _prefixToTitle(prefix1)
    t2 = _prefixToTitle(prefix2)
    result['name'] = short_name
    result['title_dd1'] = PLOT_TITLES[short_name].format(t1, "")
    result['title_dd2'] = PLOT_TITLES[short_name].format(t2, "")
    result['title_dd1_acum'] = PLOT_TITLES[short_name].format(
        t1, " Cumulative")
    result['title_dd2_acum'] = PLOT_TITLES[short_name].format(
        t2, " Cumulative")
    result['title_wd'] = PLOT_TITLES['wd'].format("", t1, t2)
    result['title_wd_acum'] = PLOT_TITLES['wd'].format("Cumulative ", t1, t2)
    result['title_cd'] = PLOT_TITLES['cd'].format(t1, t2)
    result['title_sp'] = PLOT_TITLES['sp'].format(t1, t2)
    result['filename_dd'] = '{}_dd'.format(short_name)  # degree input dist
    result['filename_ddl'] = '{}_dd_log'.format(
        short_name)  # degree dist (log)
    result['filename_dd1'] = '{}_{}_dd'.format(short_name[0],
                                               short_name)  # degree input dist
    result['filename_dd2'] = '{}_{}_dd'.format(short_name[1],
                                               short_name)  # degree input dist
    result['filename_dd1l'] = '{}_{}_dd_log'.format(
        short_name[0], short_name)  # degree input dist
    result['filename_dd2l'] = '{}_{}_dd_log'.format(
        short_name[1], short_name)  # degree input dist
    result['filename_dd1_acum'] = '{}_{}_dd_acum'.format(
        short_name[0], short_name)  # degree input dist
    result['filename_dd2_acum'] = '{}_{}_dd_acum'.format(
        short_name[1], short_name)  # degree input dist
    result['filename_wd'] = '{}_wd'.format(short_name)  # weight distribution
    result['filename_wdl'] = '{}_wd_log'.format(
        short_name)  # weight distribution
    result['filename_wd_acum'] = '{}_wd_acum'.format(
        short_name)  # weight distribution
    result['filename_sp'] = '{}_sp'.format(short_name)  # shortest path
    result['filename_cd'] = '{}_cd'.format(short_name)  # components
    result['filename_cdl'] = '{}_cd_log'.format(short_name)  # components (log)

    nodes = network.get_vertices()
    edges = network.get_edges()
    result['num_nodes'] = {}
    result['num_nodes']['total'] = nodes.shape[0]

    result['num_edges'] = edges.shape[0]

    result['degree'] = {"total": {}, "prefix1": {}, "prefix2": {}}
    result['degree']["total"]['max'] = network.get_out_degrees(nodes).max()
    result['degree']["total"]['min'] = network.get_out_degrees(nodes).min()
    result['degree']["total"]['avg'] = network.get_out_degrees(nodes).mean()
    result['degree']["total"]["counts"], result['degree']["total"][
        "bins"] = st.vertex_hist(network, "out")

    nodes1, nodes2 = [], []
    for node in nodes:
        if prefix1 in network.vp['id'][node]:
            nodes1.append(node)
        elif prefix2 in network.vp['id'][node]:
            nodes2.append(node)

    result['num_nodes']['prefix1'] = len(nodes1)
    result['degree']["prefix1"]['max'] = network.get_out_degrees(nodes1).max()
    result['degree']["prefix1"]['min'] = network.get_out_degrees(nodes1).min()
    result['degree']["prefix1"]['avg'] = network.get_out_degrees(nodes1).mean()
    result['degree']["prefix1"]["counts"], result['degree']["prefix1"][
        "bins"] = np.histogram(
            network.get_out_degrees(nodes1),
            bins=15)  # result['degree']["total"]["bins"].shape[0]
    result['degree']["prefix1"]["d"] = network.get_out_degrees(
        nodes1)  # result['degree']["total"]["bins"].shape[0]
    if prefix1 == prefix2:
        nodes2 = nodes1
    result['num_nodes']['prefix2'] = len(nodes2)
    result['degree']["prefix2"]['max'] = network.get_out_degrees(nodes2).max()
    result['degree']["prefix2"]['min'] = network.get_out_degrees(nodes2).min()
    result['degree']["prefix2"]['avg'] = network.get_out_degrees(nodes2).mean()
    result['degree']["prefix2"]["counts"], result['degree']["prefix2"][
        "bins"] = np.histogram(network.get_out_degrees(nodes2), bins=15)
    result['degree']["prefix2"]["d"] = network.get_out_degrees(
        nodes2)  # result['degree']["total"]["bins"].shape[0]

    result['weights'] = {}
    weights = []

    for v1, v2 in nx_network.edges():
        weight = nx_network.get_edge_data(v1, v2)['weight']
        weights.append(weight)

    # result['weights']['counts'], result['weights']['bins'] = np.histogram(weights, bins=8)
    result['weights']['d'] = weights

    # estimated diameter and the path between its endpoints
    d, (v1, v2) = top.pseudo_diameter(network)
    result['diameter'] = d
    d_path = "{}-{}".format(network.vp['id'][v1], network.vp['id'][v2])
    result['diameter_path'] = d_path

    result['clustering'] = clu.global_clustering(network)

    if not already_calculated:
        net2 = gt.Graph(network)  # undirected version
        net2.set_directed(False)
        result['sp'] = {}
        result['sp']['counts'], result['sp']['bins'] = shortest_paths(net2)
        # connected components

        _, c2 = top.label_components(net2)

        result['components'] = {}
        result['components']['num'] = len(c2)
        result['components']['bins'] = range(len(c2))
        result['components']['counts'] = c2

    with open(filename, "wb") as fp:
        pickle.dump(result, fp)
    return result
Example #10
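Driver loop that prepares each graph, takes its integer pseudo-diameter for use in the method parameters, and loads each similarity method's precomputed output.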
currentMethods = [
    "panos_sim", "sim_rank", "role_sim", "vertex_sim", "refex_sim",
    "commute_time_dist"
]

if __name__ == '__main__':

    for graphName in currentGraphs:
        print_fancy(graphName)
        v_measAllMethods = []
        for methodName in currentMethods:

            print "\n\n"
            inGraph, trueClusters, blackList, xTickMarks = prepare_input_graph(
                graphName, metric="default", verbose=False)
            graphDiameter = int(topology.pseudo_diameter(inGraph)[0])
            trueClustersSize = len(np.unique(trueClusters))

            if methodName.startswith("panos"):
                methodParams = ["all", "small", False, None]
#                 print methodParams
            else:
                methodParams = None

            try:
                if methodParams is None:
                    res = next(load_data("../Output/" + methodName + "_" +
                                         graphName))
                else:
                    inFile = "../Output/" + methodName + "_" + '_'.join(
                        str(e) for e in methodParams) + "_" + graphName
Example #11
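Evaluates how faithfully a sampled graph preserves the full graph, comparing vertex and edge counts, pseudo-diameters, largest components, expansion quality, clustering coefficients, and community structure.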
    def evaluate_sampling(self, full_graph: Graph, sampled_graph: Graph,
                          full_partition: BlockState,
                          sampled_graph_partition: BlockState,
                          block_mapping: Dict[int, int],
                          vertex_mapping: Dict[int, int],
                          assignment: np.ndarray):
        """Evaluates the goodness of the samples.

        Parameters
        ----------
        full_graph : Graph
            the full, unsampled Graph object
        sampled_graph : Graph
            the sampled graph
        full_partition : BlockState
            the partitioning results on the full graph
        sampled_graph_partition : BlockState
            the partitioning results on the sampled graph
        block_mapping : Dict[int, int]
            the mapping of blocks from the full graph to the sampled graph
        vertex_mapping : Dict[int, int]
            the mapping of vertices from the full graph to the sampled graph
        assignment : np.ndarray[int]
            the true vertex-to-community mapping
        """
        #####
        # General
        #####
        self.sampled_graph_num_vertices = sampled_graph.num_vertices()
        self.sampled_graph_num_edges = sampled_graph.num_edges()
        self.blocks_retained = (sampled_graph_partition.get_B() /
                                full_partition.get_B())
        # pseudo_diameter returns a tuple: (diameter, (start_vertex, end_vertex))
        self.sampled_graph_diameter = pseudo_diameter(sampled_graph)[0]
        self.full_graph_diameter = pseudo_diameter(full_graph)[0]
        for vertex in sampled_graph.vertices():
            if (vertex.in_degree() + vertex.out_degree()) == 0:
                self.sampled_graph_island_vertices += 1
        self.sampled_graph_largest_component = extract_largest_component(
            sampled_graph, directed=False).num_vertices()
        self.full_graph_largest_component = extract_largest_component(
            full_graph, directed=False).num_vertices()

        ######
        # Expansion quality (http://portal.acm.org/citation.cfm?doid=1772690.1772762)
        ######
        # Expansion factor = Neighbors of sample / size of sample
        # Maximum expansion factor = (size of graph - size of sample) / size of sample
        # Expansion quality = Neighbors of sample / (size of graph - size of sample)
        # Expansion quality = 1 means sample is at most 1 edge away from entire graph
        sampled_graph_vertices = set(vertex_mapping.keys())
        neighbors = set()
        for vertex in sampled_graph_vertices:
            for neighbor in full_graph.get_out_neighbors(vertex):
                neighbors.add(neighbor)
        neighbors = neighbors - sampled_graph_vertices
        self.expansion_quality = len(neighbors) / (
            full_graph.num_vertices() - sampled_graph.num_vertices())

        ######
        # Clustering coefficient
        ######
        self.sampled_graph_clustering_coefficient = global_clustering(
            sampled_graph)[0]
        self.full_graph_clustering_coefficient = global_clustering(
            full_graph)[0]

        ######
        # Info on communities
        ######
        self.get_community_details(
            assignment,
            full_partition.get_blocks().get_array(),
            sampled_graph_partition.get_blocks().get_array(), vertex_mapping)

        if np.unique(assignment).size == 1:
            # cannot compute the metrics below if no true partition is provided
            return

        #####
        # % difference in ratio of within-block to between-block edges
        #####
        sample_assignment = assignment[np.fromiter(vertex_mapping.keys(),
                                                   dtype=np.int32)]
        true_sampled_graph_partition = partition_from_truth(
            sampled_graph, sample_assignment)
        sampled_graph_blockmatrix = true_sampled_graph_partition.get_matrix()
        self.sampled_graph_edge_ratio = (sampled_graph_blockmatrix.diagonal().sum() /
                                         sampled_graph_blockmatrix.sum())
        true_full_partition = partition_from_truth(full_graph, assignment)
        full_blockmatrix = true_full_partition.get_matrix()
        self.graph_edge_ratio = (full_blockmatrix.diagonal().sum() /
                                 full_blockmatrix.sum())

        #####
        # Normalized difference from ideal-block membership
        #####
        membership_size = max(np.max(assignment),
                              np.max(sample_assignment)) + 1
        full_graph_membership_nums = np.zeros(membership_size)
        for block_membership in assignment:
            full_graph_membership_nums[block_membership] += 1
        sampled_graph_membership_nums = np.zeros(membership_size)
        for block_membership in sample_assignment:
            sampled_graph_membership_nums[block_membership] += 1
        ideal_block_membership_nums = full_graph_membership_nums * \
            (sampled_graph.num_vertices() / full_graph.num_vertices())
        difference_from_ideal_block_membership_nums = np.abs(
            ideal_block_membership_nums - sampled_graph_membership_nums)
        self.difference_from_ideal_sample = np.sum(
            difference_from_ideal_block_membership_nums /
            sampled_graph.num_vertices())
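
All of the snippets above reduce to the same graph-tool call: pseudo_diameter returns a (distance, (source, target)) tuple, and it is most often applied to the largest connected component. A minimal, self-contained sketch of that shared pattern (the toy edge list below is illustrative only, not taken from any example):

from graph_tool import Graph
from graph_tool.topology import extract_largest_component, pseudo_diameter

# Build a small undirected graph with two components.
g = Graph(directed=False)
g.add_edge_list([(0, 1), (1, 2), (2, 3), (4, 5)])

# Restrict to the largest component, as most of the examples above do.
lcc = extract_largest_component(g)       # GraphView of the largest component
dist, (src, trg) = pseudo_diameter(lcc)  # (distance, (source, target))
print(int(dist), int(src), int(trg))     # the path 0-1-2-3 gives distance 3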