예제 #1
0
    def cluster(self, verbose=False):
        """Cluster the graph stored in ``self.filename``.

        The file is read as a whitespace-separated edge list: the first two
        fields of every line are the edge's endpoints, any further fields are
        ignored, and blank lines are skipped.  Edges are treated as
        undirected.

        Clusters are grown greedily.  Nodes are tried as seeds in descending
        order of ``(weight, degree)``, where a node's weight is the sum, over
        its neighbours, of the number of neighbours the two have in common.
        Each cluster starts as a seed plus one arbitrary neighbour and is
        extended one node at a time for as long as a qualifying candidate
        exists (see the inline comments for the two acceptance tests).

        The result, a list of sets of node ids in creation order, is stored
        in ``self.clusters``; the method itself returns ``None``.

        Args:
            verbose: When true, print every finished cluster followed by a
                progress line (cluster count, cluster size, nodes not yet
                assigned to any cluster).

        Raises:
            ValueError: If a non-blank line holds fewer than two fields.
        """
        # node id => set of neighbouring node ids
        neighbors = {}
        with open(self.filename, 'r') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                a, b = fields[:2]
                neighbors.setdefault(a, set()).add(b)
                neighbors.setdefault(b, set()).add(a)

        # Every node gets an entry, including nodes whose only edge is a
        # self-loop (weight 0), so the sort key below can never hit a
        # missing key.  Iterating over edges instead of all node pairs gives
        # the same sums without the quadratic pair scan.
        weights = {
            node: sum(len(nbrs & neighbors[other])
                      for other in nbrs if other != node)
            for node, nbrs in neighbors.items()
        }

        unvisited = set(neighbors)
        clusters = []

        # Best seeds first; ties keep the order in which nodes were read.
        seed_nodes = sorted(neighbors,
                            key=lambda k: (weights[k], len(neighbors[k])),
                            reverse=True)

        for seed in seed_nodes:
            if seed not in unvisited:
                continue

            # Start from the seed and one arbitrary neighbour (whichever the
            # set yields first).
            members = {seed, next(iter(neighbors[seed]))}

            while True:
                # Nodes adjacent to the cluster but not part of it, ranked
                # by how many edges they have into the cluster.
                candidates = set().union(
                    *(neighbors[n] - members for n in members))
                frontier = sorted(
                    (len(neighbors[p] & members), p) for p in candidates)

                chosen = None
                while frontier:
                    m_vk, p = frontier.pop()  # best remaining candidate
                    # Too few edges into the cluster; every candidate still
                    # in the list has at most as many, so stop looking.
                    if m_vk < self.t_in * len(members):
                        break
                    # Cluster members reachable from p in at most two hops
                    # when the middle node must itself be a cluster member.
                    reach = neighbors[p] & members
                    reach.update(*[neighbors[c] & members for c in reach])
                    if reach == members:
                        chosen = p
                        break

                if chosen is None:
                    break  # nothing qualifies: the cluster is complete
                members.add(chosen)

            unvisited -= members
            clusters.append(members)

            if verbose:
                print(' '.join(members))
                print(len(clusters), len(members), len(unvisited))

            if not unvisited:
                break

        self.clusters = clusters


# if __name__ == '__main__':
#     filename = "../data/unweighted_example_network.txt"
#     c = IPCA(filename)
#     c.cluster()
예제 #2
0
def ipca(filename, t_in=None, verbose=True):
    """Cluster the graph stored in *filename* and return the clusters.

    The file is read as a whitespace-separated edge list: the first two
    fields of every line are the edge's endpoints, any further fields are
    ignored, and blank lines are skipped.  Edges are treated as undirected.

    Clusters are grown greedily.  Nodes are tried as seeds in descending
    order of ``(weight, degree)``, where a node's weight is the sum, over its
    neighbours, of the number of neighbours the two have in common.  Each
    cluster starts as a seed plus one arbitrary neighbour and is extended one
    node at a time for as long as a qualifying candidate exists (see the
    inline comments for the two acceptance tests).

    Args:
        filename: Path of the edge-list file.
        t_in: Threshold used in the edge-count test: a candidate needs at
            least ``t_in * len(cluster)`` edges into the cluster.  ``None``
            (the default) uses the module-level ``T_IN``.
        verbose: When true (the default), print each edge and its running
            index while reading, then each finished cluster followed by a
            progress line (cluster count, cluster size, nodes not yet
            assigned to any cluster).

    Returns:
        A list of sets of node ids, in the order the clusters were created.

    Raises:
        ValueError: If a non-blank line holds fewer than two fields.
    """
    if t_in is None:
        t_in = T_IN

    data = defaultdict(set)  # node id => neighboring node ids

    # read in graph
    with open(filename, 'r') as f:
        counter = 0
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            a, b = fields[:2]
            if verbose:
                print(a, b)
                print(counter)
            counter += 1
            data[a].add(b)
            data[b].add(a)

    # Iterating over edges instead of all node pairs gives the same sums
    # without the quadratic pair scan.  Every node gets an entry, including
    # nodes whose only edge is a self-loop (weight 0).
    weights = {
        node: sum(len(nbrs & data[other]) for other in nbrs if other != node)
        for node, nbrs in data.items()
    }

    unvisited = set(data)
    clusters = []

    # Best seeds first; ties keep the order in which nodes were read.
    seed_nodes = sorted(data,
                        key=lambda k: (weights[k], len(data[k])),
                        reverse=True)

    for seed in seed_nodes:
        if seed not in unvisited:
            continue

        # Start from the seed and one arbitrary neighbour (whichever the set
        # yields first).
        members = {seed, next(iter(data[seed]))}

        while True:
            # Nodes adjacent to the cluster but not part of it, ranked by
            # how many edges they have into the cluster.
            candidates = set().union(*(data[n] - members for n in members))
            frontier = sorted((len(data[p] & members), p) for p in candidates)

            chosen = None
            while frontier:
                m_vk, p = frontier.pop()  # best remaining candidate
                # Too few edges into the cluster; every candidate still in
                # the list has at most as many, so stop looking.
                if m_vk < t_in * len(members):
                    break
                # Cluster members reachable from p in at most two hops when
                # the middle node must itself be a cluster member.
                reach = data[p] & members
                reach.update(*[data[c] & members for c in reach])
                if reach == members:
                    chosen = p
                    break

            if chosen is None:
                break  # nothing qualifies: the cluster is complete
            members.add(chosen)

        unvisited -= members
        clusters.append(members)

        if verbose:
            print(' '.join(members))
            print(len(clusters), len(members), len(unvisited))

        if not unvisited:
            break

    return clusters