def __init__(self, nodes=None, edges=None):
    """Create the underlying distance graph and seed it with nodes and edges.

    Args:
        nodes: optional iterable handed to ``DistanceGraph.add_nodes_from``.
            ``None`` (the default) means "no nodes".
        edges: optional iterable handed to ``DistanceGraph.add_edges_from``.
            ``None`` (the default) means "no edges".
    """
    # None sentinels replace the former ``[]`` defaults: a list literal in the
    # signature is a single object shared by every call, so anything the graph
    # did to it would leak into later instances.
    self.__graph = DistanceGraph()
    self.__graph.set_base(2.0)
    self.__graph.add_nodes_from([] if nodes is None else nodes)
    self.__graph.add_edges_from([] if edges is None else edges)
class CentralityClustering:
    """Cluster the nodes of a weighted tag graph by layers of equal centrality.

    The instance wraps a ``DistanceGraph``; ``get_graph()`` on that wrapper is
    used as a graph object offering ``edges``, ``nodes``, ``neighbors`` and
    ``node_boundary`` (presumably an old NetworkX graph -- TODO confirm).
    """

    def __init__(self, nodes=None, edges=None):
        """Create the underlying distance graph and seed it.

        Args:
            nodes: optional iterable passed to ``add_nodes_from``
                (``None`` means no nodes).
            edges: optional iterable passed to ``add_edges_from``
                (``None`` means no edges).
        """
        # None sentinels instead of ``[]`` defaults: a list literal in the
        # signature would be shared by every instance.
        self.__graph = DistanceGraph()
        self.__graph.set_base(2.0)
        self.__graph.add_nodes_from([] if nodes is None else nodes)
        self.__graph.add_edges_from([] if edges is None else edges)
        # Set by load(); while it is None there is no file next to which the
        # node strengths could be cached.
        self.__tag_graph_path = None

    def set_corpus_size(self, size):
        """Forward the corpus size to the graph as its total result count."""
        self.__graph.set_results_total(size)

    def save(self, filepath):
        """Save the underlying graph to ``filepath``."""
        self.__graph.save(filepath)

    def load(self, tag_graph_path):
        """Load the graph from ``tag_graph_path`` and remember that path.

        The remembered path is the base name of the ``.strength`` cache file
        used by the strength computation.
        """
        self.__graph.load(tag_graph_path)
        self.__tag_graph_path = tag_graph_path

    def build_tag_graph(self, tagged_graph_path, threshold=None):
        """Build the tag graph from a tagged-graph file and save it.

        Writes ``<path>.tags`` (tag frequencies) and ``<path>.tags.graph``
        (the resulting graph).

        Args:
            tagged_graph_path: path of the input file given to ``Tags``.
            threshold: second positional argument of ``Tags``; an earlier
                comment in this method named it ``bound_tag_dist``.
        """
        filename = tagged_graph_path
        # Per the earlier comment here, the Tags signature is
        # Tags(filename, bound_tag_dist=None, calc_tag_distance=False).
        tags = Tags(filename, threshold, True)
        tags.save_tag_freqs(filename + '.tags')
        dist_dict = tags.get_tag_dist()
        tag_freq = tags.get_tag_freq()
        tag_set = tags.get_tag_set()
        print('tag set with distance: %d' % len(tag_set))
        print('tag edges with distance: %d' % len(dist_dict))

        nodes = [(node, tag_freq[node]) for node in tag_set]

        # Use opposite distances as weights: the largest distance maps to 0
        # and the smallest to 1.
        edges = []
        if dist_dict:
            dists = dist_dict.values()
            max_dist, min_dist = max(dists), min(dists)
            # float() keeps Python 2 from truncating integer distances.
            span = float(max_dist - min_dist)
            for key, val in dist_dict.items():
                if span:
                    weight = (max_dist - val) / span
                else:
                    # All distances are equal, so the normalisation is
                    # undefined; treat every edge as equally (fully) close
                    # instead of dividing by zero.
                    weight = 1.0
                edges.append((key[0], key[1], weight))

        self.__graph.add_nodes_from(nodes)
        self.__graph.add_edges_from(edges)
        self.set_corpus_size(tags.get_corpus_size())
        self.save(filename + '.tags.graph')

    def __load_strength(self, cache_path):
        """Read a cached strength table; return None if it cannot be used."""
        try:
            f = open(cache_path, 'r')
            try:
                lines = f.readlines()
            finally:
                f.close()
            strength = {}
            for line in lines:
                if line.strip() != '':
                    fields = line.split(' ')
                    strength[fields[0]] = float(fields[1])
        except (IOError, ValueError, IndexError):
            # Missing or malformed cache: the caller recomputes from scratch.
            return None
        return strength

    def __compute_strength(self):
        """Sum, for every node, the weights of the edges touching it."""
        graph = self.__graph.get_graph()
        edges = graph.edges(data=True)
        total_nodes = len(graph.nodes())
        total_edges = len(graph.edges())
        strength = {}
        for i, (a, b, weight) in enumerate(edges):
            if a not in strength:
                strength[a] = 0.0
                if len(strength) % 1000 == 0:
                    print('nodes added to strength %d of %d'
                          % (len(strength), total_nodes))
            if b not in strength:
                strength[b] = 0.0
            strength[a] += weight
            strength[b] += weight
            if i % 10000 == 0:
                print('edges processed to strength %d of %d'
                      % (i, total_edges))
        return strength

    def __strength(self, inverse=False, try_old=False):
        """Return a dict mapping each node to its strength (sum of weights).

        Args:
            inverse: unused; kept for interface compatibility.
            try_old: when true, first try to read ``<graph path>.strength``.

        A freshly computed table is written to ``<graph path>.strength``
        whenever a graph path is known (i.e. after ``load``). Keys read back
        from the cache file are strings.
        """
        cache_path = None
        if self.__tag_graph_path is not None:
            cache_path = self.__tag_graph_path + '.strength'

        if try_old and cache_path is not None:
            cached = self.__load_strength(cache_path)
            if cached is not None:
                print('strength loaded from file')
                return cached

        # Always start from an empty table so a half-read cache can never be
        # added on top of the recomputed values.
        strength = self.__compute_strength()

        if cache_path is not None:
            print('save strength to file.')
            f = open(cache_path, 'w')
            try:
                for key, val in strength.items():
                    f.write('%s %f\n' % (key, val))
            finally:
                f.close()
        return strength

    def __split_by_float(self, strength_list):
        """Group consecutive ``(node, value)`` pairs that share a value.

        Args:
            strength_list: list of ``(node, value)`` pairs, expected to be
                sorted so that equal values are adjacent.

        Returns:
            ``(values, groups)``: the values in order of appearance and a dict
            from value to the list of its nodes. Empty input yields
            ``([], {})`` so callers can always unpack two items.
        """
        if not strength_list:
            return [], {}
        groups = {}
        values = []
        last_flt = strength_list[0][1]
        batch = []
        for node, flt in strength_list:
            if flt != last_flt:
                groups[last_flt] = batch
                values.append(last_flt)
                last_flt = flt
                batch = [node]
            else:
                batch.append(node)
        groups[last_flt] = batch
        values.append(last_flt)
        return values, groups

    def __node_strength(self, graph, node):
        """Return the summed weight of ``node``'s edges in ``graph`` (0 if none)."""
        return sum(w for _a, _b, w in graph.edges(node, data=True))

    def p_cores_centrality(self, p=None):
        """Compute a generalized-core value for every node, using strength.

        Follows "Generalized Cores" by Vladimir Batagelj and Matjaz Zaversnik.
        The node with the smallest current strength is removed repeatedly from
        a copy of the graph; its strength at removal time becomes its core
        value, and each neighbour's strength becomes the larger of that value
        and the neighbour's strength in the reduced graph.

        Args:
            p: unused. The original note says the p dictionary is typically
                the degree of a node in a subgraph.

        Returns:
            dict mapping node to core value (empty for an empty graph).
        """
        # NOTE(review): delete_node() is the pre-1.0 NetworkX spelling of
        # remove_node(); kept as found because the library version in use is
        # not visible from here.
        C_graph = copy.deepcopy(self.__graph.get_graph())

        print('computing node strength min_queue.')
        min_queue = Heap(snd_fst_cmp)
        strength = self.__strength()
        for node, stre in strength.items():
            min_queue.push((node, stre))
            if len(min_queue) % 1000 == 0:
                print('nodes added to min_queue %d' % len(min_queue))

        if len(min_queue) == 0:
            # Same type as the normal result, so callers can iterate it.
            return {}

        print('computing p-coreness per-se')
        core = {}
        while len(min_queue) > 0:
            if len(min_queue) % 1000 == 0:
                print('remaining %d nodes' % len(min_queue))
            top, stren = min_queue.pop()
            core[top] = stren
            # Snapshot the neighbours before the node disappears.
            neighs = list(C_graph.neighbors(top))
            C_graph.delete_node(top)
            del strength[top]
            for v in neighs:
                # Re-key v in the heap with its updated strength.
                min_queue.pop_item((v, strength[v]))
                strength[v] = max(stren, self.__node_strength(C_graph, v))
                min_queue.push((v, strength[v]))
        return core

    def __first_layer(self, cent_list, assigned=None, clust_max_size=50):
        """Split off the first remaining centrality layer.

        A layer is the leading run of not-yet-assigned nodes that share the
        same centrality value, capped at ``clust_max_size`` nodes.

        Args:
            cent_list: list of ``(node, centrality)`` pairs, best first.
                It is not modified.
            assigned: nodes that already belong to a cluster.
            clust_max_size: maximum number of nodes in the layer.

        Returns:
            ``(layer, rest)``: the layer's nodes and the pairs that follow it.
            ``([], [])`` when nothing unassigned is left.
        """
        if assigned is None:
            assigned = set()
        total = len(cent_list)

        # Skip leading nodes that are already assigned.
        start = 0
        while start < total and cent_list[start][0] in assigned:
            start += 1
        if start == total:
            return [], []

        the_val = cent_list[start][1]
        layer = []
        end = start
        while end < total:
            node, val = cent_list[end]
            if (val != the_val or node in assigned
                    or len(layer) >= clust_max_size):
                break
            layer.append(node)
            end += 1
        return layer, cent_list[end:]

    def __layer_components(self, layer):
        """Split ``layer`` into groups connected through layer nodes only.

        Args:
            layer: list of nodes; it is not modified.

        Returns:
            list of components, each a list of nodes in breadth-first order.
        """
        graph = self.__graph.get_graph()
        remaining = set(layer)
        comps = []
        # Seeds are taken from the end of the layer first.
        for seed in reversed(layer):
            if seed not in remaining:
                continue
            remaining.discard(seed)
            # comp doubles as the BFS queue: cursor marks the next node to expand.
            comp = [seed]
            cursor = 0
            while cursor < len(comp):
                node = comp[cursor]
                cursor += 1
                reachable = remaining.intersection(graph.neighbors(node))
                comp.extend(reachable)
                remaining.difference_update(reachable)
            comps.append(comp)
        return comps

    def __comp_dec_boundary(self, comp, centrality, type):
        """Return the boundary nodes of ``comp`` that it dominates.

        Args:
            comp: list of nodes forming the component.
            centrality: dict mapping node to centrality value.
            type: one of ``exists_geq``, ``all_geq``, ``exists_greater``,
                ``all_greater``. ``exists_*`` needs one adjacent component
                node whose centrality is >= (``geq``) or > (``greater``) the
                boundary node's; ``all_*`` needs it of every adjacent
                component node. Any other value yields an empty list.

        Returns:
            list of selected boundary nodes, in no particular order.
        """
        strict = type in ('exists_greater', 'all_greater')
        need_all = type in ('all_geq', 'all_greater')
        if not (strict or need_all or type == 'exists_geq'):
            return []

        graph = self.__graph.get_graph()
        comp_set = set(comp)
        dec_boundary = set()
        for boundary_node in graph.node_boundary(comp):
            bound_cent = centrality[boundary_node]
            touching = comp_set.intersection(graph.neighbors(boundary_node))
            if strict:
                checks = (centrality[n] > bound_cent for n in touching)
            else:
                checks = (centrality[n] >= bound_cent for n in touching)
            if need_all:
                selected = all(checks)
            else:
                selected = any(checks)
            if selected:
                dec_boundary.add(boundary_node)
        return list(dec_boundary)

    def __comps_dec_boundary(self, layer_comps, centrality, type):
        """Apply ``__comp_dec_boundary`` to every component.

        ``type`` is one of exists_geq, all_geq, exists_greater, all_greater.
        """
        return [self.__comp_dec_boundary(comp, centrality, type)
                for comp in layer_comps]

    def __comps_boundary(self, layer_comps):
        """Return the plain node boundary of every component."""
        graph = self.__graph.get_graph()
        return [graph.node_boundary(comp) for comp in layer_comps]

    def centrality_clustering(self, centrality, clust_size_limit=50,
                              dec_type='all_geq'):
        """Grow clusters from the most central nodes outwards.

        Nodes are visited from highest to lowest centrality, one unassigned
        node at a time. Each seed is grown by repeatedly absorbing its
        dominated boundary (see ``__comp_dec_boundary``) until that boundary
        is empty or has more than ``clust_size_limit`` nodes.

        Args:
            centrality: dict mapping node to centrality value.
            clust_size_limit: largest boundary still absorbed in one step.
            dec_type: boundary rule, see ``__comp_dec_boundary``.

        Returns:
            list of clusters, each a set of nodes.
        """
        cent_list = list(centrality.items())
        cent_list.sort(snd_fst_cmp)
        cent_list.reverse()

        clusters = []
        assigned = set()
        while len(cent_list) > 0:
            layer, cent_list = self.__first_layer(cent_list, assigned, 1)
            print('layer extracted size %d' % len(layer))
            print('nodes remaining: %d' % len(cent_list))
            print('-' * 30)
            assigned.update(layer)
            layer_comps = self.__layer_components(layer)
            comps_neighs = self.__comps_dec_boundary(layer_comps, centrality,
                                                     dec_type)

            # Keep expanding while any component still has a boundary to absorb.
            while any(comps_neighs):
                for i, comp_neighs in enumerate(comps_neighs):
                    if len(comp_neighs) > clust_size_limit:
                        # Boundary too large: stop growing this component.
                        comps_neighs[i] = []
                    elif comp_neighs:
                        layer_comps[i] = layer_comps[i] + comp_neighs
                        assigned.update(comp_neighs)
                        comps_neighs[i] = self.__comp_dec_boundary(
                            layer_comps[i], centrality, dec_type)

            for layer_comp in layer_comps:
                print('cluster added with size %d' % len(layer_comp))
                print(layer_comp)
                print('-' * 70)
            print('nodes remaining %d' % len(cent_list))
            clusters += [set(comp) for comp in layer_comps]
        return clusters

    # NOTE(review): centrality_clustering2 is reproduced exactly as found. It
    # looks unfinished (or continues past the reviewed excerpt): the tuple
    # returned by __first_layer is not unpacked, the result of
    # assigned.union() is discarded, cent_list never shrinks, and
    # __comps_neighbors / __assign_nodes are not defined in the visible part
    # of the class.
    def centrality_clustering2(self, centrality):
        cent_list = []
        assigned = set([])
        for key, val in centrality.iteritems():
            cent_list.append((key,val))
        cent_list.sort(snd_fst_cmp)
        cent_list.reverse()
        clusters = []
        while len(cent_list) > 0:
            layer = self.__first_layer(cent_list)
            assigned.union(set(layer))
            layer_comps = self.__layer_components(layer)
            comps_neighs = self.__comps_neighbors(layer_comps)
            # assigning nodes neighboring the clusters
            assignations = self.__assign_nodes(layer_comps, comps_neighs)