def get_graph(filename='sample') -> LightMultiGraph:
    """
    Build or read a graph and return its edges as a LightMultiGraph.

    :param filename: 'sample' builds a small hard-coded graph, 'BA' builds a seeded
        Barabasi-Albert graph (10 nodes, 2 edges per new node); any other value is read as an
        integer-labelled edge list from ./src/tmp/<filename>.g
    :return: a LightMultiGraph containing the edges of the selected graph. For graphs read from
        disk, only the largest connected component is kept and its nodes are relabelled to
        consecutive integers.
    """
    start_time = time()

    if filename == 'sample':
        g = nx.Graph()
        g.add_edges_from([(1, 2), (1, 3), (1, 5), (2, 4), (2, 5), (2, 7), (3, 4), (3, 5), (4, 5), (4, 9), (6, 7), (6, 8), (6, 9), (7, 8), (7, 9), (8, 9)])
    elif filename == 'BA':
        # barabasi_albert_graph already returns a simple nx.Graph. The previous code rebound
        # g to a fresh, empty nx.Graph() right after generating it, discarding the graph.
        g = nx.barabasi_albert_graph(10, 2, seed=42)
    else:
        g = nx.read_edgelist(f'./src/tmp/{filename}.g', nodetype=int, create_using=nx.Graph())
        if not nx.is_connected(g):
            # nx.connected_component_subgraphs was removed in NetworkX 2.4; taking the induced
            # subgraph of the largest node set and copying it is the documented replacement
            largest_component = max(nx.connected_components(g), key=len)
            g = g.subgraph(largest_component).copy()
        # relabelling returns a new graph object, so carry the name over explicitly
        name = g.name
        g = nx.convert_node_labels_to_integers(g)
        g.name = name

    g_new = LightMultiGraph()
    g_new.add_edges_from(g.edges())

    end_time = time() - start_time
    print(
        f'Graph: {filename}, n = {g.order():_d}, m = {g.size():_d} read in {round(end_time, 3):_g}s.'
    )
    return g_new
def create_rule(subtree: Set[int], g: LightMultiGraph, mode: str) -> Tuple[PartRule, List[Tuple[int, int]]]:
    """
    Build a grammar rule from the subgraph of g induced by subtree.

    :param subtree: nodes of g that make up the right-hand side of the rule
    :param g: graph the rule is extracted from
    :param mode: 'full' adds the boundary edges to the rule graph and contracts it, 'part' stores
        boundary degrees on the rule graph instead; any other value produces a NoRule
    :return: the rule together with the boundary edges returned by find_boundary_edges
    """
    rhs_graph = g.subgraph(subtree).copy()
    assert isinstance(rhs_graph, LightMultiGraph)
    boundary_edges = find_boundary_edges(g, subtree)

    if mode == 'part':
        # partial boundary information: only the boundary degrees are recorded
        rule = PartRule(lhs=len(boundary_edges), graph=rhs_graph)
        set_boundary_degrees(g, rule.graph)
        rule.generalize_rhs()
        return rule, boundary_edges

    if mode != 'full':
        rule = NoRule(lhs=len(boundary_edges), graph=rhs_graph)
        rule.generalize_rhs()
        return rule, boundary_edges

    # full information: boundary edges become part of the RHS, flagged with b=True
    rule = FullRule(lhs=len(boundary_edges), internal_nodes=subtree, graph=rhs_graph)
    for edge in boundary_edges:
        n_fields = len(edge)
        if n_fields == 2:
            src, dst = edge
            rule.graph.add_edge(src, dst, b=True)
        elif n_fields == 3:
            src, dst, attrs = edge
            rule.graph.add_edge(src, dst, attr_dict=attrs, b=True)
    rule.contract_rhs()  # contract and generalize
    return rule, boundary_edges
def leiden(g: LightMultiGraph):
    """
    Recursively cluster g with leiden_one_level and return the result as nested lists.

    :param g: graph to partition
    :return: a list of singleton lists when g has fewer than two nodes or when one level of
        clustering yields a single cluster; otherwise a list with one recursively built
        sub-tree per cluster
    """
    if g.order() < 2:
        return [[node] for node in g.nodes()]

    clusters = leiden_one_level(g)
    if len(clusters) == 1:
        # the graph could not be split any further: every node becomes its own leaf
        only_cluster = list(clusters)[0]
        return [[node] for node in only_cluster]

    # recurse on an independent copy of each induced subgraph
    return [leiden(g.subgraph(cluster).copy()) for cluster in clusters]
def find_lu(g: LightMultiGraph) -> int:
    """
    Count the distinct symbol kinds present in g.

    :param g: graph whose node attribute dicts are inspected
    :return: 1 (for edges) plus the number of node kinds found, where a node carrying a 'label'
        attribute counts as kind 'nt' and any other node as kind 't'
    """
    kinds_seen = {
        'nt' if 'label' in attrs else 't'
        for _, attrs in g.nodes(data=True)
    }
    return 1 + len(kinds_seen)  # the leading 1 accounts for edges
def spectral_kmeans(g: LightMultiGraph, K):
    """
    k-way ncut spectral clustering Ng et al. 2002 KNSC1, applied recursively.

    :param g: graph g
    :param K: number of clusters
    :return: nested lists of nodes; a graph with at most K nodes becomes a list of singleton
        lists, K == 2 is delegated to approx_min_conductance_partitioning
    """
    if g.order() <= K:  # not more than k nodes, return the list of nodes
        return [[n] for n in g.nodes()]

    if K == 2:  # if K is two, use approx min partitioning
        return approx_min_conductance_partitioning(g)

    tree = []

    if not nx.is_connected(g):
        # nx.connected_component_subgraphs was removed in NetworkX 2.4; copying the induced
        # subgraph of each component is its documented replacement
        for component in nx.connected_components(g):
            p = g.subgraph(component).copy()
            if p.order() > K + 1:  # if p has more than K + 1 nodes, use spectral K-means
                tree.append(spectral_kmeans(p, K))
            else:  # try spectral K-means with a lesser K
                tree.append(spectral_kmeans(p, K - 1))
        assert len(tree) > 0
        return tree

    if K >= g.order() - 2:
        # eigsh below is asked for K + 1 eigenvectors, so retry with a smaller K
        return spectral_kmeans(g, K - 1)

    assert nx.is_connected(g), "g is not connected in spectral kmeans"

    L = nx.laplacian_matrix(g)

    assert K < g.order() - 2, "k is too high"

    # astype(float) replaces the sparse-matrix asfptype() method, which recent SciPy
    # releases no longer provide
    _, eigenvecs = scipy.sparse.linalg.eigsh(
        L.astype(float), k=K + 1, which='SM')  # compute the first K+1 eigenvectors
    eigenvecs = eigenvecs[:, 1:]  # discard the first trivial eigenvector

    U = sklearn.preprocessing.normalize(
        eigenvecs)  # normalize the eigenvecs by its L2 norm

    kmeans = KMeans(n_clusters=K).fit(U)
    cluster_labels = kmeans.labels_

    # labels are paired with nodes in g.nodes() order
    clusters = [[] for _ in range(max(cluster_labels) + 1)]
    for u, clu_u in zip(g.nodes(), cluster_labels):
        clusters[clu_u].append(u)

    for cluster in clusters:
        sg = g.subgraph(cluster)
        if len(cluster) > K + 1:
            tree.append(spectral_kmeans(sg, K))
        else:
            tree.append(spectral_kmeans(sg, K - 1))

    return tree
def get_random_partition(g: LightMultiGraph, seed=None):
    """
    Shuffle the nodes of g and hand them to random_partition.

    :param g: graph whose nodes are partitioned
    :param seed: when not None, seeds the global `random` generator before shuffling
    :return: whatever random_partition returns for the shuffled node list
    """
    if seed is not None:
        random.seed(seed)
    shuffled_nodes = list(g.nodes())
    random.shuffle(shuffled_nodes)
    return random_partition(shuffled_nodes)
def approx_min_conductance_partitioning(g: LightMultiGraph, max_k=1):
    """
    Approximate minimum conductance partitioning.
    I'm using the median method as referenced here:
    http://www.ieor.berkeley.edu/~goldberg/pubs/krishnan-recsys-final2.pdf

    :param g: graph to recursively partition
    :param max_k: upper bound of number of nodes allowed in the leaves
    :return: a dendrogram as nested lists; a leaf is the list of nodes of a graph with at most
        max_k nodes
    """
    lvl = []
    node_list = list(g.nodes())
    if len(node_list) <= max_k:
        assert len(node_list) > 0
        return node_list

    if not nx.is_connected(g):
        # nx.connected_component_subgraphs was removed in NetworkX 2.4; copying the induced
        # subgraph of each component is its documented replacement
        for component in nx.connected_components(g):
            lvl.append(approx_min_conductance_partitioning(g.subgraph(component).copy(), max_k))
        assert len(lvl) > 0
        return lvl

    # g is connected from here on, which nx.fiedler_vector requires
    fiedler_vector = nx.fiedler_vector(g, method='lanczos')

    # positions in node_list ordered by decreasing Fiedler value (ties keep their original
    # order); the first half goes to p1, the remainder to p2
    ranking = sorted(range(len(fiedler_vector)), key=lambda i: fiedler_vector[i], reverse=True)
    half_idx = len(ranking) // 2  # floor division
    p1 = {node_list[i] for i in ranking[:half_idx]}
    p2 = {node_list[i] for i in ranking[half_idx:]}

    sg1 = g.subgraph(p1)
    sg2 = g.subgraph(p2)

    iter_count = 0
    while not (nx.is_connected(sg1) and nx.is_connected(sg2)):
        sg1 = g.subgraph(p1)
        sg2 = g.subgraph(p2)

        # Hack to check and fix non connected subgraphs: every component except the largest
        # is moved over to the other side
        if not nx.is_connected(sg1):
            for component in sorted(nx.connected_components(sg1), key=len, reverse=True)[1:]:
                p2.update(component)
                p1.difference_update(component)
            sg2 = g.subgraph(p2)  # updating sg2 since p2 has changed

        if not nx.is_connected(sg2):
            for component in sorted(nx.connected_components(sg2), key=len, reverse=True)[1:]:
                p1.update(component)
                p2.difference_update(component)

        iter_count += 1

    if iter_count > 2:
        print('it took {} iterations to stabilize'.format(iter_count))

    assert nx.is_connected(sg1) and nx.is_connected(
        sg2), "subgraphs are not connected in cond"

    lvl.append(approx_min_conductance_partitioning(sg1, max_k))
    lvl.append(approx_min_conductance_partitioning(sg2, max_k))

    assert (len(lvl) > 0)
    return lvl
def _generate_graph(rule_dict: Dict[int, List[PartRule]], upper_bound: int) -> Any:
    """
    Create a new graph from the VRG at random.

    :param rule_dict: maps a non-terminal label (its number of boundary edges) to the rules
        that can replace a node carrying that label
    :param upper_bound: generation is abandoned once the graph has more nodes than this
    :return: (newly generated graph, list of rule ids in firing order), or (None, None) if the
        number of nodes in the generated graph exceeds upper_bound
    """
    node_counter = 1

    new_g = LightMultiGraph()
    new_g.add_node(0, label=0)  # start symbol

    non_terminals = {0}
    rule_ordering = []  # list of rule ids in the order they were fired

    while len(non_terminals) > 0:  # continue until no more non-terminal nodes
        if new_g.order() > upper_bound:  # early stopping
            return None, None

        # choose a non terminal node at random; random.sample rejects sets since
        # Python 3.11, so the population is materialised as a tuple first
        node_sample = random.sample(tuple(non_terminals), 1)[0]
        lhs = new_g.nodes[node_sample]['label']

        rhs_candidates = rule_dict[lhs]
        if len(rhs_candidates) == 1:
            rhs = rhs_candidates[0]
        else:
            weights = np.array([rule.frequency for rule in rhs_candidates])
            weights = weights / np.sum(weights)  # normalize into probabilities
            # without size= the draw is a scalar, avoiding int() on a one-element array
            # (deprecated by NumPy)
            idx = int(np.random.choice(len(rhs_candidates), p=weights))  # pick based on probability
            rhs = rhs_candidates[idx]

        logging.debug(
            f'firing rule {rhs.id}, selecting node {node_sample} with label: {lhs}'
        )
        rule_ordering.append(rhs.id)

        broken_edges = find_boundary_edges(new_g, {node_sample})
        assert len(broken_edges) == lhs

        new_g.remove_node(node_sample)
        non_terminals.remove(node_sample)

        nodes = {}  # RHS node -> freshly numbered node in new_g

        for n, d in rhs.graph.nodes(data=True):  # all the nodes are internal
            new_node = node_counter
            nodes[n] = new_node

            label = None
            if 'label' in d:  # if it's a new non-terminal add it to the set of non-terminals
                non_terminals.add(new_node)
                label = d['label']

            if label is None:
                new_g.add_node(new_node, b_deg=d['b_deg'])
            else:
                new_g.add_node(new_node, b_deg=d['b_deg'], label=label)
            node_counter += 1

        # randomly assign broken edges to boundary edges
        random.shuffle(broken_edges)

        # randomly joining the new boundary edges from the RHS to the rest of the graph
        # - uniformly at random
        for n, d in rhs.graph.nodes(data=True):
            num_boundary_edges = d['b_deg']
            if num_boundary_edges == 0:  # there are no boundary edges incident to that node
                continue

            assert len(broken_edges) >= num_boundary_edges

            edge_candidates = broken_edges[:num_boundary_edges]  # picking the first num_broken edges
            broken_edges = broken_edges[num_boundary_edges:]  # removing them from future consideration

            for u, v in edge_candidates:  # each edge is either (node_sample, v) or (u, node_sample)
                if u == node_sample:
                    u = nodes[n]
                else:
                    v = nodes[n]
                logging.debug(f'adding broken edge ({u}, {v})')
                new_g.add_edge(u, v)

        # adding the rhs to the new graph
        for u, v in rhs.graph.edges():
            edge_multiplicity = rhs.graph[u][v]['weight']
            new_g.add_edge(nodes[u], nodes[v], weight=edge_multiplicity)
            logging.debug(
                f'adding RHS internal edge ({nodes[u]}, {nodes[v]}) wt: {edge_multiplicity}'
            )

    return new_g, rule_ordering
def generate_graph(rule_dict, rule_list):
    """
    Create a new graph from the VRG at random, firing only rules marked as active.

    :param rule_dict: maps a non-terminal label to the list of VRG rules for that label
    :param rule_list: list of unique VRG rules; not used by the current implementation
    :return: (newly generated graph, rule_ordering); rule_ordering is always empty because
        recording the fired rules is disabled
    """
    node_counter = 1
    non_terminals = set()
    new_g = LightMultiGraph()

    new_g.add_node(0, label=0)  # start symbol
    non_terminals.add(0)

    rule_ordering = []  # list of rule ids in the order they were fired

    while len(non_terminals) > 0:  # continue until no more non-terminal nodes
        # choose a non terminal node at random; random.sample rejects sets since
        # Python 3.11, so the population is materialised as a tuple first
        node_sample = random.sample(tuple(non_terminals), 1)[0]
        lhs = new_g.nodes[node_sample]['label']

        rhs_candidates = [rule for rule in rule_dict[lhs] if rule.is_active]  # consider only active rules

        if len(rhs_candidates) == 1:
            rhs = rhs_candidates[0]
        else:
            weights = np.array([rule.frequency for rule in rhs_candidates])
            weights = weights / np.sum(weights)  # normalize into probabilities
            # without size= the draw is a scalar, avoiding int() on a one-element array
            # (deprecated by NumPy)
            idx = int(np.random.choice(len(rhs_candidates), p=weights))  # pick based on probability
            rhs = rhs_candidates[idx]

        broken_edges = find_boundary_edges(new_g, [node_sample])
        assert len(broken_edges) == lhs

        new_g.remove_node(node_sample)
        non_terminals.remove(node_sample)

        nodes = {}  # RHS node -> freshly numbered node in new_g

        for n, d in rhs.graph.nodes(data=True):  # all the nodes are internal
            new_node = node_counter
            nodes[n] = new_node
            # Node attributes are passed as keywords: NetworkX 2.x has no attr_dict
            # parameter on add_node, so attr_dict=d would store one attribute literally
            # named 'attr_dict' and the 'label' lookup above would fail.
            # NOTE(review): assumes LightMultiGraph.add_node follows the NetworkX
            # signature - confirm.
            new_g.add_node(new_node, **d)
            if 'label' in d:  # if it's a new non-terminal add it to the set of non-terminals
                non_terminals.add(new_node)
            node_counter += 1

        # randomly assign broken edges to boundary edges
        random.shuffle(broken_edges)

        # randomly joining the new boundary edges from the RHS to the rest of the graph
        # - uniformly at random
        for n, d in rhs.graph.nodes(data=True):
            num_boundary_edges = d['b_deg']
            if num_boundary_edges == 0:  # there are no boundary edges incident to that node
                continue

            assert len(broken_edges) >= num_boundary_edges

            edge_candidates = broken_edges[:num_boundary_edges]  # picking the first num_broken edges
            broken_edges = broken_edges[num_boundary_edges:]  # removing them from future consideration

            for u, v in edge_candidates:  # each edge is either (node_sample, v) or (u, node_sample)
                if u == node_sample:
                    u = nodes[n]
                else:
                    v = nodes[n]
                new_g.add_edge(u, v)

        # adding the rhs to the new graph: one add_edge call per unit of multiplicity
        for u, v in rhs.graph.edges():
            edge_multiplicity = rhs.graph[u][v]['weight']
            for _ in range(edge_multiplicity):
                new_g.add_edge(nodes[u], nodes[v])

    return new_g, rule_ordering
edge_candidates = broken_edges[: num_boundary_edges] # picking the first num_broken edges broken_edges = broken_edges[ num_boundary_edges:] # removing them from future consideration for u, v in edge_candidates: # each edge is either (node_sample, v) or (u, node_sample) if u == node_sample: u = nodes[n] else: v = nodes[n] # print('adding broken edge ({}, {})'.format(u, v)) new_g.add_edge(u, v) # adding the rhs to the new graph for u, v in rhs.graph.edges(): # print('adding RHS internal edge ({}, {})'.format(nodes[u], nodes[v])) edge_multiplicity = rhs.graph[u][v]['weight'] # for _ in range(edge_multiplicity): new_g.add_edge(nodes[u], nodes[v]) return new_g, rule_ordering if __name__ == '__main__': g = LightMultiGraph() g.add_edges_from([(1, 2), (1, 2), (1, 3), (2, 3), (3, 4)]) sg = g.subgraph([2, 3]).copy() print(g.edges(data=True)) set_boundary_degrees(g, sg) print(sg.nodes(data=True))
def compress_graph(g: LightMultiGraph, subtree: Set[int], boundary_edges: Any, permanent: bool) -> Union[None, float]:
    """
    Replace the nodes in subtree by a single non-terminal node, in place.

    :param g: the graph
    :param subtree: the set of nodes that's compressed
    :param boundary_edges: boundary edges of subtree as 2- or 3-tuples; computed with
        find_boundary_edges when None
    :param permanent: when True the compression is kept and None is returned; when False the
        description length of the compressed graph is computed, the compression is undone,
        and that value is returned
    :return: None for a permanent compression, otherwise graph_dl of the compressed graph
    """
    assert len(subtree) > 0, f'Empty subtree g:{g.order(), g.size()}, bound: {boundary_edges}'

    size_before = (g.order(), g.size())

    if not isinstance(subtree, set):
        subtree = set(subtree)

    if boundary_edges is None:  # compute the boundary edges
        boundary_edges = find_boundary_edges(g, subtree)

    # step 1: remember what is about to be deleted (only needed for the undo), then delete it
    saved_edges = set()
    saved_nodes = set()
    if not permanent:
        induced = g.subgraph(subtree)
        saved_edges = list(induced.edges(data=True))
        saved_nodes = list(induced.nodes(data=True))
    g.remove_nodes_from(subtree)

    # step 2: the smallest node id of the subtree is reused for the replacement node,
    # labelled with the number of boundary edges
    super_node = min(subtree)
    g.add_node(super_node, label=len(boundary_edges))

    # step 3: re-attach every boundary edge, redirecting subtree endpoints to super_node
    for edge in boundary_edges:
        n_fields = len(edge)
        if n_fields == 2:
            a, b = edge
            extra = ()
        elif n_fields == 3:
            a, b, attrs = edge
            extra = (attrs,)
        else:
            continue
        if a in subtree:
            a = super_node
        if b in subtree:
            b = super_node
        g.add_edge(a, b, *extra)

    if permanent:
        return None

    # temporary compression: measure the compressed graph, then restore the original
    compressed_graph_dl = graph_dl(g)

    g.remove_node(super_node)  # and the boundary edges
    g.add_nodes_from(saved_nodes)  # add the subtree

    for edge in itertools.chain(saved_edges, boundary_edges):
        if len(edge) == 3:
            a, b, attrs = edge
        else:
            a, b = edge
            attrs = {'weight': 1}
        edge_kwargs = {'weight': attrs['weight']}
        if 'edge_colors' in attrs:
            edge_kwargs['edge_colors'] = attrs['edge_colors']
        g.add_edge(a, b, **edge_kwargs)

    size_after = (g.order(), g.size())
    assert size_before == size_after, 'Decompression did not work'
    return compressed_graph_dl
if __name__ == '__main__':
    # Demo: build the sample graph, load a pickled clustering tree and set up a
    # GlobalExtractor over it.
    name = 'lesmis'
    outdir = 'output'
    # clustering = 'leiden'
    clustering = 'cond'
    type = 'mu_level'  # NOTE: shadows the builtin `type` for the rest of this script
    mu = 3

    g_ = nx.Graph()
    g_.add_edges_from([(1, 2), (1, 3), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5), (4, 5), (2, 7), (4, 9), (6, 7), (6, 8), (6, 9), (7, 8), (7, 9), (8, 9)])
    g = LightMultiGraph()
    g.add_edges_from(g_.edges())

    # Close the file deterministically instead of leaving the handle to the garbage
    # collector. NOTE: pickle.load executes arbitrary code from the file - only load
    # trees produced by this project.
    with open('../output/trees/sample/cond_tree.pkl', 'rb') as tree_file:
        root = pickle.load(tree_file)
    print(root)

    grammar = VRG(clustering=clustering, type=type, name=name, mu=mu)
    # extractor = MuExtractor(g=g, type=type, mu=mu, grammar=grammar, root=root)
    # extractor = LocalExtractor(g=g, type=type, mu=mu, grammar=grammar, root=root)
    extractor = GlobalExtractor(g=g, type=type, mu=mu, grammar=grammar, root=root)

    # index tree nodes by key, starting from the extractor's root
    # NOTE(review): nothing visible here pushes child nodes back onto the stack, so only the
    # root ends up in key2node - confirm whether the loop body continues beyond this view
    key2node = {}
    s = [extractor.root]
    while len(s) != 0:
        tnode = s.pop()
        key2node[tnode.key] = tnode
name, clustering, mode, mu, type, outdir = args.graph, args.clustering, args.boundary, args.mu, \ args.type, args.outdir grammar, orig_n = get_grammar_s(original_graph=g, name=name, grammar_type=type, clustering=clustering, mu=mu) g = generate_graph(rule_dict=grammar.rule_dict, target_n=orig_n) ng = g[0] return list(ng.edges()) class dotdict(dict): """dot.notation access to dictionary attributes""" __getattr__ = dict.get __setattr__ = dict.__setitem__ __delattr__ = dict.__delitem__ args_d= { "graph": "chem", 'clustering': 'leiden', 'boundary':'part', 'mu': 4, 'type': 'mu_level_dl', 'outdir': 'output', 'n': 5} # type(args_dict) args_dict = dotdict(args_d) print(args_dict.graph) g_new= LightMultiGraph() g = nx.karate_club_graph() g_new.add_edges_from(g.edges()) new_grs = cnrg_learn_grammars_probabilistic_graph_generation(g_new,args_dict) print(new_grs)
def get_graph(name='sample', path_input='', path_node_attrs='', path_edge_attrs='', path_timestamps='') -> LightMultiGraph:
    """
    Build or read a graph, optionally decorate it with attributes, and return a LightMultiGraph.

    :param name: used when path_input is empty: 'sample' builds a small hard-coded graph, 'BA'
        a seeded Barabasi-Albert graph, anything else is read from ./src/tmp/<name>.g
    :param path_input: path of an integer-labelled edge list; takes precedence over name
    :param path_node_attrs: optional file with one "<node> <color>" record per line, stored as a
        one-element list under the 'node_colors' node attribute
    :param path_edge_attrs: optional file with one "<u> <v> <color>" record per line, stored as a
        one-element list under the 'edge_colors' edge attribute
    :param path_timestamps: optional file with one "<u> <v> <timestamp>" record per line, stored
        as a float under the 'timestamp' edge attribute
    :return: a LightMultiGraph with the edges of the (largest connected component of the) graph
    :raises ValueError: if an attribute file contains a line with the wrong number of fields
    """
    def _largest_component(graph):
        # nx.connected_component_subgraphs was removed in NetworkX 2.4; copying the induced
        # subgraph of the largest node set is the documented replacement
        return graph.subgraph(max(nx.connected_components(graph), key=len)).copy()

    def _records(path, n_fields):
        # Fields may be separated by spaces, tabs or commas. Runs of separators and blank
        # lines are tolerated, which the previous split(' ') could not handle.
        with open(path) as infile:
            for line in infile:
                fields = line.replace(',', ' ').split()
                if not fields:
                    continue
                if len(fields) != n_fields:
                    raise ValueError(
                        f'expected {n_fields} fields in {path}, got {len(fields)}: {line!r}')
                yield fields

    start_time = time()

    if path_input == '':
        if name == 'sample':
            g = nx.Graph()
            g.add_edges_from([(1, 2), (1, 3), (1, 5), (2, 4), (2, 5), (2, 7), (3, 4), (3, 5), (4, 5), (4, 9), (6, 7), (6, 8), (6, 9), (7, 8), (7, 9), (8, 9)])
        elif name == 'BA':
            # barabasi_albert_graph already returns a simple nx.Graph. The previous code rebound
            # g to a fresh, empty nx.Graph() right after generating it, discarding the graph.
            g = nx.barabasi_albert_graph(10, 2, seed=42)
        else:
            g = nx.read_edgelist(f'./src/tmp/{name}.g', nodetype=int, create_using=nx.Graph())
            g.name = name
            if not nx.is_connected(g):
                g = _largest_component(g)
            # relabelling returns a new graph object, so carry the name over explicitly
            name = g.name
            g = nx.convert_node_labels_to_integers(g)
            g.name = name
    else:
        g = nx.read_edgelist(path_input, nodetype=int, create_using=nx.Graph())
        if not nx.is_connected(g):
            g = _largest_component(g)
        # node labels are deliberately left untouched here so they keep matching the
        # ids used in the attribute files
        g.name = name

    g_new = LightMultiGraph()
    g_new.add_edges_from(g.edges())

    # a node attribute is a list of "colors"
    if path_node_attrs != '':
        node_attrs = {int(v): [attr] for v, attr in _records(path_node_attrs, 2)}
        nx.set_node_attributes(g_new, node_attrs, 'node_colors')

    # an edge attribute is a list of "colors"
    if path_edge_attrs != '':
        edge_attrs = {(int(u), int(v)): [attr] for u, v, attr in _records(path_edge_attrs, 3)}
        nx.set_edge_attributes(g_new, edge_attrs, 'edge_colors')

    # an edge timestamp is a floating-point number
    if path_timestamps != '':
        edge_attrs = {(int(u), int(v)): float(timestamp)
                      for u, v, timestamp in _records(path_timestamps, 3)}
        nx.set_edge_attributes(g_new, edge_attrs, 'timestamp')

    end_time = time() - start_time
    print(
        f'Graph: {name}, n = {g.order():_d}, m = {g.size():_d} read in {round(end_time, 3):_g}s.'
    )
    return g_new