def base_graph_edgelist_to_prod_rules(pickle_fname):
    """
    If the largest connected component (lcc) has more than 500 nodes,
    sample it twice at 300 nodes each:
        lcc1, lcc2 <- sample_graph(g, 2, 300)
        edgelist   <- lcc1, lcc2
        prs_out    <- tree1, tree2
    :param pickle_fname: path to a gpickled graph
    :return: list of temp edgelist filenames
    """
    Info("base_graph_edgelist_to_prod_rules")
    G = nx.read_gpickle(pickle_fname)
    subgraph = max(nx.connected_component_subgraphs(G), key=len)
    results = []
    if subgraph.number_of_nodes() > 500:
        for k, Gprime in enumerate(gs.rwr_sample(subgraph, 2, 300)):  # ret generator
            print k
            # splitext drops the '.p' extension; the original rstrip('.p') would
            # also strip legitimate trailing 'p' characters from the basename
            gname = os.path.splitext(os.path.basename(pickle_fname))[0]
            Gprime.name = gname
            cc_fname = write_tmp_edgelist(Gprime, k)  # subgraph to temp edgelist
            results.append(cc_fname)
    else:
        cc_fname = write_tmp_edgelist(G)
        results.append(cc_fname)
    return results
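# Usage sketch: the pickle path below is hypothetical; it assumes a gpickled
# graph produced by the --orig step and that write_tmp_edgelist returns the
# path of the edgelist it writes.
#
#   cc_files = base_graph_edgelist_to_prod_rules("../datasets/karate.p")
#   for fn in cc_files:
#       print fn  # one temp edgelist per sampled subgraph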
def ref_graph_largest_conn_componet(fname):
    df = Pandas_DataFrame_From_Edgelist([fname])[0]
    G = nx.from_pandas_dataframe(df, source='src', target='trg')
    Gc = max(nx.connected_component_subgraphs(G), key=len)
    gname = graph_name(fname)
    num_nodes = Gc.number_of_nodes()
    subg_fnm_lst = []

    ## sample the graph if the lcc has 500 nodes or more
    if num_nodes >= 500:
        cnt = 0
        for Gprime in gs.rwr_sample(Gc, 2, 300):  # sample the lcc (the original sampled G)
            subg_fnm_lst.append('.{}_lcc_{}.edl'.format(gname, cnt))
            try:
                nx.write_edgelist(Gprime, '.{}_lcc_{}.edl'.format(gname, cnt), data=False)
                cnt += 1
            except Exception, e:
                print (str(e), '\n!!Error writing to disk')
                return ""
    return subg_fnm_lst
def nx_edges_to_nddgo_graph_sampling(graph, n, m, peo_h):
    G = graph
    if n is None and m is None:
        return
    nbr_nodes = 256
    basefname = 'datasets/{}_{}'.format(G.name, peo_h)

    # number of samples: about a quarter of the nodes, in chunks of nbr_nodes
    K = int(math.ceil(.25 * G.number_of_nodes() / nbr_nodes))

    for j, Gprime in enumerate(gs.rwr_sample(G, K, nbr_nodes)):
        # (earlier revisions wrote each sample to /tmp/<gname><j>.tsv instead)
        edges = Gprime.edges()
        edges = [(int(e[0]), int(e[1])) for e in edges]
        df = pd.DataFrame(edges)
        df.sort_values(by=[0], inplace=True)

        ofname = basefname + "_{}.dimacs".format(j)
        if os.path.exists(ofname):
            break

        with open(ofname, 'w') as f:
            f.write('c {}\n'.format(G.name))
            f.write('p edge\t{}\t{}\n'.format(n, m))
            output_edges = lambda x: f.write("e\t{}\t{}\n".format(x[0], x[1]))
            df.apply(output_edges, axis=1)
        if os.path.exists(ofname):
            print('Wrote: {}'.format(ofname))

    return basefname
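# Usage sketch (hypothetical graph; 'mcs' is one of the variable-elimination
# heuristics listed in new_main below). This writes datasets/karate_mcs_<j>.dimacs
# for each random-walk sample and returns the common basename:
#
#   G = nx.karate_club_graph()
#   G.name = "karate"
#   base = nx_edges_to_nddgo_graph_sampling(G, G.number_of_nodes(),
#                                           G.number_of_edges(), peo_h="mcs")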
def get_sampled_gpickled_graphs(G):
    G.remove_edges_from(G.selfloop_edges())
    print ([x.number_of_nodes() for x in sorted(nx.connected_component_subgraphs(G), key=len)])
    giant_nodes = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, giant_nodes)
    num_nodes = G.number_of_nodes()
    graph_checks(G)

    prod_rules = {}
    K = 2
    n = 300
    j = 0
    gn = G.name  # `gn` was unbound in the original; use the graph's name

    if G.number_of_nodes() > 500:
        for Gprime in rwr_sample(G, K, n):
            nx.write_gpickle(Gprime, "../datasets/{}_{}.p".format(gn, str(j)))
            T = quickbb(Gprime)
            root = list(T)[0]
            T = make_rooted(T, root)
            T = binarize(T)
            root = list(T)[0]
            root, children = T
            # td.new_visit(T, G, prod_rules, TD)
            new_visit(T, G, prod_rules)
            j += 1
    else:
        nx.write_gpickle(G, "../datasets/{}.p".format(gn))
        T = quickbb(G)
        root = list(T)[0]
        T = make_rooted(T, root)
        T = binarize(T)
        root = list(T)[0]
        root, children = T
        # td.new_visit(T, G, prod_rules, TD)
        new_visit(T, G, prod_rules)

    # the original commented this return out, leaving the function with no result
    return prod_rules
def get_hrg_production_rules(edgelist_data_frame,
                             graph_name,
                             tw=False,
                             trials=10,
                             n_subg=2,
                             n_nodes=300,
                             nstats=False):
    from core.growing import derive_prules_from

    t_start = time.time()
    df = edgelist_data_frame
    if df.shape[1] == 4:
        G = nx.from_pandas_dataframe(df, 'src', 'trg', edge_attr=True)  # whole graph
    elif df.shape[1] == 3:
        G = nx.from_pandas_dataframe(df, 'src', 'trg', ['ts'])  # whole graph
    else:
        G = nx.from_pandas_dataframe(df, 'src', 'trg')
    G.name = graph_name
    print "==> read in graph took: {} seconds".format(time.time() - t_start)

    G.remove_edges_from(G.selfloop_edges())
    giant_nodes = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, giant_nodes)
    num_nodes = G.number_of_nodes()
    phrg.graph_checks(G)

    if DBG: print
    if DBG: print "--------------------"
    if not DBG: print "-Tree Decomposition-"
    if DBG: print "--------------------"

    prod_rules = {}
    K = n_subg
    n = n_nodes
    if num_nodes >= 500:
        print 'Grande'
        t_start = time.time()
        for Gprime in gs.rwr_sample(G, K, n):
            T = td.quickbb(Gprime)
            root = list(T)[0]
            T = td.make_rooted(T, root)
            T = phrg.binarize(T)
            root = list(T)[0]
            root, children = T
            # td.new_visit(T, G, prod_rules, TD)
            td.new_visit(T, G, prod_rules)
            # Process(target=td.new_visit, args=(T, G, prod_rules,)).start()
            # (commented out: a child process cannot update prod_rules in this
            #  process, so the Process call duplicated work with no effect)
    else:
        T = td.quickbb(G)
        root = list(T)[0]
        T = td.make_rooted(T, root)
        T = phrg.binarize(T)
        root = list(T)[0]
        root, children = T
        # td.new_visit(T, G, prod_rules, TD)
        td.new_visit(T, G, prod_rules)
    # print_treewidth(T)

    if DBG: print
    if DBG: print "--------------------"
    if DBG: print "- Production Rules -"
    if DBG: print "--------------------"

    for k in prod_rules.iterkeys():
        if DBG: print k
        s = 0
        for d in prod_rules[k]:
            s += prod_rules[k][d]
        for d in prod_rules[k]:
            # normalization step: turn rule counts into probabilities
            prod_rules[k][d] = float(prod_rules[k][d]) / float(s)
            if DBG: print '\t -> ', d, prod_rules[k][d]

    rules = []
    id = 0
    for k, v in prod_rules.iteritems():
        sid = 0
        for x in prod_rules[k]:
            rhs = re.findall("[^()]+", x)
            rules.append(("r%d.%d" % (id, sid),
                          "%s" % re.findall("[^()]+", k)[0],
                          rhs,
                          prod_rules[k][x]))
            if DBG:
                print ("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0],
                       rhs, prod_rules[k][x])
            sid += 1
        id += 1

    df = pd.DataFrame(rules)
    # (disabled) save the rules table:
    # df.to_csv('ProdRules/{}_prs.tsv'.format(G.name), header=False, index=False, sep="\t")

    # Graph generation of synthetic graphs: grow graphs using the union of
    # rules from the sampled subgraphs to match the order (number of nodes)
    # of the original graph.
    hStars = grow_exact_size_hrg_graphs_from_prod_rules(
        rules, graph_name, G.number_of_nodes(), trials)
    print '... hStars graphs:', len(hStars)

    if not os.path.exists(r"Results/"):
        os.makedirs(r"Results/")
    with open(r"Results/{}_hstars.pickle".format(graph_name), "wb") as output_file:
        cPickle.dump(hStars, output_file)
    if os.path.exists(r"Results/{}_hstars.pickle".format(graph_name)):
        print "File saved"
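# Usage sketch: the edgelist path is hypothetical, and this assumes the same
# Pandas_DataFrame_From_Edgelist loader used elsewhere in this module:
#
#   df = tdf.Pandas_DataFrame_From_Edgelist(["../datasets/karate.txt"])[0]
#   get_hrg_production_rules(df, "karate", trials=5)
#   # -> writes Results/karate_hstars.pickle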
def new_main(args):
    if not (args['base'] is None):
        Info("<- converts to dimacs")
        gn = graph_name(args['base'][0])
        f = "../datasets/" + gn + "*.p"
        files = glob(f)
        dimacs_lst = transform_edgelist_to_dimacs(files)
        results = []
        trees = explode_to_trees(dimacs_lst, results)
        pp.pprint(files)
        pp.pprint(dimacs_lst)
        pp.pprint(trees)
        print
        pp.pprint(results)
        exit(0)
    elif not (args['orig'] is None):
        Info("<- converts edgelist to gpickle")
        f = args['orig'][0]
        g = load_edgelist(f)  # full graph
        Info("# of conn comp: %d" % len(list(nx.connected_component_subgraphs(g))))
        g = largest_conn_comp(f)  # largest conn comp, or a list of sampled subgraphs
        if isinstance(g, list):
            for k, Gprime in enumerate(g):
                subg_out_fname = max(graph_name(f).split("."), key=len)
                subg_out_fname = "../datasets/" + subg_out_fname
                subg_out_fname += "_{}.p".format(k)
                nx.write_gpickle(Gprime, subg_out_fname)  # sampled subgraph to gpickle
                if os.path.exists(subg_out_fname):
                    Info("Wrote %s" % subg_out_fname)
        else:
            subg_out_fname = max(graph_name(f).split("."), key=len)
            subg_out_fname = "../datasets/" + subg_out_fname
            subg_out_fname += ".p"
            nx.write_gpickle(g, subg_out_fname)
            if os.path.exists(subg_out_fname):
                Info("Wrote %s" % subg_out_fname)
        print("done")
        exit()
    elif not (args['edgelist2dimacs'] is None):
        f = args['edgelist2dimacs'][0]
        pfname = graph_name(f)
        pfname = "../datasets/{}.p".format(pfname)
        if not os.path.exists(pfname):
            Info("File not found, please run:")
            Info("  python explodingTree.py --orig path/to/edgelist")
        G = load_edgelist(f)
        subgraph = max(nx.connected_component_subgraphs(G), key=len)
        gprime_lst = []
        if subgraph.number_of_nodes() > 500:
            for j, Gprime in enumerate(gs.rwr_sample(subgraph, 2, 300)):
                Gprime.name = G.name + "_%d" % j
                gprime_lst.append(convert_graph_obj_2dimacs([Gprime]))
        print [x for x in gprime_lst]
    elif not (args['prules'] is None):
        gn = graph_name(args['prules'][0])
        print gn
        f = "../datasets/" + gn + "*.tree"
        files = glob(f)
        f = "../datasets/" + gn + "*.p"
        graphs = glob(f)
        for g in graphs:
            for f in files:
                dimacs_td_ct_fast(g, f)  # dimacs to tree (decomposition)
        exit(0)
    elif not (args['td'] is None):
        origG = args['td'][0]
        dimacs_f = glob("../datasets/" + graph_name(args['td'][0]) + "*.dimacs")
        # "Explode to trees" using several variable-elimination heuristics
        var_els = ['mcs', 'mind', 'minf', 'mmd', 'lexm', 'mcsm']
        for j, f in enumerate(dimacs_f):
            print f
            gn = graph_name(f)
            dimacs_file = "../datasets/{}.dimacs".format(gn)
            p = mp.Pool(processes=2)
            for vael in var_els:
                p.apply_async(dimacs_nddgo_tree_simple,
                              args=(dimacs_file, vael,),
                              callback=collect_results_trees)
                # xt.dimacs_nddgo_tree_simple(f, vael)
            p.close()
            p.join()
        # dimacs_td_ct_fast(origG, tdfname)  # dimacs to tree (decomposition)
    else:
        sys.exit(0)

    # Unreachable in the original (every path above exits, and `f` is unbound here):
    # dimacs_convert_orig_graph(args['orig'])
    # pickle_fname = "../datasets/" + f + ".p"
    # g = nx.read_gpickle(pickle_fname)
    # subgraph = max(nx.connected_component_subgraphs(g), key=len)
    # if subgraph.number_of_nodes() > 500:
    #     for Gprime in gs.rwr_sample(subgraph, 2, 300):
    #         edgelist_in_dimacs_out(Gprime)
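# Typical pipeline order, inferred from the branches above (the script name
# comes from the usage hint printed in the edgelist2dimacs branch):
#
#   python explodingTree.py --orig   path/to/edgelist   # edgelist -> gpickle(s)
#   python explodingTree.py --base   path/to/edgelist   # gpickles -> dimacs -> trees
#   python explodingTree.py --td     path/to/edgelist   # dimacs -> tree decompositions
#   python explodingTree.py --prules path/to/edgelist   # trees -> production rules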
def get_phrg_production_rules(argmnts):
    args = argmnts

    t_start = time.time()
    df = tdf.Pandas_DataFrame_From_Edgelist(args['orig'])[0]
    if df.shape[1] == 4:
        G = nx.from_pandas_dataframe(df, 'src', 'trg', edge_attr=True)  # whole graph
    elif df.shape[1] == 3:
        G = nx.from_pandas_dataframe(df, 'src', 'trg', ['ts'])  # whole graph
    else:
        G = nx.from_pandas_dataframe(df, 'src', 'trg')
    G.name = graph_name(args['orig'][0])
    print "==> read in graph took: {} seconds".format(time.time() - t_start)

    G.remove_edges_from(G.selfloop_edges())
    giant_nodes = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, giant_nodes)
    num_nodes = G.number_of_nodes()
    phrg.graph_checks(G)

    if DBG: print
    if DBG: print "--------------------"
    if not DBG: print "-Tree Decomposition-"
    if DBG: print "--------------------"

    prod_rules = {}
    K = 2
    n = 300
    if num_nodes >= 500:
        print 'Grande'
        t_start = time.time()
        for Gprime in gs.rwr_sample(G, K, n):
            T = td.quickbb(Gprime)
            root = list(T)[0]
            T = td.make_rooted(T, root)
            T = phrg.binarize(T)
            root = list(T)[0]
            root, children = T
            # td.new_visit(T, G, prod_rules, TD)
            td.new_visit(T, G, prod_rules)
            # Process(target=td.new_visit, args=(T, G, prod_rules,)).start()
            # (commented out; see the note in get_hrg_production_rules)
    else:
        T = td.quickbb(G)
        root = list(T)[0]
        T = td.make_rooted(T, root)
        T = phrg.binarize(T)
        root = list(T)[0]
        root, children = T
        # td.new_visit(T, G, prod_rules, TD)
        td.new_visit(T, G, prod_rules)
    # print_treewidth(T)  # TODO: needs to be fixed

    if DBG: print
    if DBG: print "--------------------"
    if DBG: print "- Production Rules -"
    if DBG: print "--------------------"

    for k in prod_rules.iterkeys():
        if DBG: print k
        s = 0
        for d in prod_rules[k]:
            s += prod_rules[k][d]
        for d in prod_rules[k]:
            # normalization step: turn rule counts into probabilities
            prod_rules[k][d] = float(prod_rules[k][d]) / float(s)
            if DBG: print '\t -> ', d, prod_rules[k][d]

    rules = []
    id = 0
    for k, v in prod_rules.iteritems():
        sid = 0
        for x in prod_rules[k]:
            rhs = re.findall("[^()]+", x)
            rules.append(("r%d.%d" % (id, sid),
                          "%s" % re.findall("[^()]+", k)[0],
                          rhs,
                          prod_rules[k][x]))
            if DBG:
                print ("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0],
                       rhs, prod_rules[k][x])
            sid += 1
        id += 1

    df = pd.DataFrame(rules)
    # pp.pprint(df.values.tolist()); exit()

    df.to_csv('../ProdRules/{}.tsv.phrg.prs'.format(G.name),
              header=False, index=False, sep="\t")
    if os.path.exists('../ProdRules/{}.tsv.phrg.prs'.format(G.name)):
        print 'Saved', '../ProdRules/{}.tsv.phrg.prs'.format(G.name)
    else:
        print "Trouble saving"
    print "-----------"
    print [type(x) for x in rules[0]]
def get_hrg_production_rules_given(G, tw=False, n_subg=2, n_nodes=300, nstats=False):
    G.remove_edges_from(G.selfloop_edges())
    giant_nodes = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, giant_nodes)
    num_nodes = G.number_of_nodes()
    phrg.graph_checks(G)

    if DBG: print
    if DBG: print "--------------------"
    if not DBG: print "-Tree Decomposition-"
    if DBG: print "--------------------"

    prod_rules = {}
    K = n_subg
    n = n_nodes
    if num_nodes >= 500:
        print 'Grande'
        t_start = time.time()
        for Gprime in gs.rwr_sample(G, K, n):
            T = td.quickbb(Gprime)
            root = list(T)[0]
            T = td.make_rooted(T, root)
            T = phrg.binarize(T)
            root = list(T)[0]
            root, children = T
            # td.new_visit(T, G, prod_rules, TD)
            td.new_visit(T, G, prod_rules)
            # Process(target=td.new_visit, args=(T, G, prod_rules,)).start()
            # (commented out; see the note in get_hrg_production_rules)
    else:
        T = td.quickbb(G)
        root = list(T)[0]
        T = td.make_rooted(T, root)
        T = phrg.binarize(T)
        root = list(T)[0]
        root, children = T
        # td.new_visit(T, G, prod_rules, TD)
        td.new_visit(T, G, prod_rules)
    # print_treewidth(T)
    # exit()  # commented out: the original's bare exit() here killed the
    #         # process before any rules were extracted or saved

    if DBG: print
    if DBG: print "--------------------"
    if DBG: print "- Production Rules -"
    if DBG: print "--------------------"

    for k in prod_rules.iterkeys():
        if DBG: print k
        s = 0
        for d in prod_rules[k]:
            s += prod_rules[k][d]
        for d in prod_rules[k]:
            # normalization step: turn rule counts into probabilities
            prod_rules[k][d] = float(prod_rules[k][d]) / float(s)
            if DBG: print '\t -> ', d, prod_rules[k][d]

    rules = []
    id = 0
    for k, v in prod_rules.iteritems():
        sid = 0
        for x in prod_rules[k]:
            rhs = re.findall("[^()]+", x)
            rules.append(("r%d.%d" % (id, sid),
                          "%s" % re.findall("[^()]+", k)[0],
                          rhs,
                          prod_rules[k][x]))
            if DBG:
                print ("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0],
                       rhs, prod_rules[k][x])
            sid += 1
        id += 1

    df = pd.DataFrame(rules)

    # Graph generation of synthetic graphs: grow graphs using the union of
    # rules from the sampled subgraphs to predict the order of the original
    # graph ("exact" size changed to the fixed/infinity variant below).
    # G.name replaces the original's bare `graph_name`, which referred to the
    # module-level function and would raise a TypeError.
    # hStars = grow_exact_size_hrg_graphs_from_prod_rules(rules, G.name, G.number_of_nodes(), 10)
    hStars = grow_hrg_graphs_with_infinity(rules, G.name, G.number_of_nodes(), 10, rnbr=1)
    print '... hStars graphs:', len(hStars)

    d = {G.name + "_hstars": hStars}
    with open(r"Results/{}_hstars.pickle".format(G.name), "wb") as output_file:
        cPickle.dump(d, output_file)
    if os.path.exists(r"Results/{}_hstars.pickle".format(G.name)):
        print "File saved"
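# Usage sketch (hypothetical graph; the input must pass phrg.graph_checks, and
# the Results/ directory is assumed to exist, as in the function above):
#
#   G = nx.karate_club_graph()
#   G.name = "karate"
#   get_hrg_production_rules_given(G, n_subg=2, n_nodes=300)
#   # -> writes Results/karate_hstars.pickle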