def test_pg_hypothesis_checking_batch():
    """Decide the property "B occurs on the path" by sequential sampling.

    Paths of length 4 from 'A' to 'E' are drawn one at a time from a paths
    graph. Each path is checked against the formula ``F([B])`` and the
    growing list of truth values is handed to a HypothesisTester until the
    tester returns a decision.
    """
    edges = (('A', 'B'), ('A', 'C'), ('C', 'D'), ('B', 'D'),
             ('D', 'B'), ('D', 'C'), ('B', 'E'), ('C', 'E'))
    graph = nx.DiGraph()
    graph.add_edges_from(edges)
    # For reference, the length-4 paths from A to E in this graph are
    #   ('A', 'B', 'D', 'B', 'E')
    #   ('A', 'B', 'D', 'C', 'E')
    #   ('A', 'C', 'D', 'B', 'E')
    #   ('A', 'C', 'D', 'C', 'E')
    # so three of them contain 'B' and one does not.
    source, target, length = ('A', 'E', 4)
    pg = PathsGraph.from_graph(graph, source, target, length)

    # Property to verify: B appears somewhere on the path.
    formula = 'F([B])'

    # Hypothesis test: the formula holds for a randomly sampled path with
    # probability at least 0.2. Type-I and Type-II error rates are both
    # 0.1, with an indifference region of 0.01 around the value 0.2.
    ht = HypothesisTester(0.2, 0.1, 0.1, 0.01)

    # Draw and check one path per iteration; stop as soon as the evidence
    # gathered so far is enough for the tester to decide.
    samples = []
    hyp = None
    while hyp is None:
        path = pg.sample_paths(1)[0]
        samples.append(PathChecker(formula, path).truth)
        hyp = ht.test(samples)

    # NOTE(review): 0 is presumably the outcome meaning the hypothesis was
    # accepted - confirm against HypothesisTester.test.
    assert hyp == 0
def scaling_random_graphs(num_samples, min_size, max_size, edge_prob=0.5):
    """Time three path computations on random directed graphs of growing size.

    For every node count from ``min_size`` to ``max_size`` (inclusive),
    ``num_samples`` directed Erdos-Renyi graphs are generated and, for each
    graph, three wall-clock durations are recorded between node 0 (source)
    and the last node (target):

    1. enumerating all simple paths with networkx and computing path
       probabilities through a PathsTree;
    2. building one PathsGraph per path length, combining them and counting
       paths;
    3. step 2 plus converting every PathsGraph into a CFPG (this timer
       shares its starting point with step 2, so it is cumulative).

    Parameters
    ----------
    num_samples : int
        Number of random graphs generated per graph size.
    min_size : int
        Smallest number of nodes.
    max_size : int
        Largest number of nodes (inclusive).
    edge_prob : float
        Edge probability passed to ``nx.erdos_renyi_graph``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(times_nx_paths, times_pg, times_cfpg)``, each with one row per
        graph size and one column per sample, holding elapsed seconds.
    """
    data_shape = (max_size - min_size + 1, num_samples)
    times_nx_paths = np.empty(data_shape)
    times_pg = np.empty(data_shape)
    times_cfpg = np.empty(data_shape)

    for i, num_nodes in enumerate(range(min_size, max_size+1)):
        print(f'Number of nodes in network: {num_nodes}')
        for j in range(num_samples):
            print(f'Sample {j}')
            rg = nx.erdos_renyi_graph(num_nodes, edge_prob, directed=True)
            # The first and last nodes serve as source and target
            source, target = 0, num_nodes - 1

            # --- 1. networkx enumeration plus path probabilities ---
            nx_start = time.time()
            paths = [tuple(p)
                     for p in nx.all_simple_paths(rg, source, target)]
            # The result is not used; the call is made so its cost is timed
            PathsTree(paths).path_probabilities()
            times_nx_paths[i, j] = time.time() - nx_start

            # --- 2. paths graphs for every length, combined ---
            pg_start = time.time()
            f_level, b_level = get_reachable_sets(rg, source, target,
                                                  num_nodes)
            pg_list = [
                PathsGraph.from_graph(rg, source, target, length,
                                      f_level, b_level)
                for length in range(1, num_nodes)
            ]
            combined_pg = CombinedPathsGraph(pg_list)
            # NOTE(review): an earlier note here said the combined graph has
            # no count_paths method - confirm this call is valid.
            total_paths = combined_pg.count_paths()
            print(f'Total paths (with cycles): {total_paths}')
            times_pg[i, j] = time.time() - pg_start

            # --- 3. CFPGs; timer still runs from pg_start ---
            cfpg_list = [CFPG.from_pg(pg) for pg in pg_list]
            times_cfpg[i, j] = time.time() - pg_start

    return times_nx_paths, times_pg, times_cfpg
def run_pg_vs_nx(graph, source, target, depth, num_samples):
    """Build paths graphs up to ``depth``, sample paths from them, and time it.

    One PathsGraph is built for every path length from 1 to ``depth``; the
    graphs are merged into a CombinedPathsGraph, from which paths are sampled
    in chunks of 100 until at least ``num_samples`` paths were collected (or
    the sampler returns nothing). The networkx enumeration that this was
    meant to be compared against is currently disabled, so the networkx
    entries of the result are empty placeholders.

    Parameters
    ----------
    graph
        Graph handed to ``get_reachable_sets`` and ``PathsGraph.from_graph``.
    source
        Source node.
    target
        Target node.
    depth : int
        Maximum path length.
    num_samples : int
        Minimum number of paths to sample. Because sampling happens in
        chunks of 100, more paths than requested may be returned.

    Returns
    -------
    dict
        ``pg_list`` (the PathsGraph objects), ``pg_paths`` (sampled paths),
        ``nx_paths`` and ``nx_paths_sampled`` (always empty lists),
        ``pg_time`` (seconds spent building and sampling) and ``nx_time``
        (always 0).
    """
    # PG sampling
    start = time.time()
    f_level, b_level = get_reachable_sets(graph, source, target, depth)
    pg_list = []
    for i in range(1, depth + 1):
        pg = PathsGraph.from_graph(graph, source, target, i, f_level, b_level)
        pg_list.append(pg)
    combined_pg = CombinedPathsGraph(pg_list)
    print("Sampling from PG")
    cf_paths = []
    while len(cf_paths) < num_samples:
        print(f'{len(cf_paths)} / {num_samples}')
        cf_path_chunk = combined_pg.sample_paths(100)
        # Nothing can be sampled (e.g. no path exists): stop instead of
        # spinning forever.
        if not cf_path_chunk:
            break
        # Accumulate the chunk; without this the loop condition never
        # changes and the loop does not terminate.
        # NOTE(review): sample_paths is kept from the original; the variable
        # name suggests cycle-free paths may be intended - confirm whether
        # filtering or sample_cf_paths should be used here.
        cf_paths.extend(cf_path_chunk)
    end = time.time()
    print("Done generating PGs")
    pg_elapsed = end - start

    # Networkx enumeration (disabled). The previous implementation is kept
    # below for reference:
    #
    # index = 0
    # start = time.time()
    # for p in nx.all_simple_paths(graph, source, target, cutoff=depth):
    #     nx_paths.append(tuple(p))
    #     if index % 10000 == 0:
    #         print(index)
    #     index += 1
    # #print("Making PathsTree")
    # #paths_tree = PathsTree(nx_paths)
    # #print("Sampling PathsTree")
    # #nx_sampled_paths = paths_tree.sample(num_samples)
    # end = time.time()
    # nx_elapsed = end - start
    # #assert set(cf_paths) <= set(nx_paths)
    # print("all_simple_paths done")
    # print("Total paths (nx):", len(nx_paths))
    # print("Unique sampled paths (pg):", len(set(cf_paths)))
    # #print("Unique sampled_paths (tree):", len(set(nx_sampled_paths)))
    # print("NX time", nx_elapsed)
    # print("PG time", pg_elapsed)
    nx_paths = []
    nx_sampled_paths = []
    nx_elapsed = 0

    return {
        'pg_list': pg_list,
        'pg_paths': cf_paths,
        'nx_paths': nx_paths,
        'nx_paths_sampled': nx_sampled_paths,
        'pg_time': pg_elapsed,
        'nx_time': nx_elapsed,
    }
def from_graph(klass, *args, **kwargs):
    """Build a CFPG from a graph by way of an intermediate PathsGraph.

    All positional and keyword arguments are forwarded unchanged to
    ``PathsGraph.from_graph``; the resulting paths graph is then converted
    with ``klass.from_pg``.

    Parameters
    ----------
    g : networkx.DiGraph
        The underlying graph on which paths will be generated.
    source : str
        Name of the source node.
    target : str
        Name of the target node.
    length : int
        Length of paths to compute.
    fwd_reachset : Optional[dict]
        Dictionary of sets representing the forward reachset computed over
        the original graph g up to a maximum depth greater than the
        requested path length. If not provided, the forward reach set is
        calculated up to the requested path length by calling
        paths_graph.get_reachable_sets.
    back_reachset : Optional[dict]
        Dictionary of sets representing the backward reachset computed over
        the original graph g up to a maximum depth greater than the
        requested path length. If not provided, the backward reach set is
        calculated up to the requested path length by calling
        paths_graph.get_reachable_sets.
    signed : bool
        Specifies whether the underlying graph and the corresponding
        f_level and b_level reachable sets have signed edges. If True, sign
        information should be encoded in the 'sign' field of the edge data,
        with 0 indicating a positive edge and 1 indicating a negative edge.
    target_polarity : 0 or 1
        Specifies the polarity of the target node, i.e. whether the desired
        path from source to target is positive/activation (0) or
        negative/inhibition (1).

    Returns
    -------
    CFPG
        Instance of CFPG class representing cycle-free paths from source to
        target with a given length and overall polarity.
    """
    # The paths graph is built directly; an earlier PreCFPG-based
    # construction (PreCFPG.from_graph) is no longer used here.
    return klass.from_pg(PathsGraph.from_graph(*args, **kwargs))
# Reach sets are computed once, up to the maximum depth, and shared by every
# per-length paths graph below.
print("Getting reachable sets")
fwd_reach, back_reach = get_reachable_sets(
    g, source, target, max_depth, signed=False)

# One unsigned paths graph for each path length 1..max_depth
print("Building PG")
pg_list = []
for cur_length in range(1, max_depth + 1):
    print("Building paths graph for length %d" % cur_length)
    pg = PathsGraph.from_graph(
        g, source, target, cur_length, fwd_reach, back_reach,
        signed=False, target_polarity=0)
    pg_list.append(pg)

print("Building combined paths graph")
cpg = CombinedPathsGraph(pg_list)

print("Sampling %d paths" % num_samples)
paths = cpg.sample_cf_paths(num_samples)

# (path, count) pairs ordered from most to least frequently sampled
path_ctr = Counter(paths).most_common()
def run_pg_cfpg(rg, source, target):
    """Time a sequential hypothesis test on PG samples and on CFPG samples.

    Paths graphs are built for every length from 1 to ``len(rg) - 1`` and
    combined. Cycle-free paths are then sampled from the combined paths
    graph in batches of 10 and checked with ``exists_property(path, 5)``
    until a HypothesisTester reaches a decision. The same procedure is
    repeated on a CombinedCFPG built from the same paths graphs.

    Parameters
    ----------
    rg
        Graph to analyse; ``len(rg)`` is used as the number of nodes.
    source
        Source node.
    target
        Target node.

    Returns
    -------
    tuple of float
        ``(pg_elapsed, cfpg_elapsed)`` in seconds. Both are measured from
        the same starting point, so the second value includes the first.
    """
    batch = 10

    def _sample_until_decided(draw):
        # Draw batches with `draw` until the tester decides; return the
        # decision and the number of samples counted. An empty batch is
        # treated as decision 0 without counting further samples.
        tester = HypothesisTester(0.5, 0.1, 0.1, 0.05)
        outcomes = []
        drawn = 0
        while True:
            chunk = draw(batch)
            if not chunk:
                return 0, drawn
            outcomes.extend(exists_property(p, 5) for p in chunk)
            drawn += batch
            verdict = tester.test(outcomes)
            if verdict is not None:
                return verdict, drawn

    num_nodes = len(rg)

    # Build the paths graphs and the combined paths graph
    pg_start = time.time()
    f_level, b_level = get_reachable_sets(rg, source, target, num_nodes)
    pg_list = [
        PathsGraph.from_graph(rg, source, target, length, f_level, b_level)
        for length in range(1, num_nodes)
    ]
    combined_pg = CombinedPathsGraph(pg_list)

    tf, nsamples = _sample_until_decided(combined_pg.sample_cf_paths)
    print(f'PG: {tf} based on {nsamples} samples')
    pg_elapsed = time.time() - pg_start
    print(f'PG: {pg_elapsed:.2f}s')

    # Now the same test on the cycle-free paths graphs
    cfpg_list = [CFPG.from_pg(pg) for pg in pg_list]
    ccfpg = CombinedCFPG(cfpg_list)
    print('Sampling CFPG')
    tf, nsamples = _sample_until_decided(ccfpg.sample_paths)
    print(f'CFPG: {tf} based on {nsamples} samples')
    # Measured from pg_start, i.e. cumulative with the PG phase
    cfpg_elapsed = time.time() - pg_start
    print(f'CFPG: {cfpg_elapsed:.2f}s')

    return pg_elapsed, cfpg_elapsed
target = chek2_node depth = 6 num_samples = 1000 f_level, b_level = get_reachable_sets(graph, source, target, depth, signed=True) pg_list = [] for i in range(1, depth + 1): pg = PathsGraph.from_graph(graph, source, target, i, f_level, b_level, signed=True, target_polarity=1) pg_list.append(pg) combined_pg = CombinedPathsGraph(pg_list) cf_paths = combined_pg.sample_cf_paths(num_samples) """ dist = get_node_distribution(cf_paths, None, None) dist_filt = [(n[0][0][2], n[1]) for n in dist] dist_filt = [n for n in dist_filt if n[0] not in ['o', 'SRC', 'CHEK2']] node_dist = dist_filt str_names, freqs = zip(*node_dist) num_genes = 30 plt.ion()
# Add a dummy source and a dummy target: 'SOURCE' gets an edge to every
# entry of source_list and every entry of target_list gets an edge to
# 'TARGET', so a single paths graph covers all source/target pairs.
graph_file = '../input/july_2018_pa_HGNC_FPLX_typed_directional_pairs.tsv'
graph = load_stmt_graph(graph_file)
dummy_edges = [('SOURCE', src[1]) for src in source_list]
dummy_edges += [(tgt[1], 'TARGET') for tgt in target_list]
graph.add_edges_from(dummy_edges)

max_depth = 8
pg_list = []
lengths = []
stmt_counts = []
# Reach sets are computed once and reused for every path length
f_level, b_level = get_reachable_sets(graph, 'SOURCE', 'TARGET', max_depth)
for length in range(3, max_depth + 1):
    pg = PathsGraph.from_graph(graph, 'SOURCE', 'TARGET', length,
                               fwd_reachset=f_level, back_reachset=b_level)
    stmt_hashes = get_stmt_hashes_from_pg(graph, pg)
    # Reported lengths are reduced by 2, presumably to discount the two
    # dummy edges at either end of every path.
    print("%d stmts for paths of length %d" % (len(stmt_hashes), length - 2))
    pg_list.append(pg)
    lengths.append(length - 2)
    stmt_counts.append(len(stmt_hashes))

plt.ion()
# Plot the number of statements per path length. The y-values must be the
# accumulated counts, not the hash collection left over from the final loop
# iteration.
plt.plot(lengths, stmt_counts)
ax = plt.gca()
ax.set_yscale('log')
# Draw G draw(g, join(output_dir, 'toy_g.pdf')) depth = 4 source = 'S' target = 'T' f_level, b_level = get_reachable_sets(g, source, target, depth) draw_reachset(g, f_level, 'forward', depth, output_dir) draw_reachset(g, b_level, 'backward', depth, output_dir) print("f_level", f_level) print("b_level", b_level) pg = PathsGraph.from_graph(g, source, target, depth) draw(pg.graph, join(output_dir, 'toy_pg_%d.pdf' % depth)) # Combined paths graph pg_list = [] for i in range(1, 4+1): pg_list.append(PathsGraph.from_graph(g, source, target, i)) cpg = CombinedPathsGraph(pg_list) draw(cpg.graph, join(output_dir, 'toy_combined_pg.pdf')) # Cycle-free paths graph cfpg = CFPG.from_pg(pg) # Remove the frozensets for drawing cfpg_edges_fixed = [] for u, v in cfpg.graph.edges(): u_set = '{}' if u[2] == 0 else str(set(u[2]))