def main():
    """
    Example driver for a GUILD run.

    Prepares the scoring input files, runs the "netcombo" scoring, then
    creates 3-fold cross validation node / edge score files and runs
    "netscore" on them. All paths and parameters are hard-coded below.
    """
    network_file = "interactions.sif"
    seed_file = "seeds.txt"
    scoring_folder = "./test/"
    executable_path = "/home/emre/arastirma/netzcore/src/scoreNetwork/scoreN"
    # Shared settings of the cross validation step
    n_fold = 3
    random_seed = 123
    background_score = 0.01

    # Create input files for scoring
    prepare_scoring(
        network_file, seed_file, scoring_folder,
        non_seed_score=0.01, seed_score=1.0, edge_score=1.0,
        n_sample=100, delim=" ")

    # Run GUILD and create output files, the case for Netcombo
    run_scoring(scoring_folder, executable_path, scoring_type="netcombo")
    #run_scoring(scoring_folder, executable_path, scoring_type="netzcore", parameters={"n_iteration":5, "n_sample":100, "sampling_prefix":scoring_folder+"sampled_graph."}, qname=None)

    # Generate cross validation files: every seed and every edge gets score 1
    node_scores_file = scoring_folder + "node_scores.sif"
    edge_scores_file = scoring_folder + "edge_scores_netshort.sif"
    network = network_utilities.create_network_from_sif_file(
        network_file, use_edge_data=True)
    seeds = guild_utilities.get_nodes(seed_file)
    nodes = network.nodes()
    edges = network.edges()
    seed_to_score = dict.fromkeys(seeds, 1)
    edge_to_score = dict(((u, v), 1) for u, v in edges)
    guild_utilities.generate_cross_validation_node_score_files(
        nodes, seed_to_score, node_scores_file,
        xval=n_fold, default_score=background_score, replicable=random_seed)
    guild_utilities.generate_cross_validation_edge_score_as_node_score_files(
        edges, seed_to_score, edge_to_score, edge_scores_file,
        xval=n_fold, default_score=background_score, replicable=random_seed)

    # Run NetScore on these cross validation files
    guild_utilities.run_scoring(
        scoring_folder, executable_path, scoring_type="netscore",
        parameters={"n_iteration": 2, "n_repetition": 3},
        qname=None, calculate_pvalue=True, xval=n_fold)
def get_number_of_seed_connecting_edges():
    """
    Counts, for each phenotype, how many network edges connect seeds.

    Reads the network from DATA_DIR + "input/goh/edge_scores.sif" and, for
    every phenotype in omim_phenotypes, the seeds from
    "<phenotype>/seed_scores.sif" (first column of each non-empty line).
    Writes one line per phenotype to "edge_counts.txt" in the same folder:

        <phenotype> <seed-seed edges> <seed-nonseed edges> <nonseed-nonseed edges>

    Returns None.
    """
    from toolbox import network_utilities as gu
    #base_dir = DATA_DIR + "input/biana_no_tap_no_reliability/"
    base_dir = DATA_DIR + "input/goh/"
    network_file = base_dir + "edge_scores.sif"
    output_file = base_dir + "edge_counts.txt"
    g = gu.create_network_from_sif_file(network_file, use_edge_data=True)
    # The edge list does not change between phenotypes, fetch it only once
    edges = list(g.edges())
    with open(output_file, 'w') as f:
        for phenotype in omim_phenotypes:
            node_file = base_dir + phenotype + "/seed_scores.sif"
            with open(node_file) as node_fh:
                # A set gives constant time membership tests in the edge loop;
                # blank lines are skipped instead of raising IndexError
                seeds = set(line.split()[0] for line in node_fh if line.strip())
            n_seed = 0
            n_seed_nonseed = 0
            n_nonseed = 0
            for u, v in edges:
                u_is_seed = u in seeds
                v_is_seed = v in seeds
                if u_is_seed and v_is_seed:
                    n_seed += 1
                elif u_is_seed or v_is_seed:
                    n_seed_nonseed += 1
                else:
                    n_nonseed += 1
            f.write("%s %d %d %d\n" % (phenotype, n_seed, n_seed_nonseed, n_nonseed))
    return
def get_number_of_seed_connecting_paths():
    """
    Counts, for each phenotype, the shortest paths connecting its seeds.

    Reads the network from DATA_DIR + "input/goh/edge_scores.sif" and, for
    every phenotype in omim_phenotypes, the seeds from
    "<phenotype>/seed_scores.sif" (first column of each non-empty line).
    For every unordered seed pair all shortest paths are enumerated and one
    line per phenotype is written to "path_counts.txt":

        <phenotype> <n paths> <n paths / n seeds> <average path length>

    A ratio that is undefined (no seeds, or no path at all) is written as
    "nan" instead of aborting the run with a ZeroDivisionError.

    Returns None.
    """
    from itertools import combinations
    from toolbox import network_utilities as gu
    #base_dir = DATA_DIR + "input/biana_no_tap_no_reliability/"
    base_dir = DATA_DIR + "input/goh/"
    network_file = base_dir + "edge_scores.sif"
    output_file = base_dir + "path_counts.txt"
    g = gu.create_network_from_sif_file(network_file, use_edge_data=True)
    undefined = float("nan")
    with open(output_file, 'w') as f:
        for phenotype in omim_phenotypes:
            print(phenotype)
            node_file = base_dir + phenotype + "/seed_scores.sif"
            with open(node_file) as node_fh:
                seeds = [line.split()[0] for line in node_fh if line.strip()]
            count = 0
            path_length = 0
            # combinations() yields each pair once, in the same order as the
            # former nested "i < j" loops
            for seed1, seed2 in combinations(seeds, 2):
                # NOTE(review): if all_shortest_paths raises for unconnected
                # pairs (as the networkx function of that name does), the
                # exception still propagates from here -- confirm intent
                for path in all_shortest_paths(g, seed1, seed2):
                    count += 1
                    path_length += len(path) - 1
            paths_per_seed = count / float(len(seeds)) if seeds else undefined
            average_length = path_length / float(count) if count else undefined
            f.write("%s %d %f %f\n" % (phenotype, count, paths_per_seed, average_length))
    return
def output_edge_pvalue_file(network_file, score_file, background_file, seed_file=None, background_seed_file=None, delim=" "):
    """
    Calculates and outputs edge p-values using GUILD scores (node scores are
    first converted to edge scores and then edge p-values are calculated).

    network_file: network in sif format
    score_file: node scores of the run of interest
    background_file: node scores of the background run
    seed_file: accepted for interface compatibility, currently not used
    background_seed_file: optional file with the seeds of the background
        run; edges touching any of these nodes are left out of the
        background distribution
    delim: column delimiter of the output file (" ", by default)

    The score of an edge is the mean of the scores of its two end nodes.
    Results are written to score_file + ".edge_pval", sorted by increasing
    p-value, with the columns Id1, Id2, Score and P-value.

    Returns None.
    """
    g = network_utilities.create_network_from_sif_file(network_file)
    node_to_score = get_node_to_score(score_file)
    background_to_score = get_node_to_score(background_file)
    # Without a background seed file no edge is excluded. An empty dict keeps
    # the membership tests below valid (they used to fail on None).
    if background_seed_file is not None:
        background_seed_to_score = get_node_to_score(background_seed_file)
    else:
        background_seed_to_score = {}
    edge_to_score = {}
    background_edge_to_score = {}
    for u, v in g.edges():
        # 2.0 keeps the mean exact even if scores happen to be integers
        edge_to_score[(u, v)] = (node_to_score[u] + node_to_score[v]) / 2.0
        if u in background_seed_to_score or v in background_seed_to_score:
            continue
        background_edge_to_score[(u, v)] = (background_to_score[u] + background_to_score[v]) / 2.0
    edge_to_significance = get_significance_among_node_scores(edge_to_score, background_edge_to_score)
    # Sort by p-value first, ties are broken by the edge itself
    ranked = sorted((pval, edge) for edge, pval in edge_to_significance.items())
    with open(score_file + ".edge_pval", 'w') as f:
        f.write("Id1%sId2%sScore%sP-value\n" % (delim, delim, delim))
        for pval, edge in ranked:
            f.write("%s%s%s%s%f%s%s\n" % (edge[0], delim, edge[1], delim, edge_to_score[edge], delim, str(pval)))
    return
def calculate_proximity_multiple(parameter_file_prefix, i_start, i_end):
    """
    Runs the proximity calculation for a range of parameter files.

    parameter_file_prefix: prefix of the parameter files; the file of index
        i is parameter_file_prefix + "<i>.txt"
    i_start: first index to process (inclusive)
    i_end: last index to process (exclusive)

    The network and the degree bins are built once, from the parameter file
    of i_start, and reused for every index; the remaining values (node
    sets, output file, n_random, min_bin_size, seed) are read again from
    each index's own parameter file.

    Processing stops at the first index whose parameter file is missing.
    Indices whose output file already exists are skipped. For every other
    index a line "z d mean sd" is written to its output file, unless the
    proximity calculation returns None.

    Returns None.
    """
    network_file, nodes_from, nodes_to, out_file, min_bin_size, n_random, n_seed = get_parameters_from_file(
        parameter_file_prefix + "%s.txt" % i_start)
    network = network_util.create_network_from_sif_file(
        network_file, use_edge_data=False, delim=None, include_unconnected=True)
    bins = network_util.get_degree_binning(network, min_bin_size, lengths=None)
    for i in range(i_start, i_end):
        parameter_file = parameter_file_prefix + "%s.txt" % i
        if not os.path.exists(parameter_file):
            print("File does not exists for index (aborting):", i)
            break
        network_file, nodes_from, nodes_to, out_file, min_bin_size, n_random, n_seed = get_parameters_from_file(
            parameter_file)
        if os.path.exists(out_file):
            print("Skipping existing file for index:", i)
            continue
        print(network_file, nodes_from, nodes_to, n_random, min_bin_size, n_seed, out_file)
        values = wrappers.calculate_proximity(
            network, nodes_from=nodes_from, nodes_to=nodes_to, bins=bins,
            n_random=n_random, min_bin_size=min_bin_size, seed=n_seed)
        if values is None:
            # presumably the given nodes are not in the network -- nothing to write
            continue
        d, z, (m, s) = values
        # The context manager flushes and closes the file right away instead
        # of leaving that to the garbage collector
        with open(out_file, 'w') as f:
            f.write("%f %f %f %f\n" % (z, d, m, s))
    return
def get_differential_network(g_org, ueid_to_gene, auc_file, critical_auc):
    """
    Builds the differential network between the best and worst performing
    pruned networks.

    g_org: original network (not used here, kept for interface compatibility)
    ueid_to_gene: mapping of node ids to gene symbols; only nodes present in
        this mapping are kept in the result
    auc_file: text file with one line per sampled network, the AUC being the
        second whitespace separated column; the network of line k (0-based)
        is read from ".../sampled_graph.sif.<k+1>"
    critical_auc: not used here, kept for interface compatibility

    The two networks with the highest AUC are intersected, the two with the
    lowest AUC are intersected, and the edges of the former that are absent
    from the latter form the differential network.

    Returns the subgraph of the differential network induced by the nodes
    that have a gene mapping.
    Raises ValueError if auc_file contains no AUC value.
    """
    from functools import reduce  # builtin in Python 2, functools-only in Python 3
    from toolbox import network_utilities as gu

    def load_sampled_networks(indices):
        # real index in file name is one higher than the line index
        return [
            gu.create_network_from_sif_file(
                DATA_DIR + "human_interactome_biana/pruned/omim_breast_cancer/80/sampled_graph.sif.%d" % (i + 1),
                use_edge_data=False)
            for i in indices
        ]

    # Blank lines are deliberately not skipped: the line position is what
    # identifies the sampled network file
    with open(auc_file) as f:
        aucs = [float(line.split()[1]) for line in f]
    if not aucs:
        raise ValueError("No AUC values found in %s" % auc_file)

    # Get indices of min/max networks (network indices ordered by increasing AUC)
    indices = [i for auc, i in sorted((auc, i) for i, auc in enumerate(aucs))]
    indices_max = indices[-2:]
    indices_min = indices[:2]

    g_maxs = load_sampled_networks(indices_max)
    g_mins = load_sampled_networks(indices_min)
    print("%d %d" % (len(g_maxs), len(g_mins)))

    # Get common edges in min/max networks
    g_max = reduce(gu.networkx.intersection, g_maxs)
    g_min = reduce(gu.networkx.intersection, g_mins)

    # Get differential edges
    g_diff = gu.networkx.difference(g_max, g_min)
    print("%d %d %d" % (len(g_max.edges()), len(g_min.edges()), len(g_diff.edges())))

    nodes = set(node for node in g_diff.nodes() if node in ueid_to_gene)
    return g_diff.subgraph(nodes)
def analyze_network():
    """
    Prints degree related values of a toy network and creates the R script
    analysing it, using "v2" and "v3" as the nodes of interest.
    """
    from toolbox import network_utilities as gu
    toy_network_file = "/data/emre/toy_data/test_interactions_small.sif"
    nodes_of_interest = ("v2", "v3")

    network = gu.create_network_from_sif_file(toy_network_file)
    node_to_degree = network.degree(with_labels=True)
    # A fresh list is handed to each helper, as before
    node_to_values = gu.get_node_degree_related_values(network, list(nodes_of_interest))
    for node in network.nodes():
        print("%s %s %s" % (node, node_to_degree[node], node_to_values[node]))
    gu.create_R_analyze_network_script(network, list(nodes_of_interest))
def get_neighbors_of_nodes_in_network(network_file, node_file, output_file):
    """
    Writes the subnetwork induced by the given nodes and their neighbors.

    network_file: network in sif format (edge data is kept)
    node_file: text file with one node id per line; blank lines are ignored
    output_file: file to which the edges of the induced subnetwork are
        written, one per line, as "<node1> <edge data> <node2>"

    Returns None.
    """
    from toolbox import network_utilities as gu
    g = gu.create_network_from_sif_file(network_file, use_edge_data=True)
    with open(node_file) as node_fh:
        # Skip blank lines, they would otherwise be looked up as node ''
        nodes = [line.strip() for line in node_fh if line.strip()]
    # Neighbors first, then the nodes themselves (duplicates are harmless)
    selected = []
    for node in nodes:
        selected.extend(g.neighbors(node))
    selected.extend(nodes)
    g_sub = g.subgraph(selected)
    with open(output_file, 'w') as f:
        for u, v, data in g_sub.edges(data=True):
            # sif layout: the edge data goes between the two node ids
            f.write("%s %s %s\n" % (u, data, v))
    return
def main():
    """
    LifeArc wrapper: calculates drug proximity for the union of the ULK1 and
    ULK2 gene lists of GBM on the human protein interactome.
    """
    print("LifeArc wrapper")

    # Input data
    network_file = "/Users/woochanghwang/PycharmProjects/LifeArc/General/src_drug/Data/human_protein_interactome.sif"
    disease_gene_file = "/Users/woochanghwang/PycharmProjects/LifeArc/General/src_drug/Data/disease_genes.tsv"
    drug_target_file = "/Users/woochanghwang/PycharmProjects/LifeArc/General/src_drug/Data/drug_target_interactions.txt"

    network = nu.create_network_from_sif_file(
        network_file, use_edge_data=False, delim=None, include_unconnected=True)
    nodes = set(network.nodes())
    print("network lengths",len(nodes))

    drug_to_targets = get_drug_target_drugbank(drug_target_file, nodes=nodes)
    print(drug_to_targets)

    # Former single gene list run, kept for reference:
    # disease_to_genes, disease_to_category = get_diseasome_genes(disease_gene_file, nodes=nodes)
    # gene_list_file = "/Users/woochanghwang/PycharmProjects/LifeArc/ULK/result/GBM_ULK1_gene_score_by_RW_pvalue_FC_230119.tsv"
    # disease_to_genes, disease_to_category = get_diseasome_genes_from_selectedGenes(gene_list_file, disease_name,
    #                                                                                disease_category=None, nodes=nodes)
    # print("network edges:", network.edges())
    # output_file = "{}_drug_proximity.tsv".format(disease_name)
    # calculate_proximity_multiple(network, from_file=drug_target_file, to_file=gene_list_file, disease_mode=disease_name,
    #                              out_file=output_file)

    #######################################
    ## Temp for ULK1,ULK2
    #######################################
    gene_list_file_ulk1 = "/Users/woochanghwang/PycharmProjects/LifeArc/ULK/result/GBM_ULK1_gene_score_by_RW_pvalue_FC_230119.tsv"
    gene_list_file_ulk2 = "/Users/woochanghwang/PycharmProjects/LifeArc/ULK/result/GBM_ULK2_gene_score_by_RW_pvalue_FC_230119.tsv"
    gene_list_ulk1 = get_gene_list_from_file(gene_list_file_ulk1)
    gene_list_ulk2 = get_gene_list_from_file(gene_list_file_ulk2)
    # Union of both gene lists without duplicates
    gene_list = list(set(gene_list_ulk1) | set(gene_list_ulk2))

    disease_name = 'GBM'
    disease_to_genes, disease_to_category = get_diseasome_genes_from_selectedGenes(
        gene_list, disease_name, disease_category=None, nodes=nodes)
    print("disease_gene", disease_to_genes)
    print("network edges:", network.edges())

    output_file = "/Users/woochanghwang/PycharmProjects/LifeArc/ULK/result/drug/{}_drug_proximity_{}.tsv".format(disease_name,"ULK1_2")
    # NOTE(review): this call passes a network plus from_file / to_file /
    # disease_mode / out_file keywords -- confirm that the definition of
    # calculate_proximity_multiple in scope accepts them
    calculate_proximity_multiple(
        network, from_file=drug_target_file, to_file=gene_list,
        disease_mode=disease_name, out_file=output_file)
def case_study_pruned_networks_old():
    """
    Legacy case study comparing two pruned breast cancer networks.

    Loads the original network, the seeds and two sampled pruned networks
    (sampled_graph.sif.58, labelled "pruned_max" below, and
    sampled_graph.sif.7, labelled "pruned_min" below), and builds g_sub from
    the edges that are only in the first pruned network and also in the
    seed neighborhood of the original network. Every analysis section after
    that setup is switched off with "if False:", so as written the function
    only performs the setup and returns None.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    copy of this file -- confirm against the original layout.

    Notes on how the networks were chosen (shell / vi / R session):

    cat /sbi/users/emre/data/netzcore/from_gaudi_2011/output_runs_on_random/biana_no_tap_no_reliability_pruned_p50_*/omim_breast_cancer/ns/r3i2/auc.txt > arastirma/netzcore/data/summary_runs_on_random/breast_cancer_pruned_p80.txt
    vi %s/"//g
    d<-read.table("breast_cancer_pruned_p50.txt")
    e<-d$V2
    f<-(e-mean(e))/sd(e)
    which(e %in% sort(e)[98:100])
    > 22 54 58
    which(e %in% sort(e)[1:3])
    > 7 39 55 99
    """
    from toolbox import network_utilities as gu
    from toolbox import functional_enrichment
    # Input files
    network_file = DATA_DIR + "input_runs_for_draft/biana_no_tap_no_reliability/edge_scores.sif"
    user_entity_id_mapping_file = DATA_DIR + "input_runs_for_draft/biana_no_tap_no_reliability/node_mapping.tsv.genesymbol.single"
    seeds_file = DATA_DIR + "input_runs_for_draft/biana_no_tap_no_reliability/omim_breast_cancer/seed_scores.sif"
    network_file_pruned = DATA_DIR + "human_interactome_biana/pruned/omim_breast_cancer/80/sampled_graph.sif.58"
    network_file_pruned2 = DATA_DIR + "human_interactome_biana/pruned/omim_breast_cancer/80/sampled_graph.sif.7"
    module_file = DATA_DIR + "module/biana_no_tap-omim/mcl/modules.txt"
    #network_file_permuted = DATA_DIR + "human_interactome_biana/permuted/50/sampled_graph.sif.46"
    ueid_to_gene = get_ues_gene_mapping(user_entity_id_mapping_file)
    # Seeds are the first column of the seed score file
    seeds = set([line.strip().split()[0] for line in open(seeds_file)])
    g = gu.create_network_from_sif_file(network_file, use_edge_data=False)
    g_neighborhood = gu.get_neighborhood_subgraph(g, seeds)
    neighborhood_edges = set(g_neighborhood.edges()) # edge node order may be different for the same edge
    #g_sub_pruned = gu.get_neighborhood_subgraph(g_pruned, seeds)
    #print len(g_sub.nodes()), len(g_sub.edges())
    #print len(g_sub_pruned.nodes()), len(g_sub_pruned.edges())
    g_pruned = gu.create_network_from_sif_file(network_file_pruned, use_edge_data=False)
    g_pruned2 = gu.create_network_from_sif_file(network_file_pruned2, use_edge_data=False)
    # "strong" edges come from the first pruned network, "weak" edges from
    # the second one; edges found in both are dropped from either set and
    # the rest is restricted to the seed neighborhood of the original network
    #weak_edges = set(g.edges()) - set(g_pruned.edges())
    strong_edges = set(g_pruned.edges())
    #strong_edges = set(g.edges()) - set(g_pruned2.edges())
    weak_edges = set(g_pruned2.edges())
    common_edges = weak_edges & strong_edges
    weak_edges -= common_edges
    strong_edges -= common_edges
    weak_edges &= neighborhood_edges
    strong_edges &= neighborhood_edges
    #print len(weak_edges), len(strong_edges), len(common_edges)
    g_sub = gu.create_graph()
    #g_sub.add_edges_from(weak_edges | strong_edges)
    #strong_edges = weak_edges # To check differential network from the other side (edges in min but not in max)
    g_sub.add_edges_from(strong_edges)
    # The weak edge set is emptied here, so no edge is labelled "weak" below
    weak_edges = set()
    go = functional_enrichment.get_go_ontology("/home/emre/arastirma/celldiff/data/GO/gene_ontology.1_2.obo")
    # Run scoring on pruned networks
    if False:
        from toolbox import guild_utilities
        data_dir = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/"
        executable_path = "scoreNetwork/scoreN"
        for network_type in ("pruned_max", "pruned_min"):
            if network_type == "pruned_max":
                network_file = network_file_pruned
            elif network_type == "pruned_min":
                network_file = network_file_pruned2
            else:
                raise ValueError("Unknown network type!")
            scoring_folder = data_dir + network_type + os.sep
            # Create input files for scoring
            guild_utilities.prepare_scoring(network_file, seeds_file, scoring_folder, non_seed_score=0.01, seed_score=1.0, edge_score=1.0, n_sample=1, delim=" ", name=None)
            # Run GUILD and create output files, the case for Netcombo
            # (the scoring type actually requested below is "netscore")
            guild_utilities.run_scoring(scoring_folder, executable_path, scoring_type="netscore", parameters={"n_iteration":2, "n_repetition":3}, qname=None, name=None, calculate_pvalue=False)
    # Get functions of high scoring portions in 3 networks
    network_types = ("original", "pruned_max", "pruned_min")
    if False:
        import analyze_results
        association_scores_file_identifier_type = "genesymbol"
        node_mapping_file = DATA_DIR + "input_runs_for_draft/biana_no_tap_no_reliability/node_mapping.tsv"
        node_mapping_file += "."+association_scores_file_identifier_type
        node_scores_file = DATA_DIR + "input_runs_for_draft/biana_no_tap_no_reliability/omim_breast_cancer/node_scores.sif"
        for network_type in network_types:
            if network_type == "original":
                output_scores_file = DATA_DIR + "output_runs_for_draft/biana_no_tap_no_reliability/omim_breast_cancer/ns/r3i2/node_scores.sif"
            else: # network_type in ("pruned_max", "pruned_min"):
                output_scores_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/%s/output_scores.sif.netscore" % network_type
            enrichment_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/%s.txt" % network_type
            file_enrichment = open(enrichment_file, 'w')
            # NOTE(review): the specie string below looks like a garbled
            # species name -- confirm the intended value before enabling
            analyze_results.check_functional_enrichment_at_given_cutoff(output_scores_file, node_scores_file, node_mapping_file, "5%", association_scores_file_identifier_type, file_enrichment.write, 0.01, exclude_seeds=False, specie = "H**o sapiens")
            file_enrichment.close()
    # Get functions of the top scoring portions for all diseases
    if False:
        phenotype_to_functions = get_go_function_counts()
        seed_terms = phenotype_to_functions["omim_breast_cancer"][0]
        all_terms = set()
        common_terms = None
        network_to_terms = {}
        #network_types = network_types[:2]
        for network_type in network_types:
            enrichment_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/%s.txt" % network_type
            go_terms = functional_enrichment.get_functional_enrichment(enrichment_file, go, remove_parents=False, only_biological_processes=True, only_slim=False)
            # "set() | go_terms" stores a copy of the term set
            network_to_terms[network_type] = set() | go_terms
            print network_type, len(go_terms)
            all_terms |= go_terms
            if common_terms is None:
                common_terms = go_terms
            else:
                common_terms &= go_terms
        all_terms |= seed_terms
        common_terms &= seed_terms
        #seed_terms = functional_enrichment.remove_parent_terms(seed_terms, go)
        #all_terms = functional_enrichment.remove_parent_terms(all_terms, go)
        #common_terms = functional_enrichment.remove_parent_terms(common_terms, go)
        print len(all_terms), len(common_terms)
        # Pairwise overlap of the enriched terms of the networks
        for network_type in network_types:
            for network_type2 in network_types:
                if network_type == network_type2:
                    continue
                print network_type, network_type2, len(all_terms & network_to_terms[network_type] & network_to_terms[network_type2])
        # Presence / absence (1 / 0) table of each term in the seed terms
        # and in the terms of each network
        f = open(DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/functional_comparison.dat", 'w')
        f.write("seed terms\t%s\n" % "\t".join(network_types))
        for go_term in all_terms:
            values = []
            if go_term in seed_terms:
                val = 1
            else:
                val = 0
            values.append(val)
            for network_type in network_types:
                val = 0
                if go_term in network_to_terms[network_type]:
                    val= 1
                values.append(val)
            f.write("%s\t%s\n" % (go.node[go_term]['n'], "\t".join(map(str, values))))
        f.close()
    # Shortest path statistics between seed pairs in the three networks
    if False:
        n = float(len(seeds))
        for graph in (g, g_pruned, g_pruned2):
            count = 0
            path_length = 0
            n_path = 0
            n_pair = 0.0
            for i, seed1 in enumerate(seeds):
                for j, seed2 in enumerate(seeds):
                    if i<j:
                        #count += len(find_all_paths(g, seed1, seed2, []))
                        try:
                            paths = all_shortest_paths(graph, seed1, seed2)
                            for path in paths:
                                count += 1
                                path_length += len(path) - 1
                            n_pair += 1
                            n_path += len(path) - 1
                        except:
                            # NOTE(review): bare except silently skips any
                            # failure for this seed pair, not only missing paths
                            continue
            print n, n_pair, n_path/n_pair, count, count/n, count/n_pair, path_length/float(count)
        return
    # Check seed interaction counts on pruned max
    if False:
        output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/breast_cancer_pruned_p80_seeds.txt"
        f = open(output_file, 'w')
        for seed in seeds:
            if seed not in ueid_to_gene:
                print seed
                continue
            f.write("%s\n" % ueid_to_gene[seed])
        f.close()
        for network_type, graph in zip(network_types, [g, g_pruned, g_pruned2]):
            output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/%s_seed_interaction_counts.txt" % network_type
            g_neighborhood = gu.get_neighborhood_subgraph(graph, seeds)
            f = open(output_file, 'w')
            nodes = set()
            for node in g_neighborhood.nodes():
                if node in ueid_to_gene:
                    nodes.add(node)
                    if node in seeds:
                        f.write("%s\t%d\n" % (ueid_to_gene[node], g_neighborhood.degree(node)))
            f.close()
            #g_neighborhood = g_neighborhood.subgraph(nodes)
            #output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/breast_cancer_pruned_p0.dot"
            #gu.create_dot_network_file(g_neighborhood, output_file, seeds, ueid_to_gene, draw_type="all")
            #os.system("twopi -Tgif -O %s" % output_file)
    # Check modules in pruned max
    if False:
        # Keep only the nodes that have a gene symbol
        nodes = set()
        for node in g_sub.nodes():
            if node in ueid_to_gene:
                nodes.add(node)
        g_sub = g_sub.subgraph(nodes)
        output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/breast_cancer_pruned_p80_strong.dot"
        #gu.create_dot_network_file(g_sub, output_file, seeds, ueid_to_gene, weaks=weaks, draw_type="weak")
        #gu.create_dot_network_file(g_sub_pruned, output_file, seeds, ueid_to_gene, weaks=weaks, draw_type="weak")
        gu.create_dot_network_file(g_sub, output_file, seeds, ueid_to_gene, weak_edges=weak_edges, draw_type="all")
        os.system("twopi -Tgif -O %s" % output_file)
        from toolbox import mcl_utilities as mcl
        modules = mcl.get_modules_from_file(module_file)
        output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/breast_cancer_pruned_p80_modularized_strong.nnf"
        f = open(output_file, 'w')
        network_name = "breast_cancer_pruned_p80_"
        module_sets = []
        included_nodes = set()
        included_edges = set()
        # Output modules
        for i, module in enumerate(modules):
            # Part of the module that is in the differential network
            m = set(module) & nodes
            if len(m) > 0:
                module_sets.append(m)
                f.write("%s M%d_\n" % (network_name, i))
                for u, v in g_sub.edges(m):
                    if u == v:
                        continue
                    if u in m and v in m:
                        w = "pp"
                        if (u,v) in weak_edges or (v,u) in weak_edges:
                            w = "weak"
                        elif (u,v) in strong_edges or (v,u) in strong_edges:
                            w = "strong"
                        included_nodes.add(u)
                        included_nodes.add(v)
                        # Both orientations are stored since the edge sets
                        # hold plain (u, v) tuples
                        included_edges.add((u,v))
                        included_edges.add((v,u))
                        u = ueid_to_gene[u]
                        v = ueid_to_gene[v]
                        f.write("M%d_ %s %s %s\n" % (i, u, w, v))
                # Module members without any edge inside the module
                for u in m:
                    if u not in included_nodes:
                        included_nodes.add(u)
                        u = ueid_to_gene[u]
                        f.write("M%d_ %s\n" % (i, u))
        # Connect modules
        # NOTE(review): i and j index module_sets here, whereas the M%d_
        # names written above use the index in modules -- the two differ as
        # soon as a module has no node in the network; confirm before enabling
        for i, module1 in enumerate(module_sets):
            for j, module2 in enumerate(module_sets):
                if i<j:
                    connected_weak = False
                    connected_strong = False
                    for u in module1:
                        for v in module2:
                            # Only the (u, v) orientation is tested here
                            if (u,v) in strong_edges:
                                connected_strong = True
                                break
                            if (u,v) in weak_edges:
                                connected_weak = True
                    if connected_strong:
                        f.write("%s M%d_ %s M%d_\n" % (network_name, i, "strong", j))
                    elif connected_weak:
                        f.write("%s M%d_ %s M%d_\n" % (network_name, i, "weak", j))
        # Output the rest
        for u,v in g_sub.edges():
            if u == v:
                continue
            if (u,v) in included_edges:
                continue
            included_nodes.add(u)
            included_nodes.add(v)
            w = "pp"
            if (u,v) in weak_edges or (v,u) in weak_edges:
                w = "weak"
            elif (u,v) in strong_edges or (v,u) in strong_edges:
                w = "strong"
            u = ueid_to_gene[u]
            v = ueid_to_gene[v]
            f.write("%s %s %s %s\n" % (network_name, u,w,v))
        for node in nodes - included_nodes:
            f.write("%s %s\n" % (network_name, ueid_to_gene[node]))
        f.close()
        # Same network once more, as a plain sif file with gene symbols
        output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/breast_cancer_pruned_p80_strong.sif"
        f = open(output_file, 'w')
        included_nodes = set()
        for u,v in g_sub.edges():
            if u == v:
                continue
            included_nodes.add(u)
            included_nodes.add(v)
            w = "pp"
            if (u,v) in weak_edges or (v,u) in weak_edges:
                w = "weak"
            elif (u,v) in strong_edges or (v,u) in strong_edges:
                w = "strong"
            u = ueid_to_gene[u]
            v = ueid_to_gene[v]
            f.write("%s %s %s\n" % (u,w,v))
        for node in nodes - included_nodes:
            f.write("%s\n" % ueid_to_gene[node])
        f.close()
        # Gene list and functional enrichment of each module
        for i, module in enumerate(module_sets):
            output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/M%d_.txt" % i
            #g_sub_sub = g_sub.subgraph(module)
            # For checking the functions enriched in the largest connected component of the module
            #module = gu.get_connected_components(g_sub_sub, return_as_graph_list=False)[0]
            f = open(output_file, 'w')
            for node in module:
                f.write("%s\n" % ueid_to_gene[node])
            f.close()
            functional_enrichment.check_functional_enrichment_of_human_gene_symbols(output_file, output_file+".funcassoc")
    # Print the enriched GO terms of the first four modules as R vectors
    if False:
        for i in range(4):
            enrichment_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/" + "%s/M%d_.txt.funcassoc" % ("strong/func-all", i) # strong/func-all strong weak
            go_terms = functional_enrichment.get_functional_enrichment(enrichment_file, go, remove_parents=False, only_biological_processes=True)
            print "m%d<-c(\"%s\")" % (i, "\", \"".join(go_terms))
    return
def case_study_pruned_networks():
    """
    Case study on the differential network of the pruned breast cancer
    networks.

    Builds the differential network with get_differential_network(), then
    for every connected component with at least 10 nodes writes its gene
    list, runs the functional enrichment, draws it (dot / sif / gif via
    "fdp") and prints the overlap of its enriched GO terms with the seed GO
    terms together with a hypergeometric p-value. For the component of
    index 0 the same is additionally done for its seed and non-seed genes
    separately. Finally the whole differential network is drawn and a
    presence / absence table of the seed GO terms is written to
    "functional_comparison.dat".

    All inputs and outputs are hard-coded paths under DATA_DIR. Returns None.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    copy of this file -- confirm against the original layout.
    """
    from toolbox import network_utilities as gu
    from toolbox import functional_enrichment
    from toolbox import mcl_utilities as mcl
    from scipy.stats import hypergeom
    network_file = DATA_DIR + "input_runs_for_draft/biana_no_tap_no_reliability/edge_scores.sif"
    user_entity_id_mapping_file = DATA_DIR + "input_runs_for_draft/biana_no_tap_no_reliability/node_mapping.tsv.genesymbol.single"
    seeds_file = DATA_DIR + "input_runs_for_draft/biana_no_tap_no_reliability/omim_breast_cancer/seed_scores.sif"
    auc_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/breast_cancer_pruned_p80.txt"
    # module_file and the mcl import are not used in the code below
    module_file = DATA_DIR + "module/biana_no_tap-omim/mcl/modules.txt"
    # Get node mapping
    ueid_to_gene = get_ues_gene_mapping(user_entity_id_mapping_file)
    # Get seeds
    seeds = set([line.strip().split()[0] for line in open(seeds_file)])
    # Get neighborhood in the original network
    g_org = gu.create_network_from_sif_file(network_file, use_edge_data=False)
    g_neighborhood = gu.get_neighborhood_subgraph(g_org, seeds)
    #neighborhood_edges = set(g_neighborhood.edges()) # edge node order may be different for the same edge
    critical_auc = 0.634
    g_sub = get_differential_network(g_org, ueid_to_gene, auc_file, critical_auc)
    # Get seed GOs to check their coverage in top connected component
    phenotype_to_functions = get_go_function_counts()
    seed_terms = phenotype_to_functions["omim_breast_cancer"][0]
    go = functional_enrichment.get_go_ontology("/home/emre/arastirma/celldiff/data/GO/gene_ontology.1_2.obo")
    # Get network genes
    network_genes = set() # set(ueid_to_gene.values())
    seed_genes = set()
    for node in g_org.nodes():
        if node in ueid_to_gene:
            network_genes.add(ueid_to_gene[node])
            if node in seeds:
                seed_genes.add(ueid_to_gene[node])
    # Get current (up-to-date) seed GO terms
    output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/seed_genes.txt"
    #functional_enrichment.check_functional_enrichment(list(seed_genes), list(network_genes), "genesymbol", open(output_file+".funcassoc", 'w').write)
    #seed_go_terms = functional_enrichment.get_functional_enrichment(output_file + ".funcassoc", go, remove_parents=False, only_biological_processes=True)
    #print "current seed go:", len(seed_go_terms)
    # Get all functions enriched in the network
    output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/network_genes.txt"
    f = open(output_file, 'w')
    [ f.write("%s\n" % gene) for gene in network_genes ]
    f.close()
    # The call creating the ".funcassoc" file is commented out, so that file
    # has to exist already when it is read on the following line
    #functional_enrichment.check_functional_enrichment_of_human_gene_symbols(output_file, output_file+".funcassoc")
    network_go_terms = functional_enrichment.get_functional_enrichment(output_file + ".funcassoc", go, remove_parents=False, only_biological_processes=True)
    # NOTE(review): 23928 is a hard-coded number printed for comparison; its
    # meaning cannot be told from this code
    print 23928, len(network_go_terms)
    # Check the functions enriched in the largest connected component of each module
    for i, module in enumerate(gu.get_connected_components(g_sub, return_as_graph_list=False)):
        # Components with less than 10 nodes are ignored
        if len(module) < 10:
            continue
        output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/M%d" % i
        # Seed / non-seed split, done for the component of index 0 only
        # NOTE(review): go_terms_seeds and go_terms_nonseeds are assigned only
        # in this branch but are needed after the loop; if the component of
        # index 0 has less than 10 nodes they stay undefined -- confirm
        if i == 0:
            f = open(output_file + ".txt.seeds", 'w')
            f2 = open(output_file + ".txt.nonseeds", 'w')
            module_seed_genes = set()
            module_nonseed_genes = set()
            for node in module:
                if True: #node in ueid_to_gene:
                    if node in seeds:
                        f.write("%s\n" % ueid_to_gene[node])
                        module_seed_genes.add(ueid_to_gene[node])
                    else:
                        f2.write("%s\n" % ueid_to_gene[node])
                        module_nonseed_genes.add(ueid_to_gene[node])
            f.close()
            f2.close()
            # NOTE(review): the ".funcassoc" files written below are opened
            # inline and never closed explicitly, yet they are read back a
            # few lines later -- this relies on the interpreter closing them
            ##functional_enrichment.check_functional_enrichment_of_human_gene_symbols(output_file + "seeds.txt", output_file + "seeds.txt.funcassoc")
            functional_enrichment.check_functional_enrichment(list(module_seed_genes), list(network_genes), "genesymbol", open(output_file+".txt.seeds.funcassoc", 'w').write)
            ##functional_enrichment.check_functional_enrichment_of_human_gene_symbols(output_file + "nonseeds.txt", output_file + "nonseeds.txt.funcassoc")
            functional_enrichment.check_functional_enrichment(list(module_nonseed_genes), list(network_genes), "genesymbol", open(output_file+".txt.nonseeds.funcassoc", 'w').write)
            go_terms_seeds = functional_enrichment.get_functional_enrichment(output_file + ".txt.seeds.funcassoc", go, remove_parents=False, only_biological_processes=True)
            go_terms_nonseeds = functional_enrichment.get_functional_enrichment(output_file + ".txt.nonseeds.funcassoc", go, remove_parents=False, only_biological_processes=True)
            # Overlap of the seed GO terms with the terms of the non-seed genes
            print len(seed_terms), len(go_terms_nonseeds), len(seed_terms & go_terms_nonseeds), len(seed_terms & go_terms_nonseeds) / float(len(seed_terms))
            # Sum of the hypergeometric probabilities from the observed
            # overlap up to the number of non-seed terms
            print "p_value:", sum(hypergeom.pmf(range(len(seed_terms & go_terms_nonseeds),len(go_terms_nonseeds)+1), len(network_go_terms), len(seed_terms), len(go_terms_nonseeds)))
        # Draw diff network component
        weak_edges = set()
        g_sub_sub = g_sub.subgraph(module)
        gu.create_dot_network_file(g_sub_sub, output_file + ".dot", seeds, ueid_to_gene, weak_edges=weak_edges, draw_type="all")
        gu.output_network_in_sif(g_sub_sub, output_file + ".sif", ueid_to_gene, delim = " ", include_unconnected=True)
        os.system("fdp -Tgif -O %s" % (output_file + ".dot"))
        # Get functions
        f = open(output_file + ".txt", 'w')
        module_genes = set()
        for node in module:
            f.write("%s\n" % ueid_to_gene[node])
            module_genes.add(ueid_to_gene[node])
        f.close()
        ##functional_enrichment.check_functional_enrichment_of_human_gene_symbols(output_file + ".txt", output_file + ".txt.funcassoc")
        functional_enrichment.check_functional_enrichment(list(module_genes), list(network_genes), "genesymbol", open(output_file+".txt.funcassoc", 'w').write)
        go_terms = functional_enrichment.get_functional_enrichment(output_file + ".txt.funcassoc", go, remove_parents=False, only_biological_processes=True)
        # Same overlap statistics, now for all genes of the component
        print len(seed_terms), len(go_terms), len(seed_terms & go_terms), len(seed_terms & go_terms) / float(len(seed_terms))
        print "p_value:", sum(hypergeom.pmf(range(len(seed_terms & go_terms),len(go_terms)+1), len(network_go_terms), len(seed_terms), len(go_terms)))
    weak_edges = set()
    # strong_edges is not used in the code below
    strong_edges = g_sub.edges()
    # Draw diff neighborhood network
    output_file = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/breast_cancer_pruned_p80_diff"
    gu.create_dot_network_file(g_sub, output_file + ".dot", seeds, ueid_to_gene, weak_edges=weak_edges, draw_type="all")
    gu.output_network_in_sif(g_sub, output_file + ".sif", ueid_to_gene, delim = " ", include_unconnected=True)
    os.system("fdp -Tgif -O %s" % (output_file + ".dot"))
    # Presence / absence (1 / 0) of every seed GO term among the terms of
    # the seed and of the non-seed genes of the component of index 0
    all_terms = seed_terms #(go_terms_seeds | seed_terms | go_terms_nonseeds)
    file_name = DATA_DIR + "summary_runs_on_random/breast_cancer_pruned/functional_comparison"
    functional_enrichment.output_go_terms_and_levels(all_terms, go, file_name+"_goids.dat")
    f = open(file_name+".dat", 'w')
    #f.write("GO id\tGO term\tAll seeds\tModule seeds\tModule non-seeds\n") #"seed GO terms\tmodule GO terms (w/out seeds)\n"
    f.write("GO id\tGO term\tModule seeds\tModule non-seeds\n") #"seed GO terms\tmodule GO terms (w/out seeds)\n"
    term_list = [go_terms_seeds, go_terms_nonseeds] #[seed_terms, go_terms_seeds, go_terms_nonseeds]
    for go_term in all_terms:
        values = []
        for terms in term_list:
            if go_term in terms:
                val = 1
            else:
                val = 0
            values.append(val)
        f.write("%s\t%s\t%s\n" % (go_term, go.node[go_term]['n'], "\t".join(map(str, values))))
    f.close()
    return
def get_differential_network_using_all_networks(g_org, ueid_to_gene, auc_file, critical_auc):
    """
    Builds the differential network using all sampled pruned networks.

    g_org: original network whose edges are tested
    ueid_to_gene: mapping of node ids to gene symbols; only nodes present in
        this mapping are kept in the result
    auc_file: text file with one line per sampled network, the AUC being the
        second whitespace separated column; the network of line k (0-based)
        is read from ".../sampled_graph.sif.<k+1>"
    critical_auc: networks with an AUC of at least this value form the
        "max" group, all others the "min" group

    For every edge of g_org (self loops excluded) its presence (1 / 0) in
    the networks of both groups is compared with a t-test. Edges present
    significantly more often in the "max" group (p-value <= 0.05 and
    positive statistic) make up the differential network.

    Returns the subgraph of the differential network induced by the nodes
    that have a gene mapping.
    """
    from toolbox import network_utilities as gu
    from statsmodels.stats.weightstats import ttest_ind

    def load_sampled_networks(indices):
        # real index in file name is one higher than the line index
        return [
            gu.create_network_from_sif_file(
                DATA_DIR + "human_interactome_biana/pruned/omim_breast_cancer/80/sampled_graph.sif.%d" % (i + 1),
                use_edge_data=False)
            for i in indices
        ]

    # Blank lines are deliberately not skipped: the line position is what
    # identifies the sampled network file
    with open(auc_file) as f:
        aucs = [float(line.split()[1]) for line in f]

    # Get indices of min/max networks
    indices_max = []
    indices_min = []
    for i, auc in enumerate(aucs):
        if auc >= critical_auc:
            indices_max.append(i)
        else:
            indices_min.append(i)

    g_maxs = load_sampled_networks(indices_max)
    g_mins = load_sampled_networks(indices_min)
    print("%d %d" % (len(g_maxs), len(g_mins)))

    # gu.networkx is the networkx module as imported by network_utilities
    g_diff = gu.networkx.Graph()
    for u, v in g_org.edges():
        if u == v:
            continue
        values_max = [1 if g.has_edge(u, v) else 0 for g in g_maxs]
        values_min = [1 if g.has_edge(u, v) else 0 for g in g_mins]
        # NOTE(review): current statsmodels documents usevar as "pooled" or
        # "unequal" -- confirm that the installed version accepts "separate"
        stat, pval = ttest_ind(values_max, values_min, usevar="separate")[:2]
        # An undefined (nan) p-value fails the comparison and drops the edge
        if pval <= 0.05 and stat > 0:
            g_diff.add_edge(u, v)

    nodes = set(node for node in g_diff.nodes() if node in ueid_to_gene)
    return g_diff.subgraph(nodes)
def prepare_scoring(network_file, seed_file, scoring_folder="./", non_seed_score=0.01, seed_score=1.0, edge_score=1.0, n_sample=100, delim=" ", name=None):
    """
    Generates the input files used by the GUILD executable.

    network_file: network in sif-like format where the edge type is the edge score (e.g., "A 0.5 B" or "A pp B")
    seed_file: seeds in text format where nodes and their scores are given (e.g., "A 0.1" or "A")
    scoring_folder: path to the directory where the input/output files are created
    non_seed_score: initial score of non-seeds (0.01, by default)
    seed_score: initial score of seeds (1.0, by default)
    edge_score: weight of edges, used when the values in network_file are not convertible to float (1.0, by default)
    n_sample: number of randomly generated graphs for netzcore (100, by default)
    delim: delimiter that separates columns in input/output files (" ", by default)
    name: optional name defining the phenotype, the node / seed / netshort files are created
        under this sub directory (in case of multiple phenotype analysis)

    The edge score file is always written directly under scoring_folder and
    the sampled graphs are written with the prefix scoring_folder + "../sampled_graph.",
    independent of name.
    """
    if not os.path.exists(scoring_folder):
        os.mkdir(scoring_folder)
    if name is None:
        name = ""
    else:
        phenotype_folder = scoring_folder + name
        if not os.path.exists(phenotype_folder):
            os.mkdir(phenotype_folder)
        name = name + os.sep
    phenotype_prefix = scoring_folder + name

    # Edge scores: the network file itself provides the edges
    print("Creating edge score file")
    edge_score_file = scoring_folder + "edge_scores.sif"
    if os.path.exists(edge_score_file):
        print("\tEdge score file exists, overwriting!")
    nodes, edges, _unused, edge_to_data = network_utilities.get_nodes_and_edges_from_sif_file(
        network_file, store_edge_type=True, delim=delim, data_to_float=False)
    edge_to_weight = create_edge_score_file(edge_score_file, edges, edge_to_data, edge_score, delim)

    # Node scores: seeds missing from the network are ignored, non-seeds get non_seed_score
    print("Creating node score file")
    node_score_file = phenotype_prefix + "node_scores.sif"
    seed_score_file = phenotype_prefix + "seed_scores.sif"
    seeds, _unused, seed_to_data, _unused = network_utilities.get_nodes_and_edges_from_sif_file(
        seed_file, store_edge_type=False, delim=delim, data_to_float=True)
    if seed_to_data is None:
        # No scores in the seed file, fall back to the default seed score
        seed_to_data = dict((seed, seed_score) for seed in seeds)
    node_to_data = create_node_score_file(
        node_score_file, seed_score_file, nodes, seeds, seed_to_data,
        non_seed_score, seed_score, delim)

    # Background node scores: k randomly selected non-seeds, k being the number of seeds
    print("Creating background node score file")
    bg_node_file = phenotype_prefix + "node_scores_background.sif"
    bg_seed_file = phenotype_prefix + "seed_scores_background.sif"
    create_background_score_file(
        bg_node_file, bg_seed_file, nodes, seeds, seed_to_data, non_seed_score, delim)

    # Edge file modified with node scores, input of netshort
    print("Creating node score converted edge file (for netshort)")
    nd_edge_file = phenotype_prefix + "edge_scores_netshort.sif"
    create_node_score_converted_edge_score_file(
        nd_edge_file, edges, edge_to_weight, node_to_data, delim)

    # Randomized networks, input of netzcore
    print("Creating random networks (for netzcore)")
    sampling_prefix = scoring_folder + "../" + "sampled_graph."
    if os.path.exists("%s%s" % (sampling_prefix, n_sample)):
        # The last sample is already there, assume all of them are
        print("\tSampled networks exists, skipping this step!")
        return
    g = network_utilities.create_network_from_sif_file(
        network_file_in_sif=edge_score_file, use_edge_data=True, delim=delim)
    for sample_no in xrange(1, n_sample + 1):
        g_sampled = network_utilities.randomize_graph(
            graph=g, randomization_type="preserve_topology_and_node_degree")
        network_utilities.output_network_in_sif(g_sampled, "%s%s" % (sampling_prefix, sample_no))
    return
def main_pree():
    """
    Command line entry point for the proximity calculation.

    Two modes are supported:
    - multiple runs: when -p/--parameter_file_prefix is given, the prefix and
      the -i / -j indices are handed over to calculate_proximity_multiple
    - single run: otherwise -e, -s, -t and -o are required; -s and -t are
      comma separated node lists, the proximity between them is calculated
      on the network read from -e and "z d mean sd" is written to -o
      (nothing is written when calculate_proximity returns None)

    Exits with the argparse usage error (status 2) if an argument required
    by the single run mode is missing.
    """
    parser = argparse.ArgumentParser()
    # The four arguments below are required only in the single run mode,
    # therefore they are validated after parsing instead of required=True
    parser.add_argument('-e', '--network_file')
    parser.add_argument('-s', '--nodes_from')
    parser.add_argument('-t', '--nodes_to')
    parser.add_argument('-o', '--out_file')
    parser.add_argument('-n', '--n_random', type=int, default=1000)
    parser.add_argument('-m', '--min_bin_size', type=int, default=100)
    parser.add_argument('-x', '--n_seed', type=int, default=452456)
    parser.add_argument('-f', '--parameter_file', type=str, default=None)
    parser.add_argument('-p', '--parameter_file_prefix', type=str, default=None)
    parser.add_argument('-i', '--parameter_file_start_index', type=int, default=None)
    parser.add_argument('-j', '--parameter_file_end_index', type=int, default=None)
    args = parser.parse_args()

    # Run more than once for given input files
    if args.parameter_file_prefix is not None:
        parameter_file_prefix = args.parameter_file_prefix
        i_start = args.parameter_file_start_index
        i_end = args.parameter_file_end_index
        calculate_proximity_multiple(parameter_file_prefix, i_start, i_end)
        return

    # Run once with provided arguments; fail early (before the costly
    # calculation) with a clear message if any of them is missing
    single_run_arguments = (
        ("-e/--network_file", args.network_file),
        ("-s/--nodes_from", args.nodes_from),
        ("-t/--nodes_to", args.nodes_to),
        ("-o/--out_file", args.out_file),
    )
    missing = [flag for flag, value in single_run_arguments if value is None]
    if missing:
        parser.error("missing argument(s) for a single run: %s" % ", ".join(missing))

    nodes_from = args.nodes_from.split(",")
    nodes_to = args.nodes_to.split(",")
    network_file = args.network_file
    n_random = args.n_random
    min_bin_size = args.min_bin_size
    n_seed = args.n_seed
    out_file = args.out_file
    network = network_util.create_network_from_sif_file(
        network_file, use_edge_data=False, delim=None, include_unconnected=True)
    print(network_file, nodes_from, nodes_to, n_random, min_bin_size, n_seed, out_file)
    values = wrappers.calculate_proximity(network, nodes_from=nodes_from, nodes_to=nodes_to,
                                          n_random=n_random, min_bin_size=min_bin_size, seed=n_seed)
    # values is None when the calculation is not possible (original note: "not in network")
    if values is not None:
        d, z, (m, s) = values
        # Output column order is z, d, mean, sd
        with open(out_file, 'w') as f:
            f.write("%f %f %f %f\n" % (z, d, m, s))
    return
def main():
    """
    Command line entry point for batch proximity calculation.

    Reads the network given with -e and passes it to
    wrappers.calculate_proximity_multiple together with the -s (from_file),
    -t (to_file), -d (disease_mode) and -o (out_file) arguments.

    The -n, -m, -x, -f, -p, -i and -j options are still accepted on the
    command line but are not used by this function.
    """
    # (short flag, long flag, extra add_argument keywords), in registration order
    option_table = (
        ('-e', '--network_file', {}),
        ('-s', '--nodes_from', {}),
        ('-t', '--nodes_to', {}),
        ('-d', '--disease_mode', {}),
        ('-o', '--out_file', {}),
        ('-n', '--n_random', {'type': int, 'default': 1000}),
        ('-m', '--min_bin_size', {'type': int, 'default': 100}),
        ('-x', '--n_seed', {'type': int, 'default': 452456}),
        ('-f', '--parameter_file', {'type': str, 'default': None}),
        ('-p', '--parameter_file_prefix', {'type': str, 'default': None}),
        ('-i', '--parameter_file_start_index', {'type': int, 'default': None}),
        ('-j', '--parameter_file_end_index', {'type': int, 'default': None}),
    )
    parser = argparse.ArgumentParser()
    for short_flag, long_flag, extra in option_table:
        parser.add_argument(short_flag, long_flag, **extra)
    cli = parser.parse_args()

    # Example of the equivalent hard-coded call:
    #   network_file = "../src_drug/Data/human_protein_interactome.sif"
    #   nodes_from = "../src_drug/scratch/drug_target_interaction_temp_1.txt"
    #   nodes_to = "../../ULK/result/GBM_ULK1_2_gene_score_by_RW_pvalue_FC_230119.tsv"
    #   disease_name = "GBM"
    #   output_file = "../src_drug/Result/{}_drug_proximity_t_1_1.tsv".format(disease_name)
    #   network = network_util.create_network_from_sif_file(network_file, use_edge_data=False,
    #                                                       delim=None, include_unconnected=True)
    #   wrappers.calculate_proximity_multiple(network, from_file=nodes_from, to_file=nodes_to,
    #                                         disease_mode=disease_name, out_file=output_file)
    #
    # The former single run / parameter file logic is not repeated here,
    # it is available in main_pree().
    interactome = network_util.create_network_from_sif_file(
        cli.network_file, use_edge_data=False, delim=None, include_unconnected=True)
    wrappers.calculate_proximity_multiple(
        interactome,
        from_file=cli.nodes_from,
        to_file=cli.nodes_to,
        disease_mode=cli.disease_mode,
        out_file=cli.out_file)
    return