def create_sparse_net_file(
        out_pref, net_files=None, string_net_files=None,
        string_nets=STRING_NETWORKS, string_cutoff=None, forcenet=False):
    # avoid mutable default arguments
    if net_files is None:
        net_files = []
    if string_net_files is None:
        string_net_files = []
    # if there aren't any string net files, then set the string nets to empty
    if len(string_net_files) == 0:
        string_nets = []
    # if there are string_net_files, and string_nets is None, set it back to its default
    elif string_nets is None:
        string_nets = STRING_NETWORKS
    string_nets = list(string_nets)
    num_networks = len(net_files) + len(string_nets)
    # if there is only 1 string network, then write the name instead of the number
    if len(string_nets) == 1:
        num_networks = string_nets[0]
    sparse_nets_file = "%s%s-sparse-nets.mat" % (out_pref, num_networks)
    # the node IDs should be the same for each of the networks,
    # so there's no need to include the number in the ids file
    node_ids_file = "%snode-ids.txt" % (out_pref)
    net_names_file = "%s%s-net-names.txt" % (out_pref, num_networks)
    if forcenet is False \
            and os.path.isfile(sparse_nets_file) and os.path.isfile(node_ids_file) \
            and os.path.isfile(net_names_file):
        # read the cached files
        print("\treading sparse nets from %s" % (sparse_nets_file))
        sparse_networks = list(loadmat(sparse_nets_file)['Networks'][0])
        print("\treading node ids file from %s" % (node_ids_file))
        nodes = utils.readItemList(node_ids_file, 1)
        print("\treading network names from %s" % (net_names_file))
        network_names = utils.readItemList(net_names_file, 1)
    else:
        print("\tcreating sparse nets and writing to %s" % (sparse_nets_file))
        sparse_networks, network_names, nodes = setup_sparse_networks(
            net_files=net_files, string_net_files=string_net_files,
            string_nets=string_nets, string_cutoff=string_cutoff)
        # now write them to a file
        write_sparse_net_file(
            sparse_networks, sparse_nets_file, network_names,
            net_names_file, nodes, node_ids_file)
    return sparse_networks, network_names, nodes
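# A minimal usage sketch (the file paths here are hypothetical;
# STRING_NETWORKS and the setup_sparse_networks/write_sparse_net_file
# helpers are assumed to be defined elsewhere in this module):
#
#   sparse_nets, net_names, nodes = create_sparse_net_file(
#       "outputs/2019-10/", net_files=["inputs/ppi-net.txt"],
#       string_net_files=["inputs/string-nets.txt"],
#       string_cutoff=400)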
    'default outputs/version/weighted/plots/edge-weights/edge-weight-dist-version.png.')
#parser.add_option('-o', '--out-file', type='string', default="viz/assays/zscore-rectfs-assays.png",
#                  help='path/to/output_file.png. Default:')
parser.add_option('', '--pdf', action='store_true',
                  help='Also store a PDF of the figures')
(opts, args) = parser.parse_args()

for i, version in enumerate(opts.version):
    print("")
    print("-" * 30)
    t_settings.set_version(version)
    chemicals = sorted(utils.readItemList(
        "%s/chemicals.txt" % (t_settings.INPUTSPREFIX)))
    interactome = t_settings.INTERACTOME
    print("Getting the weight, cost and direction of each edge "
          "from the interactome %s" % (interactome))
    lines = utils.readColumns(interactome, 1, 2, 3, 4)
    edge_weights = {(u, v): float(w) for u, v, w, d in lines}
    edge_dir = {(u, v): d.lower() in ['true', 't', 'dir', 'directed']
                for u, v, w, d in lines}
    # get the evidence from get_interaction_evidence.py
    #evidence_file = getev.getEvidenceFile(evidence_version, t_settings.DATADIR)
    #edge_dir = getev.getEdgeDir(edge_weights.keys(), evidence_file,
    #                            split_family_nodes=False, add_ev_to_family_edges=False)
    # TODO try just the directed edges; see the sketch below
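    # One possible sketch for the TODO above (an assumption, not part of the
    # original analysis): restrict the weight distribution to directed edges.
    #
    #   dir_edge_weights = {e: w for e, w in edge_weights.items() if edge_dir[e]}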
    # parse the command line arguments
    (opts, args) = parser.parse_args()
    return opts


opts = parseArguments()
#if __name__ != "__main__":
#    os.chdir("/data/jeff-law/projects/2016-02-toxcast/")

for version in opts.version:
    print("")
    print("-" * 30)
    t_settings.set_version(version)
    chemicals = sorted(utils.readItemList("%s/chemicals.txt" % (t_settings.INPUTSPREFIX)))
    #sig_chemicals = utils.readItemList("inputs/%s/sig-chemicals.txt" % (version), 1)
    #unsig_chemicals = set(chemicals).difference(set(sig_chemicals))
    #summary_file = "outputs/%s/weighted/stats/network-summaries.csv" % (version)
    df = summary_stats.get_summary_stats(version=version, forced=opts.forced)
    #df = t_utils.get_summary_stats(version=version, forced=True)
    # loop through the chemicals, significant chemicals, and insignificant chemicals
    for chemicals, postfix in [(chemicals, '')]:
        if opts.out_file is None:
            out_file_name = "summary-network-stats-%s.png" % (version)
            out_dir = "%s/plots/summary-stats/" % (t_settings.RESULTSPREFIX)
            t_utils.checkDir(out_dir)
            out_file = "%s/%s" % (out_dir, out_file_name)
        # if specified, copy the file to the compare versions dir
def load_net_ann_datasets(
        out_dir, taxon, dataset, input_settings, alg_settings,
        uniprot_taxon_file, **kwargs):
    sparse_net_file = "%s/%s-net.npz" % (out_dir, taxon)
    node2idx_file = sparse_net_file + "-node-ids.txt"
    swsn_weights_file = sparse_net_file + "-swsn-weights.txt"
    sparse_ann_file = "%s/ann.npz" % (out_dir)
    if not kwargs.get('forcenet') and \
            os.path.isfile(sparse_net_file) and os.path.isfile(node2idx_file) and \
            os.path.isfile(sparse_ann_file):
        print("Reading network from %s" % (sparse_net_file))
        W = sp.load_npz(sparse_net_file)
        print("\t%d nodes and %d edges" % (W.shape[0], len(W.data) // 2))
        print("Reading node names from %s" % (node2idx_file))
        prots = utils.readItemList(node2idx_file, 1)
        new_net_obj = setup.Sparse_Networks(W, prots)
        if os.path.isfile(swsn_weights_file):
            print("Reading SWSN weights file %s" % (swsn_weights_file))
            weights = [float(w) for w in utils.readItemList(swsn_weights_file, 1)]
            # also load the original networks to get the edge weights for the STRING networks
            net_obj = run_eval_algs.setup_net(input_settings['input_dir'], dataset, **kwargs)
            net_obj.swsn_weights = weights
        else:
            net_obj = new_net_obj
        print("\nReading annotation matrix from %s" % (sparse_ann_file))
        loaded_data = np.load(sparse_ann_file, allow_pickle=True)
        dag_matrix = setup.make_csr_from_components(loaded_data['arr_0'])
        ann_matrix = setup.make_csr_from_components(loaded_data['arr_1'])
        goids, prots = loaded_data['arr_2'], loaded_data['arr_3']
        ann_obj = setup.Sparse_Annotations(dag_matrix, ann_matrix, goids, prots)
        species_to_uniprot_idx = eval_loso.get_uniprot_species(uniprot_taxon_file, ann_obj)
        # TODO load an eval ann obj as well
        eval_ann_obj = None
    else:
        # load the network
        # TODO if a subset of the network was run, need to get that subset
        net_obj, ann_obj, eval_ann_obj = run_eval_algs.setup_dataset(
            dataset, input_settings['input_dir'], alg_settings, **kwargs)
        species_to_uniprot_idx = eval_loso.get_uniprot_species(uniprot_taxon_file, ann_obj)
        new_net_obj = net_obj
        # run SWSN if needed
        #if net_obj.multi_net:
        # TODO if LOSO was run, need to leave out the taxon for the edge weights to be accurate
        if taxon is not None:
            if kwargs.get('limit_to_taxons_file'):
                # limit the network to the specified species:
                # read in the specified taxons from the file
                _, net_taxons = eval_loso.get_selected_species(
                    species_to_uniprot_idx, kwargs['limit_to_taxons_file'])
                net_taxon_prots = net_exp.get_taxon_prots(
                    net_obj.nodes, net_taxons, species_to_uniprot_idx)
                net_obj, ann_obj = net_exp.limit_to_taxons(
                    net_taxon_prots, net_obj=net_obj, ann_obj=ann_obj, **kwargs)
            # leave out the annotations for this taxon ID
            train_ann_mat, test_ann_mat, sp_goterms = eval_loso.leave_out_taxon(
                taxon, ann_obj, species_to_uniprot_idx,
                eval_ann_obj=eval_ann_obj, **kwargs)
            taxon_prots = net_exp.get_taxon_prots(
                net_obj.nodes, [taxon], species_to_uniprot_idx)
            new_net_obj = net_exp.limit_net_to_target_taxon(
                train_ann_mat, taxon_prots, net_obj, ann_obj, **kwargs)
            W = new_net_obj.W
        #else:
        #    W, _ = net_obj.weight_SWSN(ann_obj.ann_matrix)
        #    #new_net_obj =
        else:
            W = net_obj.W
        print("\twriting sparse matrix to %s" % (sparse_net_file))
        sp.save_npz(sparse_net_file, W)
        print("\twriting node2idx labels to %s" % (node2idx_file))
        with open(node2idx_file, 'w') as out:
            out.write(''.join("%s\t%d\n" % (prot, i)
                              for i, prot in enumerate(net_obj.nodes)))
        if net_obj.multi_net:
            print("\twriting SWSN weights file to %s" % (swsn_weights_file))
            with open(swsn_weights_file, 'w') as out:
                out.write('\n'.join(str(w) for w in new_net_obj.swsn_weights) + '\n')
            net_obj.swsn_weights = new_net_obj.swsn_weights
        # now store the annotations to a file
        print("\twriting sparse annotations to %s" % (sparse_ann_file))
        # store all the data in the same file
        dag_matrix_data = setup.get_csr_components(ann_obj.dag_matrix)
        ann_matrix_data = setup.get_csr_components(ann_obj.ann_matrix)
        #np.savez_compressed(
        #    sparse_ann_file, dag_matrix_data=dag_matrix_data,
        #    ann_matrix_data=ann_matrix_data, goids=goids, prots=prots)
        np.savez_compressed(
            sparse_ann_file, dag_matrix_data, ann_matrix_data,
            ann_obj.goids, ann_obj.prots)
    return net_obj, new_net_obj, ann_obj, eval_ann_obj, species_to_uniprot_idx
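# A minimal calling sketch (the values here are hypothetical; dataset,
# input_settings, and alg_settings would come from this project's config files):
#
#   net_obj, new_net_obj, ann_obj, eval_ann_obj, sp2idx = load_net_ann_datasets(
#       "outputs/net-ann", taxon="9606", dataset=dataset,
#       input_settings=input_settings, alg_settings=alg_settings,
#       uniprot_taxon_file="inputs/uniprot-taxon-ids.txt", forcenet=False)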
def get_summary_stats(version="2018_01-toxcast-d2d-p1_5-u1_25",
                      summary_file="network_summaries.csv",
                      scope="permute-dir-undir", forced=False):
    """ Aggregate summary statistics for every response network.
    Returns a dataframe containing the counted metrics for each chemical.
    """
    TOXCAST_DATA = t_utils.loadToxcastData(t_settings.INTERACTOMES[version])
    #inputs_dir = "inputs/%s/" % (version)
    t_settings.set_version(version)
    inputs_dir = t_settings.INPUTSPREFIX
    outputs_dir = "outputs/%s/weighted" % (version)
    chemicals = utils.readItemList("%s/chemicals.txt" % (inputs_dir), 1)
    #hits_template = "%s/hit-prots/%%s-hit-prots.txt" % (inputs_dir)
    #nonhits_template = "%s/hit-prots/%%s-nonhit-prots.txt" % (inputs_dir)
    #rec_tfs_template = "%s/rec-tfs/%%s-rec-tfs.txt" % (inputs_dir)
    chem_rec, chem_tfs = TOXCAST_DATA.chemical_rec, TOXCAST_DATA.chemical_tfs
    chem_prot_hit_vals = TOXCAST_DATA.chemical_protein_hit
    paths_dir = "%s/edgelinker" % (outputs_dir)
    paths_template = "%s/%%s-paths.txt" % (paths_dir)
    out_dir = "%s/stats/summary" % (outputs_dir)
    t_utils.checkDir(out_dir)
    summary_file = "%s/%s" % (out_dir, summary_file)
    if os.path.isfile(summary_file) and not forced:
        print("Reading network summary stats from '%s'. "
              "Set forced to True to overwrite it." % (summary_file))
        df = pd.read_csv(summary_file, index_col=0)
    else:
        print("Reading in the stats from the response networks in", paths_dir)
        chemical_names, chemical_name_to_id = t_utils.getChemicalNameMaps()
        chemical_names = {chemical: chemical_names[chemical] for chemical in chemicals}
        chemical_prots = {}
        chemical_num_paths = {}
        chemical_num_edges = {}
        chemical_avg_path_lengths = {}
        chemical_rec = {}
        chemical_tfs = {}
        chemical_net_rec = {}
        chemical_net_tfs = {}
        chemical_hits = {}
        chemical_nonhits = {}
        chemical_net_hits = {}
        chemical_net_nonhits = {}
        chemical_inter_hits = {}
        chemical_inter_nonhits = {}
        chemical_inter_net_hits = {}
        chemical_inter_net_nonhits = {}
        # also get the p-value and q-value for each chemical
        chemical_pvals = {}
        pvals_file = "%s/stats/stat-sig-%s/gpd-pval.txt" % (outputs_dir, scope)
        if os.path.isfile(pvals_file):
            with open(pvals_file, 'r') as file_handle:
                header = file_handle.readline().rstrip().split('\t')
            pval_col = header.index("200") + 1
            chemical_pvals = {chem: pval for chem, pval
                              in utils.readColumns(pvals_file, 1, pval_col)}
        chemical_qvals = {}
        qvals_file = "%s/stats/stat-sig-%s/bfcorr_pval_qval.txt" % (outputs_dir, scope)
        if os.path.isfile(qvals_file):
            chemical_qvals = t_utils.getPvals(outputs_dir, scope, sig_cutoff_type="FDR")
        for chemical in tqdm(chemicals):
            #prots, paths = getProteins(paths=paths_template % chemical, max_k=200, ties=True)
            paths = t_utils.getPaths(paths_template % chemical, max_k=200, ties=True)
            prots = set()
            num_paths = len(paths)
            edges = set()
            path_lengths = []
            for path in paths:
                path = path.split('|')
                # the path length is the number of edges in the path
                path_lengths.append(len(path) - 1)
                prots = prots.union(set(path))
                for i in range(len(path) - 1):
                    edges.add((path[i], path[i + 1]))
            chemical_prots[chemical] = len(prots)
            chemical_num_paths[chemical] = num_paths
            chemical_avg_path_lengths[chemical] = np.mean(path_lengths)
            chemical_num_edges[chemical] = len(edges)
            #rec, tfs = t_utils.getRecTFs(rec_tfs_template % chemical)
            rec, tfs = chem_rec[chemical], chem_tfs[chemical]
            chemical_rec[chemical] = len(rec)
            chemical_tfs[chemical] = len(tfs)
            chemical_net_rec[chemical] = len(prots.intersection(rec))
            chemical_net_tfs[chemical] = len(prots.intersection(tfs))
            # read the hits and nonhits for each chemical to count
            # how many of them are in the network
            #hits = utils.readItemSet(hits_template % chemical, 1)
            #nonhits = utils.readItemSet(nonhits_template % chemical, 1)
            hits = set(p for p, hit_val in chem_prot_hit_vals[chemical].items()
                       if hit_val == 1)
            nonhits = set(p for p, hit_val in chem_prot_hit_vals[chemical].items()
                          if hit_val == 0)
            chemical_hits[chemical] = len(hits)
            chemical_nonhits[chemical] = len(nonhits)
            chemical_net_hits[chemical] = len(hits.intersection(prots))
            chemical_net_nonhits[chemical] = len(nonhits.intersection(prots))
            # subtract the rec and tfs to get just the intermediate hits and nonhits
            chemical_inter_hits[chemical] = len(hits.difference(rec.union(tfs)))
            chemical_inter_nonhits[chemical] = len(nonhits.difference(rec.union(tfs)))
            chemical_inter_net_hits[chemical] = len(
                hits.intersection(prots).difference(rec.union(tfs)))
            chemical_inter_net_nonhits[chemical] = len(
                nonhits.intersection(prots).difference(rec.union(tfs)))
        # write these metrics to a file
        df = pd.DataFrame({
            "name": chemical_names,
            "prots": chemical_prots,
            "num_paths": chemical_num_paths,
            "pvals": chemical_pvals,
            "qvals": chemical_qvals,
            "num_edges": chemical_num_edges,
            "avg_path_lengths": chemical_avg_path_lengths,
            "net_rec": chemical_net_rec,
            "net_tfs": chemical_net_tfs,
            "hit_rec": chemical_rec,
            "hit_tfs": chemical_tfs,
            "net_hits": chemical_net_hits,
            "net_nonhits": chemical_net_nonhits,
            "hits": chemical_hits,
            "nonhits": chemical_nonhits,
            "inter_net_hits": chemical_inter_net_hits,
            "inter_net_nonhits": chemical_inter_net_nonhits,
            "inter_hits": chemical_inter_hits,
            "inter_nonhits": chemical_inter_nonhits,
        })
        print("Writing: ", summary_file)
        df.to_csv(summary_file, header=True, columns=[
            'name', 'prots', 'num_paths', 'num_edges', 'avg_path_lengths',
            'hits', 'nonhits', 'net_hits', 'net_nonhits',
            'hit_rec', 'hit_tfs', 'net_rec', 'net_tfs',
            'inter_net_hits', 'inter_net_nonhits', 'inter_hits', 'inter_nonhits',
            'pvals', 'qvals'])
        # change the index (the chemical id) to unicode (string)
        #df.index = df.index.map(unicode)
    return df
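# A minimal usage sketch (the version string is this function's default;
# forced=True recomputes the stats instead of reading the cached CSV):
#
#   df = get_summary_stats(version="2018_01-toxcast-d2d-p1_5-u1_25", forced=False)
#   print(df[["prots", "num_paths", "pvals"]].head())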
def permute_and_run_edgelinker(opts, random_index):
    if opts.write_score_counts:
        rand_scores_k = "%s/rand-networks/rand-%d-med-scores-k.txt" % (
            opts.write_score_counts, random_index)
        # if the final score counts file already exists, then don't do anything
        if os.path.isfile(rand_scores_k) and not opts.forced:
            print("%s already exists. Skipping." % (rand_scores_k))
            return
        chemical_k_scores = "%s/chemical-k-median-scores.txt" % (opts.write_score_counts)
        if not os.path.isfile(chemical_k_scores):
            print("Error: %s does not exist. Run compute_stat_sig.py with the "
                  "--write-counts option to write it. Quitting." % (chemical_k_scores))
            return
    t_utils.checkDir("%s/networks" % (opts.out_dir))
    rec_tfs_file_template = "%s/rec-tfs/%%s-rec-tfs.txt" % (opts.inputs_dir)
    chemicals = sorted(utils.readItemList("%s/chemicals.txt" % (opts.inputs_dir), col=1))
    if opts.single_chem:
        chemicals = opts.single_chem
    if opts.permute_rec_tfs is not None:
        # if specified, "permute" the sets of receptors and tfs for each chemical
        # instead of the interactome
        print("Writing random sets of rec/tfs for each chemical to %s" % (opts.out_dir))
        rec_tfs_file_template = "%s/%%s/%d-random-rec-tfs.txt" % (opts.out_dir, random_index)
        all_rec, all_tfs = t_utils.getRecTFs(opts.permute_rec_tfs)
        #chemical_num_rectfs_file = "%s/chemical_num_rectfs.txt" % (opts.inputs_dir)
        #lines = utils.readColumns(chemical_num_rectfs_file, 2, 3, 4)
        #for chem, num_rec, num_tfs in tqdm(lines):
        for chemical in tqdm(chemicals, disable=opts.verbose):
            out_file = rec_tfs_file_template % (chemical)
            if not os.path.isfile(out_file) or opts.forced:
                rec, tfs, costs, zscores = t_utils.getRecTFs(
                    t_settings.REC_TFS_FILE % (opts.inputs_dir, chemical), costs=True)
                rec = list(rec)
                tfs = list(tfs)
                out_dir = "%s/%s" % (opts.out_dir, chemical)
                t_utils.checkDir(out_dir)
                random_rec = random.sample(all_rec, len(rec))
                # apply the costs to the random rec and tfs
                for i in range(len(rec)):
                    costs[random_rec[i]] = costs[rec[i]]
                    zscores[random_rec[i]] = zscores[rec[i]]
                random_tfs = random.sample(all_tfs, len(tfs))
                for i in range(len(tfs)):
                    costs[random_tfs[i]] = costs[tfs[i]]
                    zscores[random_tfs[i]] = zscores[tfs[i]]
                t_utils.writeRecTFs(out_file, random_rec, random_tfs,
                                    costs=costs, zscores=zscores)
        # use the original interactome
        permuted_network_out_file = opts.interactome
        print("Using the original interactome %s" % (permuted_network_out_file))
    else:
        # the default is to permute the interactome
        permuted_network_out_file = "%s/networks/permuted-network%d.txt" % (
            opts.out_dir, random_index)
        if not os.path.isfile(permuted_network_out_file) or opts.forced:
            # don't log transform; the weights will be log transformed by the edgelinker code
            #G = cycLinker.readNetwork(opts.interactome, weight=True, logtransform=False)
            # UPDATE 2017-12-07: use the direction of the edges from the fourth column
            # of the interactome instead of splitting based on whether the edge is bidirected
            G = nx.DiGraph()
            dir_edges = []
            undir_edges = []
            lines = utils.readColumns(opts.interactome, 1, 2, 3, 4)
            if len(lines) == 0:
                print("ERROR: the interactome should have 4 columns: a, b, w, "
                      "and True/False for directed/undirected. Quitting.")
                sys.exit()
            for u, v, w, directed in lines:
                G.add_edge(u, v, weight=float(w))
                if directed.lower() in ['true', 't', 'dir', 'directed']:
                    dir_edges.append((u, v))
                elif directed.lower() not in ['false', 'f', 'undir', 'undirected']:
                    print("ERROR: unknown directed edge type '%s'. The 4th column should "
                          "be T/F to indicate directed/undirected." % (directed.lower()))
                    print("Quitting.")
                    sys.exit()
                elif u < v:
                    undir_edges.append((u, v))
            if opts.undirected:
                # swap all edges as undirected edges
                permG = permute_network.permute_network(
                    G.to_undirected(), num_iterations=opts.num_iterations)
                permG = permG.to_directed()
            elif opts.split_by_weight:
                # if specified by the user, split the edges into bins by weight
                # and swap the directed and undirected edges separately
                permG = permute_network.permute_network(
                    G, swap_phys_sig_sep=opts.swap_phys_sig_sep,
                    split_weight=opts.split_by_weight,
                    num_iterations=opts.num_iterations)
            elif opts.swap_phys_sig_sep:
                # swap the directed and undirected edges separately
                permG = permute_network.permute_network(
                    G, swap_phys_sig_sep=opts.swap_phys_sig_sep,
                    num_iterations=opts.num_iterations,
                    edge_lists=(undir_edges, dir_edges))
            else:
                # if none of the options are specified, swap everything as directed edges
                permG = permute_network.permute_network(
                    G, num_iterations=opts.num_iterations)
            print("Writing %s" % (permuted_network_out_file))
            nx.write_weighted_edgelist(permG, permuted_network_out_file,
                                       comments='#', delimiter='\t')
        else:
            print("Using %s" % (permuted_network_out_file))
    # now run edgelinker on each of the chemicals using the permuted network
    # if the version is netpath, use the different type of input file
    # TODO fix this
    # PATHLINKERDATAVERSIONS
    #if 'kegg' in opts.inputs_dir or 'netpath' in opts.inputs_dir:
    #    rec_tfs_file_template = "%s/rec-tfs/%%s-nodes.txt" % (opts.inputs_dir)
    in_files = []
    out_files = []
    for chemical in tqdm(chemicals, disable=opts.verbose):
        rec_tfs_file = rec_tfs_file_template % (chemical)
        in_files.append(os.path.abspath(rec_tfs_file))
        out_dir = "%s/%s" % (opts.out_dir, chemical)
        t_utils.checkDir(out_dir)
        out_pref = "%s/%d-random" % (out_dir, random_index)
        out_files.append(os.path.abspath(out_pref))
    # the python implementation of edgelinker was taking too long,
    # so use the java implementation for now
    #run_write_edgelinker(permG, rec_tfs_file, opts.k, out_pref)
    # write the in and out files to the networks dir
    edgelinker_in_files = "%s/networks/permuted-network%d-infiles.txt" % (
        opts.out_dir, random_index)
    with open(edgelinker_in_files, 'w') as out:
        out.write('\n'.join(in_files))
    edgelinker_out_files = "%s/networks/permuted-network%d-outfiles.txt" % (
        opts.out_dir, random_index)
    with open(edgelinker_out_files, 'w') as out:
        out.write('\n'.join(out_files))
    print("Running edgelinker on %d chemicals" % (len(chemicals)))
    run_edgelinker.runEdgeLinker(
        permuted_network_out_file, edgelinker_in_files, edgelinker_out_files,
        opts.k, edge_penalty=EDGE_PENALTY, rec_tfs_penalty=REC_TFS_PENALTY,
        multi_run=True)
    if opts.write_score_counts:
        # now that edgelinker has been run on all of the chemical sources/targets,
        # get the path counts for the chemical network's path scores.
        # import compute_stat_sig.py and run the code directly;
        # this avoids the issues of re-importing the libraries from baobab
        print("Writing the counts for each of the scores for random index '%d'" % (random_index))
        stat_sig = compute_stat_sig.StatSig(
            random_paths_dir=opts.out_dir, k_limit=opts.k,
            num_random=(random_index, random_index),
            out_dir=opts.write_score_counts)
        stat_sig.write_rand_counts(chemicals=chemicals, forced=opts.forced)
        #cmd = "python src/compute_stat_sig.py " + \
        #      " --chemicals %s/chemicals.txt " % (opts.inputs_dir) + \
        #      " --random-paths-dir %s/ " % (opts.out_dir) + \
        #      " -P --k-limit %d " % (opts.k) + \
        #      " --num-random %d %d" % (random_index, random_index) + \
        #      " --group-by-prob " + \
        #      " --write-rand-counts " + \
        #      " --out-dir %s " % (opts.write_score_counts)
        #if opts.forced:
        #    cmd += " --forced "
        #print(cmd)
        #subprocess.check_call(cmd.split())
    #if opts.run_mgsa_random:
    #    run_mgsa_random(random_index)
    if opts.cleanup:
        print("Deleting the generated permuted network and the edgelinker output files")
        if permuted_network_out_file != opts.interactome:
            os.remove(permuted_network_out_file)
        os.remove(edgelinker_in_files)
        # remove the individual output files
        for cyc_out_file in out_files:
            # 2017-02-17: temporarily don't remove the paths file, for running MGSA
            os.remove(cyc_out_file + "-paths.txt")
            os.remove(cyc_out_file + "-ranked-edges.txt")
        os.remove(edgelinker_out_files)
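# A minimal usage sketch (hypothetical: opts would normally come from this
# script's option parser, and num_random is an assumed option name):
#
#   (opts, args) = parser.parse_args()
#   for random_index in range(1, opts.num_random + 1):
#       permute_and_run_edgelinker(opts, random_index)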