def write_global_rec_tfs(self, rec_tfs_file, out_file):
    """
    Write the receptors and tfs from the input file that are usable in the interactome.

    A receptor is kept only if it is a node of ``self.net`` with at least one
    outgoing edge; a tf is kept only if it is a node of ``self.net`` with at
    least one incoming edge. Proteins listed as both a receptor and a tf are
    removed from BOTH sets.

    rec_tfs_file: file read with ``t_utils.getRecTFs``.
    out_file: path handed to ``t_utils.writeRecTFs``; its directory is passed
        to ``t_utils.checkDir`` first.
    """
    print("\t Creating the '%s' file set of receptors and tfs from %s" % (out_file, rec_tfs_file))
    # build the network lazily, only if it has not been built yet
    if not self.net:
        self.net = self.build_network(self.interactome)

    # first get the list of all human rec and tfs
    receptors, tfs = t_utils.getRecTFs(rec_tfs_file)

    # remove receptors and tfs that aren't in the interactome
    # (receptors need outgoing edges, tfs need incoming edges)
    receptors = {
        r for r in receptors
        if r in self.net and len(self.net.out_edges(r)) > 0
    }
    tfs = {
        tf for tf in tfs
        if tf in self.net and len(self.net.in_edges(tf)) > 0
    }

    # No protein should be both a rec and tf.
    # Compute the overlap once, BEFORE modifying either set. Filtering the
    # receptors first and then filtering the tfs against the already-filtered
    # receptors would leave the shared proteins in the tfs set.
    in_both = receptors & tfs
    receptors -= in_both
    tfs -= in_both

    # now write the output file
    t_utils.checkDir(os.path.dirname(out_file))
    t_utils.writeRecTFs(out_file, receptors, tfs)
def call_post_to_graphspace(version, chemicals, **kwargs):
    """
    Build and post the response network of every chemical to GraphSpace.

    version: passed to ``t_settings.set_version`` to get the input/result
        prefixes and the interactome; it is also appended to the graph tags.
    chemicals: iterable of chemical IDs (keys of the global ``chemIDtoName``).
    kwargs: must contain 'forcepost', 'k_to_post' and 'evidence_file'.
        'revigo_file' / 'term_counts_file' trigger writing the color files,
        'tags' and 'name_postfix' are optional. Everything is forwarded to
        ``build_graph_and_post``.
    """
    INPUTSPREFIX, RESULTSPREFIX, interactome = t_settings.set_version(version)
    t_utils.checkDir("%s/graphspace" % (RESULTSPREFIX))

    # optionally write a color file per chemical; maps chemical -> color file
    chemical_color_files = None
    wants_colors = kwargs.get('revigo_file') or kwargs.get('term_counts_file')
    if wants_colors:
        chemical_color_files, function_colors = write_revigo_color_files(
            chemicals, RESULTSPREFIX, forced=kwargs['forcepost'], **kwargs)
        kwargs['function_colors'] = function_colors
        print(chemical_color_files)

    # always tag the graphs with the version
    given_tags = kwargs.get('tags')
    if given_tags:
        kwargs['tags'] = given_tags + [version]
    else:
        kwargs['tags'] = [version]

    postfix = kwargs.get('name_postfix', '')
    for chemical in chemicals:
        # '%', '[' and ']' in the chemical name break posting, so strip them
        chemName = chemIDtoName[chemical]
        for bad_char in ('%', '[', ']'):
            chemName = chemName.replace(bad_char, '')

        rec_tfs_file = t_settings.REC_TFS_FILE % (INPUTSPREFIX, chemical)
        edgelinker_output_file = "%s/edgelinker/%s-paths.txt" % (RESULTSPREFIX, chemical)
        output_json = "%s/graphspace/%s-graph%s.json" % (RESULTSPREFIX, chemical, postfix)

        # the number of paths (including ties) is used as max_k when posting
        _proteins, num_paths = t_utils.getProteins(
            paths=edgelinker_output_file, max_k=kwargs['k_to_post'], ties=True)

        already_posted = os.path.isfile(output_json)
        if already_posted and not kwargs['forcepost']:
            print("%s already exists. Use --forcepost to overwrite it" % (output_json))
            continue

        if chemical_color_files is None:
            graph_attr_file = None
        else:
            graph_attr_file = chemical_color_files.get(chemical)

        build_graph_and_post(
            version, interactome, rec_tfs_file, RESULTSPREFIX,
            edgelinker_output_file, chemical,
            max_k=num_paths,
            graph_name="%s-%s-%s%s" % (chemName, chemical, version, postfix),
            graph_attr_file=graph_attr_file,
            ev_file=kwargs['evidence_file'],
            out_pref="%s/graphspace/%s%s" % (RESULTSPREFIX, chemical, postfix),
            **kwargs)
def write_assayed_rec_tfs(self, rec_tfs_files, out_file):
    """
    Write the union of the receptors and tfs of all input files that are usable in the interactome.

    Should only be called after each chemical's list of rec and tfs has been written.

    A receptor is kept only if it is a node of ``self.net`` with at least one
    outgoing edge; a tf only if it is a node with at least one incoming edge.
    Proteins that end up in both sets are removed from BOTH sets.

    rec_tfs_files: sized collection of files, each read with ``t_utils.getRecTFs``.
    out_file: path handed to ``t_utils.writeRecTFs``; its directory is passed
        to ``t_utils.checkDir`` first.
    """
    print(
        "\tCreating the '%s' file set of receptors and tfs from %d rec_tfs_files"
        % (out_file, len(rec_tfs_files)))
    # build the network lazily, only if it has not been built yet
    if not self.net:
        self.net = self.build_network(self.interactome)

    # first get all of the receptors and tfs
    receptors = set()
    tfs = set()
    for rec_tfs_file in rec_tfs_files:
        rec, t = t_utils.getRecTFs(rec_tfs_file)
        receptors.update(rec)
        tfs.update(t)

    # remove receptors and tfs that aren't in the interactome
    orig_len_rec = len(receptors)
    orig_len_tfs = len(tfs)
    receptors = {
        r for r in receptors
        if r in self.net and len(self.net.out_edges(r)) > 0
    }
    tfs = {
        tf for tf in tfs
        if tf in self.net and len(self.net.in_edges(tf)) > 0
    }
    print("\tRemoved %d recptors and %d tfs not in the interactome" %
          (orig_len_rec - len(receptors), orig_len_tfs - len(tfs)))

    # No protein should be both a rec and tf.
    # Compute the overlap once, BEFORE modifying either set. Filtering the
    # receptors first and then the tfs against the filtered receptors would
    # never remove anything from the tfs (and always report 0 removed tfs).
    orig_len_rec = len(receptors)
    orig_len_tfs = len(tfs)
    in_both = receptors & tfs
    receptors -= in_both
    tfs -= in_both
    print(
        "\tRemoved %d recptors and %d tfs that were in both the receptors and tfs sets"
        % (orig_len_rec - len(receptors), orig_len_tfs - len(tfs)))

    # now write the output file
    t_utils.checkDir(os.path.dirname(out_file))
    t_utils.writeRecTFs(out_file, receptors, tfs)
def write_all_rec_tfs(self, out_file):
    """
    Write every assayed receptor and tf accession found in the interactome, with its assay name.

    The output is tab-separated with the header
    ``#uniprot_acc  node_type  assay_name``. Receptor assays are written
    first (node_type 'receptor'), then tf assays (node_type 'tf'). An
    accession is written only if it is a node of ``self.net``.

    out_file: path of the file to write; its directory is passed to
        ``t_utils.checkDir`` first.
    """
    print("\t Creating the '%s' file set of receptors and tfs" % (out_file))
    # build the network lazily, only if it has not been built yet
    if not self.net:
        self.net = self.build_network(self.interactome)

    t_utils.checkDir(os.path.dirname(out_file))
    # use a context manager so the file is closed even if a lookup below raises
    with open(out_file, 'w') as out:
        out.write("#uniprot_acc\tnode_type\tassay_name\n")
        assay_groups = (
            ('receptor', self.receptor_assays),
            ('tf', self.tf_assays),
        )
        for node_type, assays in assay_groups:
            for assay in assays:
                for acc in self.assayNametoAccListHuman[assay]:
                    if acc in self.net:
                        out.write(acc + '\t' + node_type + '\t' + assay + '\n')
def load_prots(chemicals, paths_dir, out_dir, k_limit=200, **kwargs):
    """
    Get the proteins of each chemical and the directory for its reports.

    If ``kwargs['run_on_hits']`` is set, the proteins come from the
    module-level ``toxcast_data.chemical_protein_hit``. Otherwise they are
    read from the cached ``<out_dir>/chem-prots.txt`` when it exists and
    ``kwargs['forced']`` is not set, or else computed from the paths files
    ``<paths_dir>/<chemical>-paths.txt`` (limited to ``k_limit``) and cached.

    Returns ``(chem_prots, reports_dir)``; ``reports_dir`` (minus the
    trailing slash) is passed to ``t_utils.checkDir`` before returning.
    """
    if kwargs['run_on_hits']:
        chem_prots = toxcast_data.chemical_protein_hit
        reports_dir = "%s/chemical-hits-reports/" % (out_dir)
    else:
        # one chemical and protein ID pair on each line
        prots_file = "%s/chem-prots.txt" % (out_dir)
        if not kwargs['forced'] and os.path.isfile(prots_file):
            print("reading %s. Use --forced to overwrite" % (prots_file))
            # The `squeeze=True` keyword of read_csv was removed in pandas 2.0;
            # squeezing the single data column afterwards gives the same Series.
            s = pd.read_csv(prots_file, sep='\t', index_col=0,
                            header=None).squeeze("columns")
            # now convert it back to a dictionary
            chem_prots = {
                chem: prots.to_list()
                for chem, prots in s.groupby(s.index)
            }
        else:
            # load the proteins in each chemical's network
            edgelinker_output = paths_dir + '/%s-paths.txt'
            print("Reading paths for each chemical network from: %s" % (edgelinker_output))
            chem_prots = {}
            for chemical in chemicals:
                proteins = t_utils.getProteins(
                    paths=edgelinker_output % chemical, max_k=k_limit)
                chem_prots[chemical] = list(proteins)
            # one row per (chemical, protein) pair
            s = pd.Series(chem_prots).explode()
            print("writing %s" % (prots_file))
            s.to_csv(prots_file, sep='\t', header=False)
        reports_dir = "%s/chemical-reports/" % (out_dir)
    t_utils.checkDir(os.path.dirname(reports_dir))
    return chem_prots, reports_dir
# first plot the distribution of edge weights in the response networks # get all of the response network edges and their edge weight in the interactome cyclinker_file = "%s/cyclinker/%%s-paths.txt" % (t_settings.RESULTSPREFIX) for chemical in tqdm(chemicals): edges = t_utils.getEdges(paths_file=cyclinker_file % (chemical), max_k=200, ties=True) #tqdm.write("%d edges" % len(edges)) for edge in edges: network_weights.append(edge_weights[edge]) # now plot the distribution of edge weights #out_file_name = "response-network-weight-dist-dir-k500-%s.png" % (version) out_file_name = "response-network-weight-dist-k200-%s.png" % (version) out_dir = "%s/plots/edge-weights/" % (t_settings.RESULTSPREFIX) t_utils.checkDir(out_dir) out_file = "%s/%s" % (out_dir, out_file_name) if opts.compare_versions: out_dir_compare_versions = "viz/version_plots/edge-weights/" t_utils.checkDir(out_dir_compare_versions) out_file_compare_versions = "%s/%s" % (out_dir_compare_versions, out_file_name) print("Plotting response network edge weight histogram to %s" % (out_file)) fig, ax = plt.subplots() print("%s of edges with a weight > %s" % (len([w for w in network_weights if w > 0.95]) / float(len(network_weights)), 0.95)) ax.hist(network_weights, bins=30)
def main(chemicals, paths_dir, out_dir, k_limit=200, forced=False, pval_cutoff=0.05, corr_type="BF", **kwargs):
    """
    Run the DAVID functional annotation chart for each chemical and combine the significant terms.

    chemicals: chemical IDs to analyze.
    paths_dir: directory containing ``<chemical>-paths.txt`` files.
    out_dir: directory for the cached protein lists, reports and summaries.
    k_limit: ``max_k`` passed to ``t_utils.getProteins``.
    forced: recompute/overwrite cached files instead of reusing them.
    pval_cutoff: terms are kept when the selected p-value column is below this.
    corr_type: "BF" selects the "Bonferroni" column, "BH" the "Benjamini"
        column, anything else the raw "Pvalue" column.
    kwargs: must contain 'run_on_hits'; when set, the proteins come from
        ``t_utils.loadToxcastData().chemical_protein_hit``.

    Writes one chart file per chemical, a combined significant-terms file and
    a per-term counts file. Returns None.
    """
    t_utils.checkDir(out_dir)
    # one chemical and protein ID pair on each line
    prots_file = "%s/chem-prots.txt" % (out_dir)
    if kwargs['run_on_hits']:
        toxcast_data = t_utils.loadToxcastData()
        chem_prots = toxcast_data.chemical_protein_hit
        reports_dir = "%s/chemical-hits-reports/" % (out_dir)
    else:
        if not forced and os.path.isfile(prots_file):
            print("reading %s. Use --forced to overwrite" % (prots_file))
            # The `squeeze=True` keyword of read_csv was removed in pandas 2.0;
            # squeezing the single data column afterwards gives the same Series.
            s = pd.read_csv(prots_file, sep='\t', index_col=0,
                            header=None).squeeze("columns")
            # now convert it back to a dictionary
            chem_prots = {
                chem: prots.to_list()
                for chem, prots in s.groupby(s.index)
            }
        else:
            # load the proteins in each chemical's network
            cyclinker_output = paths_dir + '/%s-paths.txt'
            print("Reading paths for each chemical network from: %s" % (cyclinker_output))
            chem_prots = {}
            for chemical in chemicals:
                proteins = t_utils.getProteins(
                    paths=cyclinker_output % chemical, max_k=k_limit)
                chem_prots[chemical] = list(proteins)
            s = pd.Series(chem_prots).explode()
            print("writing %s" % (prots_file))
            s.to_csv(prots_file, sep='\t', header=False)
        reports_dir = "%s/chemical-reports/" % (out_dir)
    t_utils.checkDir(os.path.dirname(reports_dir))

    # run the DAVID analysis on each of them.
    # The client is created lazily so nothing is set up if every chart exists.
    client = None
    for chem in tqdm(chemicals):
        chart_file = "%s/%s.txt" % (reports_dir, chem)
        if not forced and os.path.isfile(chart_file):
            print("%s already exists. Use --forced to overwrite" % (chart_file))
            continue
        if client is None:
            print("Setting up david client")
            client = david_client.DAVIDClient()
            client.set_category('GOTERM_BP_DIRECT')
        print(chem)
        prots = chem_prots[chem]
        # pass the list of proteins
        client.setup_inputs(','.join(prots), idType='UNIPROT_ACCESSION', listName=chem)
        # build the functional annotation chart and write it to a file
        client.build_functional_ann_chart()
        client.write_functional_ann_chart(chart_file)

    pval_col = "Pvalue"
    if corr_type == "BF":
        pval_col = "Bonferroni"
    elif corr_type == "BH":
        pval_col = "Benjamini"

    # now read each of them and write a combined file
    dfs = []
    for chem in chemicals:
        chart_file = "%s/%s.txt" % (reports_dir, chem)
        if not os.path.isfile(chart_file):
            print("%s doesn't exist. Skipping" % (chart_file))
            continue
        df = pd.read_csv(chart_file, sep='\t')
        # apply the p-value cutoff; copy so that adding columns below does
        # not operate on a view of the original frame
        df = df.loc[df[pval_col] < pval_cutoff, ['Term', pval_col]].copy()
        # split the name and id ("<GOID>~<term name>")
        df['GOID'] = df['Term'].apply(lambda x: x.split('~')[0])
        df['Term'] = df['Term'].apply(lambda x: x.split('~')[1])
        df['Chemical'] = chem
        print(len(df))
        dfs.append(df)

    # pd.concat raises ValueError on an empty list, so stop cleanly instead
    if not dfs:
        print("No chart files found in %s. Nothing to combine" % (reports_dir))
        return

    df_all = pd.concat(dfs)
    print(df_all.head())
    all_terms_file = "%s/%schemical%s-sig-terms-%s-c%s.tsv" % (
        out_dir, len(chemicals), "-hits" if kwargs['run_on_hits'] else "s",
        pval_col.lower(), str(pval_cutoff).replace('.', '_'))
    df_all.to_csv(all_terms_file, sep='\t', index=None,
                  columns=['Chemical', 'Term', 'GOID', pval_col])

    # now compare the overlap of the enriched terms:
    # count in how many chemicals each (Term, GOID) pair is significant
    counts = df_all[['Term', 'GOID']].value_counts()
    print(counts)
    counts_file = "%s/%schemicals-sig-terms-%s-c%s-counts.tsv" % (
        out_dir, len(chemicals), pval_col.lower(),
        str(pval_cutoff).replace('.', '_'))
    print("writing to %s" % (counts_file))
    counts.to_csv(counts_file, header=False, sep='\t')
def write_chemical_perturbed_rec_tfs(self, chemicals_file, rec_tfs_dir, include_zscore_weight=False):
    """
    Write the chemicals file as well as the perturbed rec and tfs for each chemical.

    A single ``<rec_tfs_dir>/<chemical ID>-rec-tfs.txt`` file is written per
    chemical so each file can be used for running pathlinker/cyclinker.
    Summary files are also written: ``chem_rec_tfs.gmt`` and
    ``chemical_num_rectfs.txt`` in the parent of ``rec_tfs_dir``, and
    ``all-rec-tfs.txt`` in ``rec_tfs_dir``.

    chemicals_file: output path for the "<chemical ID>\\t<name>" lines.
    rec_tfs_dir: output directory of the per-chemical files.
    include_zscore_weight: if truthy, also pass per-protein zscores and the
        costs ``1 - zscore / max zscore`` to ``t_utils.writeRecTFs``.
    """
    print(
        "\tWriting the chemicals file (%s) as well as the perturbed rec and tfs for each chemical in %s"
        % (chemicals_file, rec_tfs_dir))
    t_utils.checkDir(rec_tfs_dir)
    # first write a general file with the hit rec and tf per chemical
    out_file = "%s/../chem_rec_tfs.gmt" % (rec_tfs_dir)
    self.write_chem_rec_tfs(out_file)
    # also write a table with the number of hit rec and tfs per chemical
    out_file = "%s/../chemical_num_rectfs.txt" % (rec_tfs_dir)
    self.write_chem_num_rectfs(out_file)

    # keep track of all of the receptors and tfs and write them to a file as well
    all_rec = set()
    all_tfs = set()
    # snapshot the keys so the loop below does not iterate a live dict view
    chemicals = list(self.chemical_rec.keys())
    # first write the chemicals
    with open(chemicals_file, 'w') as out:
        out.write('\n'.join([
            "%s\t%s" % (chemical, self.chemIDtoName[chemical])
            for chemical in chemicals
        ]))

    if include_zscore_weight:
        zscores = []
        for chem, prots in self.chemical_rec.items():
            zscores += [self.chemical_protein_zscore[chem][p] for p in prots]
        for chem, prots in self.chemical_tfs.items():
            zscores += [self.chemical_protein_zscore[chem][p] for p in prots]
        # use the maximum to normalize the zscores
        # NOTE(review): null zscores are not filtered out here, only in the
        # per-chemical loop below -- confirm the maximum cannot be NaN
        max_zscore = max(zscores)
        print("max zscore is: %0.2f" % max_zscore)

    for chem in tqdm(chemicals):
        # some of the chemicals have spaces in their names, so use the ID rather than the name.
        rec = set(self.chemical_rec[chem])
        tfs = set(self.chemical_tfs[chem])
        all_rec.update(rec)
        all_tfs.update(tfs)
        rec_tfs_file = "%s/%s-rec-tfs.txt" % (rec_tfs_dir, chem)
        if not include_zscore_weight:
            # the file name must use the loop variable `chem`; `chemical`
            # only exists inside the list comprehension above
            t_utils.writeRecTFs(rec_tfs_file, rec, tfs)
        else:
            # convert the zscore to a cost by taking 1 - (zscore / max zscore)
            # the lower the zscore, the higher the cost will be
            zscores = {}
            curr_zscores = self.chemical_protein_zscore[chem]
            for prots in (self.chemical_rec[chem], self.chemical_tfs[chem]):
                for p in prots:
                    zscore = curr_zscores[p]
                    # a null zscore is treated as 0 (i.e., the maximum cost)
                    zscores[p] = zscore if not pd.isnull(zscore) else 0
            costs = {
                p: 1 - (zscore / float(max_zscore))
                for p, zscore in zscores.items()
            }
            t_utils.writeRecTFs(rec_tfs_file, rec, tfs,
                                costs=costs, zscores=zscores)

    out_file = "%s/all-rec-tfs.txt" % (rec_tfs_dir)
    print("Writing all of the assayed receptors and tfs to the file: %s" % (out_file))
    t_utils.writeRecTFs(out_file, all_rec, all_tfs)
def __init__(self, include_nuclear_receptors=False, forced=False, verbose=False):
    """
    Set the options, the input/output file paths and the empty containers.

    include_nuclear_receptors: whether nuclear receptors are included.
    forced: stored as ``self.forced``.
    verbose: option to print various parsing statistics.

    ``t_utils.checkDir`` is called on the parsed-output directory.
    """
    # --- options ---
    self.include_nuclear_receptors = include_nuclear_receptors
    self.verbose = verbose
    self.forced = forced

    # --- input location (2019-08: updated to use toxcast v3 data) ---
    in_dir = "inputs/toxcast-tox21-v3"
    # date present in the file names of the files for this version
    date = "190708"
    self.input_dir = in_dir
    self.version_date = date

    # --- input files ---
    self.assay_file = "%s/Assay_Summary_%s.csv" % (in_dir, date)
    self.s2_assay_file = "%s/S2-ToxCast-Assays.tsv" % (in_dir)
    self.chemical_summary_file = "%s/Chemical_Summary_%s.csv" % (in_dir, date)
    self.zscore_file = "%s/zscore_Matrix_%s.csv" % (in_dir, date)
    self.hitc_file = "%s/hitc_Matrix_%s.csv" % (in_dir, date)

    # --- output files ---
    parsed = "%s/parsed" % (in_dir)
    self.parsed_dir = parsed
    t_utils.checkDir(parsed)
    self.chem_rec_tfs_file = "%s/chem_rec_tfs.gmt" % (parsed)
    self.chem_hits_file = "%s/chem_prot_hits.csv" % (parsed)
    self.chem_zscore_file = "%s/chem_prot_zscores.csv" % (parsed)

    # --- chemical mapping dictionaries ---
    self.chemIDtoName = {}
    self.chemNametoID = {}
    self.chemIDtoTYPE = {}
    self.chemTYPEtoID = defaultdict(set)

    # --- assay mapping dictionaries ---
    self.assayNametoAccHuman = {}
    self.assayAcctoNameHuman = defaultdict(set)
    self.assayNametoAccListHuman = {}
    self.assayNametoType = {}

    # --- assay type analysis ---
    self.assay_types = []
    # assay type -> list of the proteins perturbed for each chemical
    self.assay_type_hits = {}
    # chemical ID -> assay results list (0, 1, -1 or NA)
    self.chemical_assay_hits = {}
    # chemical ID -> list of z-scores
    self.chemical_assay_zscores = {}
    # list of assays in the hits file
    self.hit_assays = []

    # --- from the Assay_Summary or S2_Assay_Summary ---
    # assay -> type_sub
    self.intended_target_type_sub = {}
    # assay -> family
    self.intended_target_family = {}

    # --- receptor and tf assays (assay -> acc) ---
    self.receptor_assays = {}
    self.tf_assays = {}

    # --- per-chemical results ---
    # the sets of hit receptors and TFs (from the Assay summary file) for each chemical
    self.chemical_rec = defaultdict(set)
    self.chemical_tfs = defaultdict(set)
    # proteins are stored as uniprot accession IDs.
    # prot -> 0 or 1; a protein is labelled a 'hit' if any of the assays are 'hit'
    self.chemical_protein_hit = defaultdict(dict)
    # prot -> zscore; the largest zscore value of any of the hit (1) assays
    self.chemical_protein_zscore = defaultdict(dict)
def write_revigo_color_files(chemicals, RESULTSPREFIX, **kwargs):
    """
    Write a GO term color file for each chemical.

    If a file downloaded from REVIGO is passed in (kwargs['revigo_file']), then use that to set the term colors and boxes.
    Otherwise (kwargs['term_counts_file']), keep the terms with a Bonferroni p-value < 0.01
    whose relative frequency is below kwargs['freq_cutoff'] (default 0.75), i.e., remove the most frequent terms.
    Hopefully there isn't too much overlap in the remaining terms.
    TODO Another possible strategy is to cluster the terms myself and select a single term per cluster.

    The DAVID results of each chemical are read from
    <RESULTSPREFIX>/stats/go-analysis/chemical-reports/<chemical>.txt
    and the colors are written to <RESULTSPREFIX>/graphspace/colors/<chemical>-colors.tsv

    Returns a tuple of:
      - dictionary of chemical -> color file
      - dictionary of term ID -> dictionary with the keys 'prots', 'color', 'link' and 'name'

    NOTE(review): the second returned dictionary is rebuilt for every chemical,
    so it looks like only the values of the last chemical are returned -- confirm this is intended.
    NOTE(review): if neither 'revigo_file' nor 'term_counts_file' is given,
    selected_terms is never assigned -- presumably callers always pass one of them.
    """
    # assign a color to each term
    out_dir = "%s/graphspace/colors" % (RESULTSPREFIX)
    t_utils.checkDir(out_dir)
    #print("Writing REVIGO colors to %s for %d chemicals. (limit of %d colors)" % (out_dir, len(chemicals), len(colors)))
    chem_color_files = {}
    for chemical in chemicals:
        out_prefix = "%s/%s" % (out_dir, chemical)
        out_file = "%s-colors.tsv" % (out_prefix)
        # first read the david results file
        david_file = "%s/stats/go-analysis/chemical-reports/%s.txt" % (
            RESULTSPREFIX, chemical)
        print("reading %s" % (david_file))
        df = pd.read_csv(david_file, sep='\t')
        print(df.head())
        # the 'Term' column has the form "<term id>~<term name>"
        # get just the term ids
        df['term'] = df['Term'].apply(lambda x: x[:x.find('~')])
        df['name'] = df['Term'].apply(lambda x: x.split('~')[1])
        # build a dictionary from the term to the prots
        orig_term_names = dict(zip(df['term'], df['name']))
        # the genes of a term are stored '|'-separated instead of ', '-separated
        term_prots = {
            t: prots.replace(', ', '|')
            for t, prots in zip(df['term'], df['Genes'])
        }
        term_pvals = dict(zip(df['term'], df['Bonferroni']))
        term_names = dict(zip(df['term'], df['name']))
        name_to_term = dict(zip(df['name'], df['term']))
        # read the revigo file and extract the GO term info
        if kwargs.get('revigo_file'):
            if not os.path.isfile(kwargs['revigo_file']):
                print("ERROR: --revigo-file '%s' not found." % (kwargs['revigo_file']))
                sys.exit()
            print("reading %s" % (kwargs['revigo_file']))
            df_r = pd.read_csv(kwargs['revigo_file'], sep=',')
            print(df_r.head())
            # sort by pval
            #df_r = df_r.sort_values("log10 p-value")
            # the term names of the revigo file replace those of the david file,
            # and every term of the revigo file is selected
            term_names = dict(zip(df_r['term_ID'], df_r['description']))
            selected_terms = list(term_names.keys())
        elif kwargs.get('term_counts_file'):
            if not os.path.isfile(kwargs['term_counts_file']):
                print("ERROR: --term-counts-file '%s' not found." % (kwargs['term_counts_file']))
                sys.exit()
            print("reading %s" % (kwargs['term_counts_file']))
            df_r = pd.read_csv(kwargs['term_counts_file'], sep='\t', names=['term_name', 'count'])
            print(df_r.head())
            freq_cutoff = kwargs.get('freq_cutoff', .75)
            print("applying a frequency cutoff of %s" % (freq_cutoff))
            # frequency of a term relative to the largest count in the file
            df_r['freq'] = df_r['count'] / df_r['count'].max()
            term_freq = dict(zip(df_r['term_name'], df_r['freq']))
            # sort by pval
            df = df.sort_values("Bonferroni")
            # apply a cutoff of 0.01
            df = df[df['Bonferroni'] < 0.01]
            # keep the terms that are less frequent than the cutoff, ordered by p-value
            selected_terms = []
            for name in df['name']:
                if term_freq[name] < freq_cutoff:
                    selected_terms.append(name_to_term[name])
        # build the html popup of each selected term: a QuickGO link and the p-value
        term_popups = {}
        link_template = "<a style=\"color:blue\" href=\"https://www.ebi.ac.uk/QuickGO/GTerm?id=%s\" target=\"DB\">%s</a>"
        for term in selected_terms:
            term_link = link_template % (term, term)
            popup = "<b>QuickGO</b>: %s" % (term_link)
            popup += "<br><b>p-value</b>: %0.2e" % (float(term_pvals[term]))
            term_popups[term] = popup
        function_colors = write_colors_file(out_file, selected_terms, term_names, term_prots, term_popups)
        chem_color_files[chemical] = out_file
        # collect the prots, color, link and name of each colored term
        new_func_colors = defaultdict(dict)
        for term in function_colors:
            new_func_colors[term]['prots'] = term_prots[term]
            new_func_colors[term]['color'] = function_colors[term]
            new_func_colors[term][
                'link'] = "https://www.ebi.ac.uk/QuickGO/GTerm?id=%s" % (term)
            new_func_colors[term]['name'] = orig_term_names[term]
        # if uid in pathway_colors[pathway]['prots']:
        #     pathway_link = '<a style="color:%s" href="%s">%s</a>' % (pathway_colors[pathway]['color'], pathway_colors[pathway]['link'], pathway)
    return chem_color_files, new_func_colors
def main(chemicals, paths_dir, out_dir, pval_cutoff=0.05, corr_type="BF", **kwargs):
    """
    Test the overlap of each chemical's network with its CTD interactions.

    For every chemical a hypergeometric p-value is computed for the overlap
    between the genes of its network and the genes it interacts with in CTD,
    and the results are written to ``<out_dir>/CTD-stat-sig.tsv``.

    chemicals: chemical IDs to analyze.
    paths_dir, out_dir: forwarded to ``load_prots``; ``out_dir`` also holds
        the output file.
    pval_cutoff, corr_type: accepted but not used in this function.
    kwargs: must contain 'mapping_file' (two-column uniprot -> gene file),
        'ctd_file' and 'interactome' (tab-separated, '#' comments, node IDs
        in the first two columns); also forwarded to ``load_prots``.

    Sets the module-level globals ``toxcast_data`` and ``uniprot_to_gene``.
    """
    global toxcast_data, uniprot_to_gene
    toxcast_data = t_utils.loadToxcastData()
    chemIDtoCAS, chemCAStoID = get_chemical_map(toxcast_data)
    uniprot_to_gene_df = pd.read_csv(kwargs['mapping_file'], sep='\t', header=None)
    uniprot_to_gene = dict(uniprot_to_gene_df.values)
    t_utils.checkDir(out_dir)
    chem_prots, reports_dir = load_prots(chemicals, paths_dir, out_dir, **kwargs)
    ctd_genes, ctd_chem_itxs = load_ctd_data(kwargs['ctd_file'], chemCAStoID)

    # To get the background set of genes for the hypergeometric test,
    # get the proteins that are both in CTD and in the interactome
    print("reading %s" % (kwargs['interactome']))
    df = pd.read_csv(kwargs['interactome'], sep='\t', comment='#', header=None)
    ppi_prots = set(df[0]) | set(df[1])
    # map both to the gene name space
    ppi_genes = set(uniprot_to_gene[p] for p in ppi_prots)
    print("\t%d interactome_genes" % (len(ppi_genes)))
    background_genes = ppi_genes & ctd_genes
    print("%d genes both in the interactome and in CTD" % (len(background_genes)))
    print(
        "limiting CTD phosphorylation interactions to those in the interactome"
    )
    ctd_chem_itxs = {c: p & ppi_genes for c, p in ctd_chem_itxs.items()}
    # also add the other chemicals. The keys of chem_prots are included too,
    # since they are what the loops below look up and they are not
    # necessarily limited to `chemicals`.
    for c in list(chemicals) + list(chem_prots):
        ctd_chem_itxs.setdefault(c, set())

    chem_pval = {}
    chem_net_prots_with_ctd = {}
    # TODO try both making random subsets and the hypergeometric test
    pop_size = len(background_genes)
    for chem, prots in chem_prots.items():
        genes = set(uniprot_to_gene[p] for p in prots)
        #if len(genes) != len(prots):
        #    print("Warning: %s: num genes != num prots! (%s, %s)" % (chem, len(genes), len(prots)))
        num_genes_with_ctd = len(genes & ctd_chem_itxs[chem])
        chem_net_prots_with_ctd[chem] = num_genes_with_ctd
        # number of draws is the # genes in the network
        num_draws = len(genes)
        # number of successes is the # genes in the network with a CTD interaction
        num_successes = num_genes_with_ctd
        # number of success states in the population is the number of
        # phosphorylation interactions of this chemical
        num_success_states_in_pop = len(ctd_chem_itxs[chem])
        M, n, N, k = pop_size, num_success_states_in_pop, num_draws, num_successes
        # Use k-1 since the survival function (sf) gives 1-cdf. The cdf at k gives the
        # probability of drawing k or fewer. The sf at k is the probability of drawing k+1 or more
        # https://blog.alexlenail.me/understanding-and-implementing-the-hypergeometric-test-in-python-a7db688a7458
        # https://github.com/scipy/scipy/issues/7837
        pval = hypergeom.sf(k - 1, M, n, N)
        chem_pval[chem] = pval

    # now write to a file
    out_file = "%s/CTD-stat-sig.tsv" % (out_dir)
    print("writing %s" % (out_file))
    num_tests = len(chemicals)
    with open(out_file, 'w') as out:
        header_line = '\t'.join([
            "ChemID", "ChemName", "# net prots", "# CTD phospho prots",
            "# overlap", "pval", "BF corr-pval"
        ])
        out.write(header_line + '\n')
        for chem, prots in chem_prots.items():
            name = toxcast_data.chemIDtoName[chem]
            # a Bonferroni-corrected p-value is a probability, so cap it at 1
            bf_pval = min(1.0, chem_pval[chem] * num_tests)
            out.write('\t'.join(
                str(x) for x in [
                    chem, name,
                    len(prots),
                    len(ctd_chem_itxs[chem]), chem_net_prots_with_ctd[chem],
                    chem_pval[chem], bf_pval
                ]) + '\n')
def get_summary_stats(version="2018_01-toxcast-d2d-p1_5-u1_25", summary_file="network_summaries.csv", scope="permute-dir-undir", forced=False):
    """
    Aggregate the summary statistics of every chemical's response network.

    The statistics are read from ``outputs/<version>/weighted/stats/summary/<summary_file>``
    if that file exists and *forced* is not set. Otherwise they are computed
    from the edgelinker paths files (top 200 paths, with ties) and written
    to that file.

    Returns a dataframe containing the counted metrics for each chemical.
    """
    TOXCAST_DATA = t_utils.loadToxcastData(t_settings.INTERACTOMES[version])
    t_settings.set_version(version)
    inputs_dir = t_settings.INPUTSPREFIX
    outputs_dir = "outputs/%s/weighted" % (version)
    chemicals = utils.readItemList("%s/chemicals.txt" % (inputs_dir), 1)
    chem_rec = TOXCAST_DATA.chemical_rec
    chem_tfs = TOXCAST_DATA.chemical_tfs
    chem_prot_hit_vals = TOXCAST_DATA.chemical_protein_hit

    paths_dir = "%s/edgelinker" % (outputs_dir)
    paths_template = "%s/%%s-paths.txt" % (paths_dir)
    out_dir = "%s/stats/summary" % outputs_dir
    t_utils.checkDir(out_dir)
    summary_file = "%s/%s" % (out_dir, summary_file)

    # reuse the cached summary unless told otherwise
    if os.path.isfile(summary_file) and not forced:
        print(
            "Reading network summary stats from '%s'. Set forced to True to overwrite it."
            % (summary_file))
        return pd.read_csv(summary_file, index_col=0)

    print("Reading in the stats from the response networks in", paths_dir)
    all_names, chemical_name_to_id = t_utils.getChemicalNameMaps()
    chemical_names = {}
    for chemical in chemicals:
        chemical_names[chemical] = all_names[chemical]

    # p-value of each chemical: the column with the header "200"
    chemical_pvals = {}
    pvals_file = "%s/stats/stat-sig-%s/gpd-pval.txt" % (outputs_dir, scope)
    if os.path.isfile(pvals_file):
        with open(pvals_file, 'r') as file_handle:
            header = file_handle.readline().rstrip().split('\t')
        pval_col = header.index("200") + 1
        chemical_pvals = dict(utils.readColumns(pvals_file, 1, pval_col))

    # q-value of each chemical
    chemical_qvals = {}
    qvals_file = "%s/stats/stat-sig-%s/bfcorr_pval_qval.txt" % (outputs_dir, scope)
    if os.path.isfile(qvals_file):
        chemical_qvals = t_utils.getPvals(outputs_dir, scope, sig_cutoff_type="FDR")

    # one dictionary (chemical -> value) per metric
    chemical_prots, chemical_num_paths = {}, {}
    chemical_num_edges, chemical_avg_path_lengths = {}, {}
    chemical_rec, chemical_tfs = {}, {}
    chemical_net_rec, chemical_net_tfs = {}, {}
    chemical_hits, chemical_nonhits = {}, {}
    chemical_net_hits, chemical_net_nonhits = {}, {}
    chemical_inter_hits, chemical_inter_nonhits = {}, {}
    chemical_inter_net_hits, chemical_inter_net_nonhits = {}, {}

    for chemical in tqdm(chemicals):
        paths = t_utils.getPaths(paths_template % chemical, max_k=200, ties=True)
        net_prots = set()
        net_edges = set()
        path_lengths = []
        for path_str in paths:
            nodes = path_str.split('|')
            # path length is the number of edges in a path
            path_lengths.append(len(nodes) - 1)
            net_prots.update(nodes)
            net_edges.update(zip(nodes, nodes[1:]))
        chemical_prots[chemical] = len(net_prots)
        chemical_num_paths[chemical] = len(paths)
        chemical_avg_path_lengths[chemical] = np.mean(path_lengths)
        chemical_num_edges[chemical] = len(net_edges)

        rec = chem_rec[chemical]
        tfs = chem_tfs[chemical]
        chemical_rec[chemical] = len(rec)
        chemical_tfs[chemical] = len(tfs)
        chemical_net_rec[chemical] = len(net_prots.intersection(rec))
        chemical_net_tfs[chemical] = len(net_prots.intersection(tfs))

        # split the assayed proteins of this chemical into hits (1) and nonhits (0)
        hits = set()
        nonhits = set()
        for p, hit_val in chem_prot_hit_vals[chemical].items():
            if hit_val == 1:
                hits.add(p)
            if hit_val == 0:
                nonhits.add(p)
        net_hits = hits.intersection(net_prots)
        net_nonhits = nonhits.intersection(net_prots)
        chemical_hits[chemical] = len(hits)
        chemical_nonhits[chemical] = len(nonhits)
        chemical_net_hits[chemical] = len(net_hits)
        chemical_net_nonhits[chemical] = len(net_nonhits)

        # subtract the rec and tfs to get just the intermediate hits and nonhits
        rec_tfs = rec.union(tfs)
        chemical_inter_hits[chemical] = len(hits.difference(rec_tfs))
        chemical_inter_nonhits[chemical] = len(nonhits.difference(rec_tfs))
        chemical_inter_net_hits[chemical] = len(net_hits.difference(rec_tfs))
        chemical_inter_net_nonhits[chemical] = len(net_nonhits.difference(rec_tfs))

    # write these metrics to a file
    df = pd.DataFrame({
        "name": chemical_names,
        "prots": chemical_prots,
        "num_paths": chemical_num_paths,
        "pvals": chemical_pvals,
        "qvals": chemical_qvals,
        "num_edges": chemical_num_edges,
        "avg_path_lengths": chemical_avg_path_lengths,
        "net_rec": chemical_net_rec,
        "net_tfs": chemical_net_tfs,
        "hit_rec": chemical_rec,
        "hit_tfs": chemical_tfs,
        "net_hits": chemical_net_hits,
        "net_nonhits": chemical_net_nonhits,
        'hits': chemical_hits,
        'nonhits': chemical_nonhits,
        "inter_net_hits": chemical_inter_net_hits,
        "inter_net_nonhits": chemical_inter_net_nonhits,
        "inter_hits": chemical_inter_hits,
        "inter_nonhits": chemical_inter_nonhits,
    })
    print("Writing: ", summary_file)
    output_columns = [
        'name', 'prots', 'num_paths', 'num_edges', 'avg_path_lengths',
        'hits', 'nonhits', 'net_hits', 'net_nonhits',
        'hit_rec', 'hit_tfs', 'net_rec', 'net_tfs',
        'inter_net_hits', 'inter_net_nonhits', 'inter_hits', 'inter_nonhits',
        'pvals', 'qvals'
    ]
    df.to_csv(summary_file, header=True, columns=output_columns)
    return df
def splitRecTFsFamilyNodes(chemicals, version, interactome_file):
    """
    Split the rec/tf family nodes of the interactome into their member proteins.

    A family node is a node whose ID contains a comma. Every family node that
    contains a receptor or tf of any chemical is replaced by those members,
    and each edge of the family node is copied to the members. A split edge
    gets the maximum weight of all the edges that map onto it.

    Writes ``inputs/<version>/<version>-interactome.txt`` and
    ``inputs/<version>/family-split-rec-tfs.txt``, then calls
    ``addRecTFsFamilyNodes`` for the family nodes kept as in the ToxCast data.
    """
    # leave some nodes as family nodes as that's how they are in the toxcast data
    map_family_to_prot = {
        # FOS,JUN,FOSL1,FOSL2,JUNB,JUND,FOSB: FOS,JUN
        "P01100,P05412,P15407,P15408,P17275,P17535,P53539": ["P01100,P05412"],
        # FOS,JUN,SP1: FOS,JUN
        "P01100,P05412,P08047": ["P01100,P05412"],
        # FOS,JUN: FOS,JUN
        "P01100,P05412": ["P01100,P05412"],
        # TCF7,TCF7L1,TCF7L2,LEF1: TCF7,TCF7L1,TCF7L2,LEF1
        "P36402,Q9HCS4,Q9NQB0,Q9UJU2": ["P36402,Q9HCS4,Q9NQB0,Q9UJU2"],
        # FOXO3,FOXO4,FOXO1: FOXO3,FOXO4,FOXO1
        "O43524,P98177,Q12778": ["O43524,P98177,Q12778"],
    }
    rec_tfs_file = "inputs/%s/rec-tfs/%%s-rec-tfs.txt" % (version)
    interactomes_dir = "inputs/%s" % (version)
    t_utils.checkDir(interactomes_dir)
    new_interactome_file = "%s/%s-interactome.txt" % (interactomes_dir, version)

    # get the set of family nodes from the interactome
    print("Reading the interactome from %s" % (interactome_file))
    lines = utils.readColumns(interactome_file, 1, 2, 3)
    family_nodes = set()
    for U, V, w in lines:
        for N in (U, V):
            if len(N.split(',')) > 1:
                family_nodes.add(N)

    print(
        "Splitting the source/target family nodes of all chemicals in the interactome and writing to %s"
        % (new_interactome_file))
    # family node -> members to split it into, collected over all chemicals
    family_to_split = {}
    for chemical in tqdm(chemicals):
        rec, tfs = t_utils.getRecTFs(rec_tfs_file % (chemical))
        for N in family_nodes:
            for n in rec.union(tfs):
                if n in N:
                    family_to_split.setdefault(N, set()).add(n)
    # leave some tfs as family nodes because that's how they're listed in toxcast
    family_to_split.update(map_family_to_prot)

    split_rec = set()
    split_tfs = set()
    new_interactome = []
    all_new_edges = set()
    # it's a bit ad hoc because the weight of the family edge is the max of the individual edges,
    # and now the weight of a split edge is the max of the individual edges and the family edge.
    # there could be multiple family edges contributing to a single edge
    new_edge_weights = {}
    for U, V, w in lines:
        split_u = U in family_to_split
        split_v = V in family_to_split
        # leave the edge as it is if neither end is a family node to split
        if not split_u and not split_v:
            new_interactome.append((U, V, w))
            continue
        if split_u:
            split_rec.add(U)
        if split_v:
            split_tfs.add(V)
        sources = family_to_split[U] if split_u else [U]
        targets = family_to_split[V] if split_v else [V]
        new_edges = {(u, v) for u in sources for v in targets}
        all_new_edges.update(new_edges)
        # for now, the evidence is not written to the new networks to save on space;
        # it is present in the original interactome and the evidence file
        for edge in new_edges:
            new_edge_weights.setdefault(edge, set()).add(float(w))

    for u, v in all_new_edges:
        max_w = max(new_edge_weights[(u, v)])
        new_interactome.append((u, v, "%0.6f" % max_w))

    # now write the new interactome
    print("Writing the new interactome with rec/tf family nodes split to %s" % (new_interactome_file))
    interactome_text = '\n'.join(['\t'.join(line) for line in new_interactome])
    with open(new_interactome_file, 'w') as out:
        out.write(interactome_text + '\n')

    # also write the mapping from the rec/tf family node to the proteins it came from
    mapping = getUniprotToGeneMapping(version)
    out_file = "inputs/%s/family-split-rec-tfs.txt" % (version)
    print(
        "Writing a mapping of the split family rec/tfs and the protein hits they came from to: %s"
        % (out_file))
    rows = []
    for N in sorted(family_to_split):
        members = family_to_split[N]
        member_genes = '|'.join([mapping[n] for n in members])
        rows.append("%s\t%s\t%s\t%s" % (N, '|'.join(members), mapping[N], member_genes))
    with open(out_file, 'w') as out:
        out.write('\n'.join(rows) + '\n')
    print("A total of %d family nodes were split" % (len(family_to_split)))

    # add the zscore penalty to the few family nodes in the ToxCast data
    toxcast_family_nodes = [N[0] for N in map_family_to_prot.values()]
    addRecTFsFamilyNodes(chemicals, version, family_nodes=toxcast_family_nodes, costs=True)
def _write_random_rec_tfs(opts, chemicals, rec_tfs_file_template):
    """Write a randomly sampled set of receptors and tfs for each chemical.

    For every chemical, the same number of receptors and tfs as in the
    chemical's original rec/tfs file are drawn from the pool read from
    ``opts.permute_rec_tfs``. The cost and zscore of each original node
    is copied onto the random node that takes its place.

    Args:
        opts: parsed options; reads ``permute_rec_tfs``, ``verbose``,
            ``forced``, ``inputs_dir`` and ``out_dir``.
        chemicals: iterable of chemical IDs.
        rec_tfs_file_template: '%s'-style template giving the output file
            for a chemical.
    """
    all_rec, all_tfs = t_utils.getRecTFs(opts.permute_rec_tfs)
    # random.sample() needs a sequence; passing a set raises a TypeError on
    # Python >= 3.11. Presumably getRecTFs returns sets (the per-chemical
    # results below are also converted with list()) -- converting is
    # harmless if these are already lists.
    all_rec = list(all_rec)
    all_tfs = list(all_tfs)

    for chemical in tqdm(chemicals, disable=opts.verbose):
        out_file = rec_tfs_file_template % (chemical)
        # keep an existing file unless the user forces an overwrite
        if os.path.isfile(out_file) and not opts.forced:
            continue

        rec, tfs, costs, zscores = t_utils.getRecTFs(
            t_settings.REC_TFS_FILE % (opts.inputs_dir, chemical), costs=True)
        rec = list(rec)
        tfs = list(tfs)
        out_dir = "%s/%s" % (opts.out_dir, chemical)
        t_utils.checkDir(out_dir)

        # apply the costs of the original rec and tfs to the random ones
        random_rec = random.sample(all_rec, len(rec))
        for orig_node, rand_node in zip(rec, random_rec):
            costs[rand_node] = costs[orig_node]
            zscores[rand_node] = zscores[orig_node]
        random_tfs = random.sample(all_tfs, len(tfs))
        for orig_node, rand_node in zip(tfs, random_tfs):
            costs[rand_node] = costs[orig_node]
            zscores[rand_node] = zscores[orig_node]

        t_utils.writeRecTFs(out_file, random_rec, random_tfs,
                            costs=costs, zscores=zscores)


def _write_permuted_interactome(opts, permuted_network_out_file):
    """Permute the interactome in ``opts.interactome`` and write it to a file.

    The interactome is read as four columns (u, v, weight, directed flag).
    Exits via ``sys.exit()`` if no lines are read or if the directed flag
    of an edge is not recognized.

    Args:
        opts: parsed options; reads ``interactome``, ``undirected``,
            ``split_by_weight``, ``swap_phys_sig_sep`` and
            ``num_iterations``.
        permuted_network_out_file: path the permuted edge list is written to.
    """
    directed_labels = ("true", "t", "dir", "directed")
    undirected_labels = ("false", "f", "undir", "undirected")

    # don't log transform. The weights will be log transformed by the edgelinker code
    # UPDATE: 2017-12-07: use the direction of the edges from the fourth column
    # of the interactome instead of splitting based on if the edge is bidirected or not
    G = nx.DiGraph()
    dir_edges = []
    undir_edges = []
    lines = utils.readColumns(opts.interactome, 1, 2, 3, 4)
    if len(lines) == 0:
        print(
            "ERROR: interactome should have 4 columns: a, b, w, and True/False for directed/undirected. Quitting"
        )
        sys.exit()
    for u, v, w, directed in lines:
        G.add_edge(u, v, weight=float(w))
        edge_type = directed.lower()
        if edge_type in directed_labels:
            dir_edges.append((u, v))
        elif edge_type not in undirected_labels:
            print(
                "ERROR: Unknown directed edge type '%s'. 4th column should be T/F to indicdate directed/undirected"
                % (edge_type))
            print("Quitting.")
            sys.exit()
        elif u < v:
            # only one orientation (u < v) of an undirected edge is stored
            undir_edges.append((u, v))

    if opts.undirected:
        # swap all edges as undirected edges
        permG = permute_network.permute_network(
            G.to_undirected(), num_iterations=opts.num_iterations)
        permG = permG.to_directed()
    elif opts.split_by_weight:
        # split the edges into bins by weight and swap the directed and
        # undirected edges separately if specified by the user
        permG = permute_network.permute_network(
            G,
            swap_phys_sig_sep=opts.swap_phys_sig_sep,
            split_weight=opts.split_by_weight,
            num_iterations=opts.num_iterations)
    elif opts.swap_phys_sig_sep:
        # swap the directed and undirected edges separately
        permG = permute_network.permute_network(
            G,
            swap_phys_sig_sep=opts.swap_phys_sig_sep,
            num_iterations=opts.num_iterations,
            edge_lists=(undir_edges, dir_edges))
    else:
        # if none of the options are specified, then swap everything as directed edges
        permG = permute_network.permute_network(
            G, num_iterations=opts.num_iterations)

    print("Writing %s" % (permuted_network_out_file))
    nx.write_weighted_edgelist(permG,
                               permuted_network_out_file,
                               comments='#',
                               delimiter='\t')


def permute_and_run_edgelinker(opts, random_index):
    """Build one random instance and run edgelinker on every chemical.

    Depending on ``opts.permute_rec_tfs``, either random sets of rec/tfs
    are written for each chemical (and the original interactome is used),
    or the interactome itself is permuted. Edgelinker is then run once for
    all chemicals, the score counts are optionally written, and the
    generated files are optionally deleted.

    Args:
        opts: parsed options. Attributes read here: ``write_score_counts``,
            ``forced``, ``out_dir``, ``inputs_dir``, ``single_chem``,
            ``permute_rec_tfs``, ``verbose``, ``interactome``, ``k`` and
            ``cleanup`` (plus those read by the permutation helpers).
        random_index: integer index of this random instance; used in all
            of the generated file names.

    Returns:
        None. Returns early if the score counts file for ``random_index``
        already exists (and ``opts.forced`` is not set) or if the
        chemical-k-median-scores file is missing.
    """
    if opts.write_score_counts:
        rand_scores_k = "%s/rand-networks/rand-%d-med-scores-k.txt" % (
            opts.write_score_counts, random_index)
        # if the final score counts file already exists, then don't do anything
        if os.path.isfile(rand_scores_k) and not opts.forced:
            print("%s already exists. Skipping." % (rand_scores_k))
            return
        chemical_k_scores = "%s/chemical-k-median-scores.txt" % (
            opts.write_score_counts)
        if not os.path.isfile(chemical_k_scores):
            print(
                "Error: %s does not exist. Run compute_stat_sig.py with the --write-counts option to write it. Quitting"
                % (chemical_k_scores))
            return

    t_utils.checkDir("%s/networks" % (opts.out_dir))
    rec_tfs_file_template = "%s/rec-tfs/%%s-rec-tfs.txt" % (opts.inputs_dir)
    chemicals = sorted(
        utils.readItemList("%s/chemicals.txt" % opts.inputs_dir, col=1))
    if opts.single_chem:
        chemicals = opts.single_chem

    if opts.permute_rec_tfs is not None:
        # if specified, "permute" the sets of receptors and tfs for each
        # chemical instead of the interactome
        print("Writing random sets of rec/tfs for each chemical to %s" %
              (opts.out_dir))
        rec_tfs_file_template = "%s/%%s/%d-random-rec-tfs.txt" % (
            opts.out_dir, random_index)
        _write_random_rec_tfs(opts, chemicals, rec_tfs_file_template)
        # use the original interactome
        permuted_network_out_file = opts.interactome
        print("Using the original interactome %s" %
              (permuted_network_out_file))
    else:
        # default is to permute the interactome
        permuted_network_out_file = '%s/networks/permuted-network%d.txt' % (
            opts.out_dir, random_index)
        if not os.path.isfile(permuted_network_out_file) or opts.forced:
            _write_permuted_interactome(opts, permuted_network_out_file)
        else:
            print("Using %s" % (permuted_network_out_file))

    # now run edgelinker on each of the chemicals using the permuted network
    # TODO fix this: if version is netpath/kegg (PATHLINKERDATAVERSIONS),
    # a different type of input file ("%s/rec-tfs/%%s-nodes.txt") is needed
    in_files = []
    out_files = []
    for chemical in tqdm(chemicals, disable=opts.verbose):
        rec_tfs_file = rec_tfs_file_template % (chemical)
        in_files.append(os.path.abspath(rec_tfs_file))
        out_dir = "%s/%s" % (opts.out_dir, chemical)
        t_utils.checkDir(out_dir)
        out_pref = "%s/%d-random" % (out_dir, random_index)
        out_files.append(os.path.abspath(out_pref))

    # python implementation of edgelinker is taking too long, so the java
    # implementation is run below.
    # write the in and out files to the networks dir
    edgelinker_in_files = '%s/networks/permuted-network%d-infiles.txt' % (
        opts.out_dir, random_index)
    with open(edgelinker_in_files, 'w') as out:
        out.write('\n'.join(in_files))
    edgelinker_out_files = '%s/networks/permuted-network%d-outfiles.txt' % (
        opts.out_dir, random_index)
    with open(edgelinker_out_files, 'w') as out:
        out.write('\n'.join(out_files))

    # NOTE(review): `chemical` and `out_pref` are left over from the last
    # iteration of the loop above, although the run covers every chemical.
    print("Running edgelinker on chemical %s: %s" % (chemical, out_pref))
    # pass the files just written (the previous `cyclinker_*` names were
    # never defined in this function)
    run_edgelinker.runEdgeLinker(permuted_network_out_file,
                                 edgelinker_in_files,
                                 edgelinker_out_files,
                                 opts.k,
                                 edge_penalty=EDGE_PENALTY,
                                 rec_tfs_penalty=REC_TFS_PENALTY,
                                 multi_run=True)

    if opts.write_score_counts:
        # now that edgelinker has been run on all of the chemical sources/targets,
        # get the path counts for the chemical network's path scores.
        # compute_stat_sig is called directly rather than through a subprocess.
        print(
            "Writing the counts for each of the scores for random index: '%d'"
            % (random_index))
        stat_sig = compute_stat_sig.StatSig(random_paths_dir=opts.out_dir,
                                            k_limit=opts.k,
                                            num_random=(random_index,
                                                        random_index),
                                            out_dir=opts.write_score_counts)
        stat_sig.write_rand_counts(chemicals=chemicals, forced=opts.forced)

    if opts.cleanup:
        print(
            "Deleting the generated permuted network and the edgelinker output files"
        )
        # never delete the original interactome (used when permuting rec/tfs)
        if permuted_network_out_file != opts.interactome:
            os.remove(permuted_network_out_file)
        os.remove(edgelinker_in_files)
        # remove the individual output files
        for out_prefix in out_files:
            os.remove(out_prefix + "-paths.txt")
            os.remove(out_prefix + "-ranked-edges.txt")
        os.remove(edgelinker_out_files)