def tcell_read_metabolomics_frames():
    """Read the cached metabolomics table and derive a phenotype frame.

    Reads the processed metabolomics TSV (regenerating it via
    ``tcell_read_metabolomics_data`` on a cache miss), decodes each raw
    column name into activation time, dish and replicate, and renames the
    columns to ``dish_time_rep``.

    Returns
    -------
    (phenotype_df, metabolomics_df) : tuple of pd.DataFrame
        ``phenotype_df`` holds a single "Time" row (hours; -1 marks
        non-activated samples) under the same renamed columns as
        ``metabolomics_df``.

    Raises
    ------
    ValueError
        If a column name matches none of the known time patterns.
    """
    proc_tcell_t = cache.TsvFileTracker(
        os.path.join(cache.get_cache_path(), metabolite_expression_name + ".tsv.gz"),
        tcell_read_metabolomics_data)
    metabolomics_df = proc_tcell_t.read_file()
    # Substring -> activation time in hours. Tested in order, matching the
    # original if/elif chain; "non act" must precede the other patterns.
    time_patterns = [
        ("non act", -1.),
        ("ON", 0.),
        ("act 3h", 3.),
        ("act 12h", 12.),
        ("act 14h", 14.),
        ("act 24h", 24.),
        ("act 2d", 48.),
        ("act 3d", 72.),
        ("act 4d", 96.),
    ]
    values, cols = [], []
    for coln in metabolomics_df.columns:
        for pattern, hours in time_patterns:
            if pattern in coln:
                time = hours
                break
        else:
            # Fix: the original only printed the unmatched name and then fell
            # through, reusing the previous column's time (or raising a
            # NameError on the first column). Fail loudly instead.
            raise ValueError("Unrecognized metabolomics column name: " + coln)
        # Column names look like "<dish>-...-<rep>"; spaces in the replicate
        # field are stripped — TODO confirm against the raw sheet headers.
        dish = coln.split('-')[0]
        rep = coln.split('-')[2].replace(" ", "")
        values += [time]
        cols += ['_'.join([dish, str(int(time)), rep])]
    phenotype_df = pd.DataFrame(columns=cols, data=[values], index=["Time"])
    metabolomics_df.columns = cols
    return phenotype_df, metabolomics_df
def generate_reactome_sunburst(values_df, sb_configuration = conf_human):
    """Render *values_df* as a Reactome sunburst and serve it locally.

    Writes the sunburst JSON plus the static viewer assets into the cache
    directory, then starts the bundled web server on that directory.

    Parameters
    ----------
    values_df : pd.DataFrame
        Values to visualize, passed to ``generate_reactome_sunburst_json``.
    sb_configuration
        Sunburst configuration; defaults to the human configuration.
    """
    out_dir = cache.get_cache_path()
    sunburst = generate_reactome_sunburst_json(values_df, sb_configuration)
    write_json(sunburst, os.path.join(out_dir, 'results.json'))
    # The HTML/JS viewer files ship alongside the run_server module.
    assets_dir = os.path.dirname(inspect.getsourcefile(run_server))
    for asset in ("index.html", "breadcrumb.js"):
        shutil.copy(os.path.join(assets_dir, asset), out_dir)
    run_server.run_sunburst(path=out_dir)
def tcell_read_proteomics_data():
    """Download the proteomics supplementary Excel file and reshape it.

    Fetches (and caches) the publication's Excel workbook, reads the "Data"
    sheet, centers every sample column on its mean, converts the values out
    of log2 space, and expands grouped protein IDs to one row per proteoform.

    Returns
    -------
    pd.DataFrame indexed by "ProteinID".
    """
    xls_cache = cache.UrlFileCache(
        os.path.join(cache.get_cache_path(), protein_expression_name + ".xlsx"),
        proteomics_data_url)
    # NOTE(review): usecols "A,V:AM" would select the alternative replicate
    # set from the same sheet.
    proteomics_df = pd.read_excel(xls_cache.get_file_name(), sheet_name="Data",
                                  index_col=0, usecols="A,D:U")
    proteomics_df = proteomics_df - proteomics_df.mean()  # center each column on its mean
    proteomics_df = proteomics_df.apply(np.exp2)  # sheet stores log2 values
    proteomics_df = (proteomics_df
                     .groupby("Protein IDs", group_keys=False)
                     .apply(one_row_per_proteoform)
                     .reset_index(drop=True))
    proteomics_df.set_index("ProteinID", inplace=True)
    return proteomics_df
def get_reactome_df(organism = "HSA", gene_anot = "Ensembl"):
    """Fetch the Reactome gene-to-pathway mapping for one organism.

    Parameters
    ----------
    organism : str
        Reactome organism code without the "R-" prefix (e.g. "HSA").
    gene_anot : str
        Gene annotation scheme used as the remote file-name prefix.

    Returns
    -------
    pd.DataFrame
        Columns "gene", "reactome_id", "reactome_name", restricted to
        reactome ids starting with "R-<organism>".
    """
    file_name = gene_anot + reactome_fn
    local_path = os.path.join(cache.get_cache_path(), file_name)
    remote_url = reactome_url + file_name
    mapping = pd.read_csv(cache.download_file(local_path, remote_url),
                          sep='\t', header=None, usecols=[0, 1, 3],
                          names=["gene", "reactome_id", "reactome_name"])
    prefix = "R-" + organism
    return mapping[mapping["reactome_id"].str.startswith(prefix)]
def tcell_read_metabolomics_data():
    """Download the metabolomics supplementary Excel file and reshape it.

    Fetches (and caches) the publication's Excel workbook, reads the
    "normalized by sample mean" sheet, averages technical replicates,
    converts the values out of log2 space, and maps KEGG compound ids to
    ChEBI via the KEGG web service. Returns a DataFrame indexed by
    "MetaboliteID".
    """
    tcell_metabol_xls = cache.UrlFileCache(os.path.join(cache.get_cache_path(), metabolite_expression_name + ".xlsx"), metabolomics_data_url)
    metabolomics_df = pd.read_excel(tcell_metabol_xls.get_file_name(), sheet_name = "normalized by sample mean", index_col=0, usecols="A,C:HN", skiprows = [0])
    #metabolomics_df = pd.read_excel(tcell_metabol_xls.get_file_name(), sheet_name = "normalized by sample mean", index_col=0, usecols="A,C:HN", skiprows = [0])
    # Average technical replicates: pandas names a duplicate column "X.1";
    # fold it into "X" with the geometric mean, then drop it. Iteration is
    # over the pre-drop Index object, so the in-place drop is safe here.
    for col in metabolomics_df.columns:
        if len(col.split('.'))>1 and col.split('.')[1] == "1":
            remcol = col.split('.')[0]
            metabolomics_df[remcol] = scipy.stats.gmean(metabolomics_df[[remcol,col]],axis=1)
            metabolomics_df.drop(col, axis=1, inplace=True)
    metabolomics_df.index.name = "KEGG_ID"
    metabolomics_df = metabolomics_df.apply(np.exp2)  # sheet stores log2 values
    # Query the KEGG web service for the compound -> ChEBI id mapping,
    # then expand/convert rows per compound and re-index by metabolite id.
    k = KEGG(verbose=False)
    map_kegg_chebi = k.conv("chebi", "compound")
    metabolomics_df = metabolomics_df.groupby("KEGG_ID", group_keys=False).apply(lambda x: one_row_per_compound_convert(x, map_kegg_chebi)).reset_index(drop=True)
    metabolomics_df.set_index("MetaboliteID", inplace=True)
    return metabolomics_df
def tcell_read_proteomics_frames():
    """Read the cached proteomics table and derive a phenotype frame.

    Reads the processed proteomics TSV (regenerating it via
    ``tcell_read_proteomics_data`` on a cache miss), decodes each raw column
    name into activation time, dish and replicate, and renames the columns
    to ``dish_time_rep``.

    Returns
    -------
    (phenotype_df, proteomics_df) : tuple of pd.DataFrame
        ``phenotype_df`` holds a single "Time" row (hours) under the same
        renamed columns as ``proteomics_df``.

    Raises
    ------
    ValueError
        If a column name matches none of the known time patterns.
    """
    proc_tcell_t = cache.TsvFileTracker(
        os.path.join(cache.get_cache_path(), protein_expression_name + ".tsv.gz"),
        tcell_read_proteomics_data)
    proteomics_df = proc_tcell_t.read_file()
    # Substring -> activation time in hours, tested in the original order.
    time_patterns = [
        ("notact", 0.),
        ("act12h", 12.),
        ("act24h", 24.),
        ("act48h", 48.),
        ("act72h", 72.),
        ("act96h", 96.),
    ]
    values, cols = [], []
    for coln in proteomics_df.columns:
        for pattern, hours in time_patterns:
            if pattern in coln:
                time = hours
                break
        else:
            # Fix: the original only printed the unmatched name and then fell
            # through, reusing the previous column's time (or raising a
            # NameError on the first column). Fail loudly instead.
            raise ValueError("Unrecognized proteomics column name: " + coln)
        rep = int(coln.split('_')[3].replace(" ", ""))
        # Decode the running sample number into culture dish and replicate.
        # The mapping is irregular; it is kept exactly as written — TODO
        # confirm against the publication's sample table.
        if rep < 18:
            dish = int(rep / 5) + 2
            rep = rep % 5 + 1
        elif rep < 26:
            dish = rep % 3 + 2
            rep = 3
        elif rep < 31:
            dish = (rep - 1) % 3 + 2
            rep = 3
        else:
            dish = (rep - 2) % 3 + 2
            rep = 3
        values += [time]
        cols += ['_'.join([str(dish), str(int(time)), str(rep)])]
    proteomics_df.columns = cols
    phenotype_df = pd.DataFrame(columns=cols, data=[values], index=["Time"])
    return phenotype_df, proteomics_df
def generate_reactome_tree(sb_configuration = get_conf_human()):
    """Build the Reactome pathway tree from the pathway-relation file.

    Downloads (and caches) the Reactome relation file, then parses it into
    a tree and returns the root node.
    """
    # Name the cached copy after the last path component of the URL.
    relation_file = os.path.basename(urlparse(url_to_reactome_relation_file).path)
    cached_path = os.path.join(cache.get_cache_path(), relation_file)
    local_copy = cache.download_file(cached_path, url_to_reactome_relation_file)
    return generate_root_node(local_copy, sb_configuration)