def get_TCGA_cnv(self, input_list):
    """Fetch copy-number segment values for the requested genes.

    Args:
        input_list: identifiers to look up. They are resolved through
            ``self.id_mapper_to_entrez`` against the hg38 position table and
            are also used as the column names of the result.

    Returns:
        A float ``DataFrame`` with one column per entry of ``input_list``.
        Rows are the cohort samples returned by ``xena.cohort_samples``;
        samples with no value for a gene hold ``NaN``.
    """
    samples = xena.cohort_samples(self.host, self.cohort, None)  # all samples of the cohort

    #### Create ID mapper
    pos_df = pd.read_csv('TCGAlib/dataset/position_info_hg38.csv', index_col=0)

    #### ID mapping: the mapper must return a frame with 'chr', 'start', 'end'
    id_map = self.id_mapper_to_entrez(pos_df, input_list)
    id_map['chr'] = id_map['chr'].apply(lambda x: 'chr' + x)
    # Series.astype returns a new object; the result has to be assigned back,
    # otherwise the coordinates keep whatever dtype the CSV produced.
    id_map['start'] = id_map['start'].astype(int)
    id_map['end'] = id_map['end'].astype(int)

    #### Retrieve values from Xena, one range query per mapped row
    values = [
        xena.segmented_data_range(
            self.host,
            self.cnv_dataset,
            samples,
            id_map['chr'].loc[el],
            id_map['start'].loc[el],
            id_map['end'].loc[el],
        )
        for el in id_map.index.tolist()
    ]

    # NOTE(review): values[i] is paired with input_list[i] by position, which
    # assumes id_map rows come back in input_list order -- confirm in the mapper.
    cnv_arr = []
    for i, item in enumerate(input_list):
        rows = values[i]['rows']
        temp_df = pd.DataFrame(data=rows['value'], columns=[item], index=rows['sampleID'])
        if not temp_df.empty:
            # Xena can return the same sample more than once; keep the first hit.
            temp_df = temp_df[~temp_df.index.duplicated(keep='first')]
            # reindex (rather than .loc[samples]) so that samples missing from
            # the response become NaN instead of raising KeyError.
            temp_df = temp_df.reindex(samples)
        cnv_arr.append(temp_df)

    # concat also handles the single-gene case, so no special branch is needed.
    return pd.concat(cnv_arr, axis=1).astype(float)
def tutorial_main():
    """Run the example survival analysis for ITGB5 in the TCGA CHOL cohort.

    Fetches the cohort samples from the TCGA hub, loads the expression of the
    selected gene and the overall-survival phenotypes, groups the samples by
    expression quartiles and prints the result of
    ``multivariate_logrank_test`` together with its p-value and the number of
    rows that went into the test.

    Nothing is returned; every intermediate result is printed.
    """
    ## TCGA Hub Data
    host = xena.PUBLIC_HUBS['tcgaHub']
    cohort = 'TCGA Bile Duct Cancer (CHOL)'
    cohort_symbol = 'CHOL'
    samples = xena.cohort_samples(host, cohort, None)

    ## mode = 'all' , 'selected'
    mode = 'selected'
    genes = ['ITGB5']
    gene = genes[0]
    rna_expressions_by_genes = get_TCGA_rna_expressions(host, samples, cohort_symbol, mode, genes)
    print(rna_expressions_by_genes[gene])

    # Overall Survival data
    os_phenotypes = get_TCGA_OS_phenotypes(host, samples, cohort_symbol)
    os_result = clean_samples_for_tumour(rna_expressions_by_genes[gene], os_phenotypes)

    # Shown for illustration only: the frame used below is built by
    # make_df_for_kaplan, which receives the grouping mode itself.
    os_groups = get_TCGA_os_groups(os_result['expression'], os_group_type='quartiles')
    print(os_groups)

    # Alternative grouping: make_df_for_kaplan(os_result, 'median')
    os_df = make_df_for_kaplan(os_result, 'quartiles')
    print(os_df.head())

    results = multivariate_logrank_test(os_df['os_time'], os_df['os_group'], os_df['os_event'])
    print(results)
    print(results.p_value)
    print(len(os_df))
def kaplan_meier_a_gene_all_TCGA(genes, grouping_mode, result_dir):
    """Compute the log-rank p-value of one gene in every TCGA cohort.

    Args:
        genes: list of gene names. The whole list is passed to
            ``get_TCGA_rna_expressions``, but only ``genes[0]`` is analysed
            and used in the output file name.
        grouping_mode: grouping option forwarded to ``make_df_for_kaplan``.
        result_dir: directory that receives
            ``<genes[0]>_kaplan_meier_all_TCGA.tsv``.

    The output is a tab-separated table indexed by cohort name with the
    p-value of ``multivariate_logrank_test`` for each cohort.
    """
    ## TCGA Hub Data
    host = xena.PUBLIC_HUBS['tcgaHub']
    excludeTypes = ['probeMap', 'probemap', 'genePredExt']

    # Filter instead of list.remove(): remove() raises ValueError as soon as
    # the hub no longer lists one of these cohorts.
    skipped_cohorts = {
        'TCGA Pan-Cancer (PANCAN)',
        'TCGA Formalin Fixed Paraffin-Embedded Pilot Phase II (FPPP)',
    }
    cohort_list = [
        cohort
        for cohort in xena.all_cohorts(host, excludeTypes)
        if cohort not in skipped_cohorts
    ]

    tcga_os_kaplan_result_dict = dict()
    for cohort in cohort_list:
        # The symbol is the text inside the trailing parentheses, e.g. 'CHOL'.
        cohort_symbol = cohort.split('(')[1][:-1]
        samples = xena.cohort_samples(host, cohort, None)

        ## mode = 'all' , 'selected'
        mode = 'selected'
        rna_expressions_by_genes = get_TCGA_rna_expressions(host, samples, cohort_symbol, mode, genes)

        # Overall Survival data
        os_phenotypes = get_TCGA_OS_phenotypes(host, samples, cohort_symbol)

        # remove NaN samples, and select only tumour samples
        os_result = clean_samples_for_tumour(rna_expressions_by_genes[genes[0]], os_phenotypes)
        os_df = make_df_for_kaplan(os_result, grouping_mode)

        results = multivariate_logrank_test(os_df['os_time'], os_df['os_group'], os_df['os_event'])
        tcga_os_kaplan_result_dict[cohort] = results.p_value

    tcga_os_kaplan_result_df = pd.DataFrame.from_dict(tcga_os_kaplan_result_dict, orient='index')
    tcga_os_kaplan_result_df.to_csv("{}/{}_kaplan_meier_all_TCGA.tsv".format(result_dir, genes[0]), sep='\t')
def kaplan_meier_all_gene_a_TCGA(cohort, grouping_mode, result_dir):
    """Compute the log-rank p-value of every gene in one TCGA cohort.

    Args:
        cohort: full cohort name ending in the parenthesised symbol,
            e.g. ``'TCGA Bile Duct Cancer (CHOL)'``.
        grouping_mode: grouping option forwarded to ``make_df_for_kaplan``.
        result_dir: directory that receives
            ``<symbol>_kaplan_meier_all_genes.tsv``.

    Every gene/p-value pair is printed, then written as a tab-separated table
    indexed by gene.
    """
    import copy

    ## TCGA Hub Data
    host = xena.PUBLIC_HUBS['tcgaHub']
    # The symbol is the text inside the trailing parentheses of the cohort name.
    cohort_symbol = cohort.split('(')[1][:-1]
    rna_seq_dataset = "TCGA.{}.sampleMap/HiSeqV2_PANCAN".format(cohort_symbol)
    all_genes = xena.dataset_field(host, rna_seq_dataset)  ## all gene sets

    samples = xena.cohort_samples(host, cohort, None)
    mode = 'all'
    rna_expressions_by_genes = get_TCGA_rna_expressions(host, samples, cohort_symbol, mode)

    # Overall Survival data. The arguments do not depend on the gene, so the
    # fetch is done once here instead of once per gene inside the loop.
    os_phenotypes = get_TCGA_OS_phenotypes(host, samples, cohort_symbol)

    tcga_os_kaplan_result_dict = dict()
    for gene in all_genes:
        print("cohort , gene", cohort, gene)

        # remove NaN samples, and select only tumour samples.
        # A fresh copy is handed over each time in case the helper modifies
        # its argument in place; the original refetched it for every gene.
        os_result = clean_samples_for_tumour(rna_expressions_by_genes[gene], copy.deepcopy(os_phenotypes))
        os_df = make_df_for_kaplan(os_result, grouping_mode)

        results = multivariate_logrank_test(os_df['os_time'], os_df['os_group'], os_df['os_event'])
        tcga_os_kaplan_result_dict[gene] = results.p_value

    for key, value in tcga_os_kaplan_result_dict.items():
        print(key, value)

    tcga_os_kaplan_result_df = pd.DataFrame.from_dict(tcga_os_kaplan_result_dict, orient='index')
    tcga_os_kaplan_result_df.to_csv("{}/{}_kaplan_meier_all_genes.tsv".format(result_dir, cohort_symbol),sep='\t')
def get_TCGA_expr(self, input_list):
    """Fetch expression values for the requested genes.

    Args:
        input_list: identifiers to look up; they are resolved through
            ``self.id_mapper_to_entrez`` and become the column labels of the
            result.

    Returns:
        A float ``DataFrame`` with the cohort samples as rows and the entries
        of ``input_list`` as columns.
    """
    # NOTE(review): the cohort is passed wrapped in a list here, while the
    # sibling CNV/mutation methods pass self.cohort directly -- confirm which
    # form xena.cohort_samples expects.
    cohort_samples = xena.cohort_samples(self.host, [self.cohort], None)

    # Lookup table: versioned dataset field name, indexed by the same name
    # with everything after the first '.' stripped.
    dataset_fields = xena.dataset_field(self.host, self.expr_dataset)
    transcript_table = pd.DataFrame(data=dataset_fields, columns=['EnsemblID'])
    transcript_table['EnsemblID_edit'] = [
        field.split('.')[0] for field in transcript_table['EnsemblID']
    ]
    transcript_table = transcript_table.set_index('EnsemblID_edit')

    # Resolve the requested identifiers to the dataset's own field names.
    id_map = self.id_mapper_to_entrez(transcript_table, input_list)
    probe_names = id_map['EnsemblID'].tolist()

    # dataset_fetch returns a list of lists, laid out here as one row per
    # requested field and one column per sample.
    fetched = xena.dataset_fetch(self.host, self.expr_dataset, cohort_samples, probe_names)
    by_gene = pd.DataFrame(data=fetched, index=input_list, columns=cohort_samples)

    # Flip to samples x genes before casting.
    return by_gene.transpose().astype(float)
def get_TCGA_mut(self, input_list):
    """Fetch mutation calls for the requested genes.

    Args:
        input_list: identifiers to look up; they are resolved through
            ``self.id_mapper_to_entrez`` against the hg38 position table.

    Returns:
        A list with one ``DataFrame`` per row of the ID map. Each frame is
        indexed by the ``sampleID`` values of the response and has two
        columns, ``<id>_AAC`` (the ``amino-acid`` field) and ``<id>_Effect``
        (the ``effect`` field), where ``<id>`` is the ID-map index label.
    """
    samples = xena.cohort_samples(self.host, self.cohort, None)  # all samples of the cohort

    #### Create ID mapper
    pos_df = pd.read_csv('TCGAlib/dataset/position_info_hg38.csv', index_col=0)

    #### ID mapping: the mapper must return a frame with 'chr', 'start', 'end'
    id_map = self.id_mapper_to_entrez(pos_df, input_list)
    id_map['chr'] = id_map['chr'].apply(lambda x: 'chr' + x)
    # Series.astype returns a new object; the result has to be assigned back,
    # otherwise the coordinates keep whatever dtype the CSV produced.
    id_map['start'] = id_map['start'].astype(int)
    id_map['end'] = id_map['end'].astype(int)

    #### Retrieve values from Xena, one range query per mapped row
    gene_ids = id_map.index.tolist()
    values = [
        xena.sparse_data_range(
            self.host,
            self.mut_dataset,
            samples,
            id_map['chr'].loc[el],
            id_map['start'].loc[el],
            id_map['end'].loc[el],
        )
        for el in gene_ids
    ]

    result_arr = []
    for item, response in zip(gene_ids, values):
        rows = response['rows']
        # Built as 2 x n_calls, then transposed so samples become the index.
        per_gene = pd.DataFrame(
            data=[rows['amino-acid'], rows['effect']],
            columns=rows['sampleID'],
            index=[item + '_AAC', item + '_Effect'],
        ).T
        result_arr.append(per_gene)
    return result_arr