示例#1
0
	def get_TCGA_cnv(self, input_list):
		"""Fetch per-sample copy-number values for the requested genes.

		Every entry of ``input_list`` is mapped to a genomic interval by
		``self.id_mapper_to_entrez`` using the local hg38 position table, and
		the segmented data overlapping that interval is requested from
		``self.cnv_dataset`` on ``self.host``.

		Args:
			input_list: Identifiers understood by ``self.id_mapper_to_entrez``;
				each one is used as a column label of the result.

		Returns:
			A float ``DataFrame`` indexed by the cohort samples with one column
			per identifier for which the hub returned at least one row.
			Samples without a value hold NaN. When no identifier returned any
			row the frame has the sample index and no columns.
		"""
		samples = xena.cohort_samples(self.host, self.cohort, None)  # all samples of the cohort

		#### Create ID mapper
		pos_df = pd.read_csv('TCGAlib/dataset/position_info_hg38.csv', index_col=0)

		#### ID mapping (copied so the column rewrites below stay local to this call)
		id_map = self.id_mapper_to_entrez(pos_df, input_list).copy()

		#### Build the interval columns used in the range queries
		id_map['chr'] = id_map['chr'].apply(lambda x: 'chr' + x)
		# astype returns a new Series, so the result has to be stored to take effect.
		id_map['start'] = id_map['start'].astype(int)
		id_map['end'] = id_map['end'].astype(int)

		#### Retrieve values from Xena, one range query per mapped row
		values = [
			xena.segmented_data_range(self.host, self.cnv_dataset, samples, chrom, start, end)
			for chrom, start, end in zip(id_map['chr'], id_map['start'], id_map['end'])
		]

		cnv_arr = []
		for i, item in enumerate(input_list):
			rows = values[i]['rows']
			temp_df = pd.DataFrame(data=rows['value'], columns=[item], index=rows['sampleID'])
			if temp_df.empty:
				continue

			# The hub can return the same sample more than once; keep the first
			# occurrence. reindex (unlike .loc) tolerates samples that have no
			# segment in this interval and fills them with NaN.
			temp_df = temp_df[~temp_df.index.duplicated(keep='first')].reindex(samples)
			cnv_arr.append(temp_df)

		if not cnv_arr:
			# Nothing came back for any identifier: return an empty float frame
			# instead of failing on cnv_arr[0] / pd.concat([]).
			return pd.DataFrame(index=samples, dtype=float)

		return pd.concat(cnv_arr, axis=1).astype(float)
示例#2
0
def tutorial_main():
    """Run a sample overall-survival log-rank analysis for one gene.

    Uses the TCGA hub, the 'TCGA Bile Duct Cancer (CHOL)' cohort and the gene
    'ITGB5'. Samples are grouped by expression quartiles and compared with
    ``multivariate_logrank_test``. Intermediate and final results are printed;
    nothing is returned.
    """
    ## TCGA Hub  Data
    host = xena.PUBLIC_HUBS['tcgaHub']

    cohort = 'TCGA Bile Duct Cancer (CHOL)'
    cohort_symbol = 'CHOL'

    samples = xena.cohort_samples(host, cohort, None)

    ## mode = 'all' , 'selected'
    mode = 'selected'
    genes = ['ITGB5']
    gene = genes[0]

    rna_expressions_by_genes = get_TCGA_rna_expressions(host, samples, cohort_symbol, mode, genes)
    print(rna_expressions_by_genes[gene])

    # Overall Survival data
    os_phenotypes = get_TCGA_OS_phenotypes(host, samples, cohort_symbol)
    os_result = clean_samples_for_tumour(rna_expressions_by_genes[gene], os_phenotypes)

    # Printed for inspection only; the frame used for the test is built by
    # make_df_for_kaplan below.
    os_groups = get_TCGA_os_groups(os_result['expression'], os_group_type='quartiles')
    print(os_groups)

    os_df = make_df_for_kaplan(os_result, 'quartiles')

    print(os_df.head())
    results = multivariate_logrank_test(os_df['os_time'], os_df['os_group'], os_df['os_event'])
    print(results)
    print(results.p_value)
    print(len(os_df))
示例#3
0
def kaplan_meier_a_gene_all_TCGA(genes, grouping_mode, result_dir):
    """Log-rank test of one gene's expression groups in every TCGA hub cohort.

    For each cohort listed by the TCGA hub (except the pan-cancer and FPPP
    cohorts) the expression of ``genes[0]`` is fetched, samples are cleaned
    with ``clean_samples_for_tumour``, grouped with ``make_df_for_kaplan`` and
    compared with ``multivariate_logrank_test``.

    Args:
        genes: Genes passed to ``get_TCGA_rna_expressions``; only the first
            one is analysed and used in the output file name.
        grouping_mode: Grouping argument forwarded to ``make_df_for_kaplan``.
        result_dir: Directory that receives
            ``<genes[0]>_kaplan_meier_all_TCGA.tsv``.

    Raises:
        IndexError: If ``genes`` is empty.
    """
    # Read the analysed gene first so an empty list fails before any request.
    gene = genes[0]

    ## TCGA Hub  Data
    host = xena.PUBLIC_HUBS['tcgaHub']
    excludeTypes = ['probeMap', 'probemap', 'genePredExt']

    # Filtering (rather than list.remove) keeps working if the hub ever stops
    # listing one of these cohorts.
    skipped_cohorts = {
        'TCGA Pan-Cancer (PANCAN)',
        'TCGA Formalin Fixed Paraffin-Embedded Pilot Phase II (FPPP)',
    }
    cohort_list = [
        name for name in xena.all_cohorts(host, excludeTypes)
        if name not in skipped_cohorts
    ]

    ## mode = 'all' , 'selected'
    mode = 'selected'

    tcga_os_kaplan_result_dict = dict()

    for cohort in cohort_list:
        # Cohort names end in "(SYMBOL)"; take the text inside the parentheses.
        cohort_symbol = cohort.split('(')[1][:-1]
        samples = xena.cohort_samples(host, cohort, None)

        rna_expressions_by_genes = get_TCGA_rna_expressions(host, samples, cohort_symbol, mode, genes)

        # Overall Survival data
        os_phenotypes = get_TCGA_OS_phenotypes(host, samples, cohort_symbol)

        # remove NaN samples, and select only tumour samples
        os_result = clean_samples_for_tumour(rna_expressions_by_genes[gene], os_phenotypes)

        os_df = make_df_for_kaplan(os_result, grouping_mode)

        results = multivariate_logrank_test(os_df['os_time'], os_df['os_group'], os_df['os_event'])
        tcga_os_kaplan_result_dict[cohort] = results.p_value

    tcga_os_kaplan_result_df = pd.DataFrame.from_dict(tcga_os_kaplan_result_dict, orient='index')

    tcga_os_kaplan_result_df.to_csv("{}/{}_kaplan_meier_all_TCGA.tsv".format(result_dir, gene),
                                    sep='\t')
示例#4
0
def kaplan_meier_all_gene_a_TCGA(cohort, grouping_mode, result_dir):
    """Log-rank test of every gene's expression groups within one cohort.

    All fields of the cohort's ``HiSeqV2_PANCAN`` dataset are treated as
    genes. For each one, samples are cleaned with
    ``clean_samples_for_tumour``, grouped with ``make_df_for_kaplan`` and
    compared with ``multivariate_logrank_test``. Progress and the resulting
    p-values are printed, then written to a TSV file.

    Args:
        cohort: Cohort name of the form ``"... (SYMBOL)"``.
        grouping_mode: Grouping argument forwarded to ``make_df_for_kaplan``.
        result_dir: Directory that receives
            ``<SYMBOL>_kaplan_meier_all_genes.tsv``.
    """
    import copy

    ## TCGA Hub  Data
    host = xena.PUBLIC_HUBS['tcgaHub']

    # Cohort names end in "(SYMBOL)"; take the text inside the parentheses.
    cohort_symbol = cohort.split('(')[1][:-1]

    rna_seq_dataset = "TCGA.{}.sampleMap/HiSeqV2_PANCAN".format(cohort_symbol)
    all_genes = xena.dataset_field(host, rna_seq_dataset)  ## all gene sets

    samples = xena.cohort_samples(host, cohort, None)

    mode = 'all'
    rna_expressions_by_genes = get_TCGA_rna_expressions(host, samples, cohort_symbol, mode)

    # Overall Survival data. The arguments do not depend on the gene, so the
    # phenotypes are fetched once instead of once per gene.
    os_phenotypes = get_TCGA_OS_phenotypes(host, samples, cohort_symbol)

    tcga_os_kaplan_result_dict = dict()

    for gene in all_genes:
        print("cohort , gene", cohort, gene)

        # remove NaN samples, and select only tumour samples.
        # Each gene gets its own copy of the phenotypes so that any in-place
        # change made by clean_samples_for_tumour cannot leak into the next
        # iteration.
        os_result = clean_samples_for_tumour(rna_expressions_by_genes[gene],
                                             copy.deepcopy(os_phenotypes))

        os_df = make_df_for_kaplan(os_result, grouping_mode)

        results = multivariate_logrank_test(os_df['os_time'], os_df['os_group'], os_df['os_event'])
        tcga_os_kaplan_result_dict[gene] = results.p_value

    for key, value in tcga_os_kaplan_result_dict.items():
        print(key, value)

    tcga_os_kaplan_result_df = pd.DataFrame.from_dict(tcga_os_kaplan_result_dict, orient='index')

    tcga_os_kaplan_result_df.to_csv("{}/{}_kaplan_meier_all_genes.tsv".format(result_dir, cohort_symbol), sep='\t')
示例#5
0
	def get_TCGA_expr(self, input_list):
		"""Fetch expression values for the requested identifiers.

		The fields of ``self.expr_dataset`` are turned into a lookup table
		keyed by the part of each field name before the first '.', which
		``self.id_mapper_to_entrez`` uses to resolve ``input_list`` to dataset
		field names.

		Args:
			input_list: Identifiers understood by ``self.id_mapper_to_entrez``;
				they become the column labels of the result.

		Returns:
			A float ``DataFrame`` with one row per cohort sample and one column
			per entry of ``input_list``.
		"""
		# NOTE(review): the cohort is passed wrapped in a list here - confirm
		# this is the form cohort_samples expects.
		cohort_sample_ids = xena.cohort_samples(self.host, [self.cohort], None)

		#### Lookup table: truncated field name -> full field name in the dataset
		dataset_fields = xena.dataset_field(self.host, self.expr_dataset)
		ensembl_lookup = pd.DataFrame(data=dataset_fields, columns=['EnsemblID'])
		ensembl_lookup['EnsemblID_edit'] = ensembl_lookup['EnsemblID'].map(
			lambda full_id: full_id.split('.')[0]
		)
		ensembl_lookup = ensembl_lookup.set_index('EnsemblID_edit')

		#### Resolve the requested identifiers to dataset field names
		id_map = self.id_mapper_to_entrez(ensembl_lookup, input_list)
		requested_fields = id_map['EnsemblID'].values.tolist()

		#### Retrieve values from Xena (one list of per-sample values per field)
		fetched = xena.dataset_fetch(self.host, self.expr_dataset, cohort_sample_ids, requested_fields)

		# Rows arrive per identifier; transpose so samples index the rows.
		by_identifier = pd.DataFrame(data=fetched, index=input_list, columns=cohort_sample_ids)
		return by_identifier.T.astype(float)
示例#6
0
	def get_TCGA_mut(self, input_list):
		"""Fetch mutation annotations for the requested genes.

		Every entry of ``input_list`` is mapped to a genomic interval by
		``self.id_mapper_to_entrez`` using the local hg38 position table, and
		the sparse data overlapping that interval is requested from
		``self.mut_dataset`` on ``self.host``.

		Args:
			input_list: Identifiers understood by ``self.id_mapper_to_entrez``.

		Returns:
			A list with one ``DataFrame`` per row of the ID map. Each frame is
			indexed by the returned ``sampleID`` values and has the columns
			``<id>_AAC`` (the 'amino-acid' values) and ``<id>_Effect`` (the
			'effect' values), where ``<id>`` is the ID-map index label.
		"""
		samples = xena.cohort_samples(self.host, self.cohort, None)  # all samples of the cohort

		#### Create ID mapper
		pos_df = pd.read_csv('TCGAlib/dataset/position_info_hg38.csv', index_col=0)

		#### ID mapping (copied so the column rewrites below stay local to this call)
		id_map = self.id_mapper_to_entrez(pos_df, input_list).copy()

		#### Build the interval columns used in the range queries
		id_map['chr'] = id_map['chr'].apply(lambda x: 'chr' + x)
		# astype returns a new Series, so the result has to be stored to take effect.
		id_map['start'] = id_map['start'].astype(int)
		id_map['end'] = id_map['end'].astype(int)

		#### Retrieve values from Xena, one range query per mapped row
		values = [
			xena.sparse_data_range(self.host, self.mut_dataset, samples, chrom, start, end)
			for chrom, start, end in zip(id_map['chr'], id_map['start'], id_map['end'])
		]

		result_arr = []
		for item, response in zip(id_map.index.tolist(), values):
			rows = response['rows']
			# Two rows (amino-acid change, effect) by sample, transposed so the
			# samples index the rows.
			gene_df = pd.DataFrame(
				data=[rows['amino-acid'], rows['effect']],
				columns=rows['sampleID'],
				index=[item + '_AAC', item + '_Effect'],
			).T
			result_arr.append(gene_df)

		return result_arr