def main():
    """Export proteomics and phosphoproteomics tables for one CPTAC cancer type.

    Command-line options:
        --cancerType  Cancer type to export. One of ``brca``, ``ccrcc``,
                      ``coad``, ``ovca``, ``luad`` or ``endometrial``
                      (compared case-insensitively).
        --getData     When set, download every dataset in the list below
                      before loading the requested one.

    Side effects:
        Writes ``phos_file.tsv`` and ``prot_file.tsv`` (tab separated) to the
        current working directory.

    Exits through ``parser.error`` (usage message on stderr, exit status 2)
    when ``--cancerType`` is missing or is not one of the supported values.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--cancerType', dest='type',
                        help='Cancer type to be collected')
    parser.add_argument('--getData', dest='get', action='store_true',
                        default=False, help='Set flag to get all data')
    opts = parser.parse_args()

    # Command-line name -> name of the cptac class that loads that dataset.
    dataset_classes = {
        'brca': 'Brca',
        'ccrcc': 'Ccrcc',
        'coad': 'Colon',
        'ovca': 'Ovarian',
        'luad': 'Luad',
        'endometrial': 'Endometrial',
    }

    if opts.get:
        for ds in ['brca', 'ccrcc', 'colon', 'ovarian', 'endometrial', 'luad']:
            cptac.download(dataset=ds)

    # --cancerType is optional as far as argparse is concerned, so opts.type
    # can be None; normalise it before calling .lower().
    requested = (opts.type or '').lower()
    if requested not in dataset_classes:
        parser.error('--cancerType must be one of: '
                     + ', '.join(sorted(dataset_classes)))

    dat = getattr(cptac, dataset_classes[requested])()
    df = dat.get_phosphoproteomics()
    pdf = dat.get_proteomics()
    # df.columns = [' '.join(col).strip() for col in df.columns.values]
    df.to_csv(path_or_buf="phos_file.tsv", sep='\t')
    pdf.to_csv(path_or_buf='prot_file.tsv', sep='\t')
def compute_regression(input_cancer_type):
    """Run the per-gene regression for one cancer type and save it as CSV.

    The cancer dataset is loaded from ``cptac``, turned into a table by
    ``dc.get_prot_trans_df``, grouped by its ``Gene`` column, and the
    ``regression`` function is applied to every group. Rows with missing
    values are dropped, an ``*_FDR`` column is added for the interaction,
    condition and intercept p-values, and the result is written to
    ``<input_cancer_type>_regressions.csv`` in the working directory.

    Args:
        input_cancer_type: One of ``"CCRCC"``, ``"Endometrial"``, ``"LUAD"``,
            ``"HNSCC"``, ``"LSCC"`` or ``"PDAC"`` (case-sensitive).

    Raises:
        ValueError: If ``input_cancer_type`` is not one of the names above.
    """
    # Accepted cancer name -> name of the cptac class that loads it.
    dataset_classes = {
        "CCRCC": "Ccrcc",
        "Endometrial": "Endometrial",
        "LUAD": "Luad",
        "HNSCC": "Hnscc",
        "LSCC": "Lscc",
        "PDAC": "Pdac",
    }
    if input_cancer_type not in dataset_classes:
        # Fail early with a clear message instead of hitting an unbound
        # `cancer` variable further down.
        raise ValueError(
            "Unsupported cancer type {!r}; expected one of: {}".format(
                input_cancer_type, ", ".join(sorted(dataset_classes))))
    cancer = getattr(cptac, dataset_classes[input_cancer_type])()

    df = dc.get_prot_trans_df(cancer)
    results = df.groupby('Gene').apply(regression)

    # One row per gene, built from whatever `regression` returned per group.
    reg_df = pd.DataFrame(list(results))
    reg_df.index = results.index
    reg_df.reset_index(inplace=True)
    reg_df = reg_df.dropna()

    # Element [1] of fdrcorrection's result is stored as the FDR column
    # (presumably the corrected p-values, if `ssm` is
    # statsmodels.stats.multitest -- confirm against the file's imports).
    for term in ('interaction', 'condition', 'intercept'):
        reg_df[term + '_FDR'] = ssm.fdrcorrection(reg_df[term + '_pval'])[1]

    reg_df['Cancer'] = input_cancer_type
    reg_df.to_csv(input_cancer_type + '_regressions.csv', index=False)
def load_cancers(include_pdac=False):
    """Load the cptac cancer datasets together with their display names.

    Args:
        include_pdac: When true, the PDAC dataset is loaded as well and
            appended after the five default datasets.

    Returns:
        A ``(cancers, cancer_names)`` tuple of two parallel lists: the loaded
        cptac dataset objects and the matching names, in the order CCRCC,
        Endometrial, LUAD, HNSCC, LSCC (then PDAC if requested).
    """
    # (display name, dataset class) pairs in the order they are returned.
    loaders = [
        ('CCRCC', cptac.Ccrcc),
        ('Endometrial', cptac.Endometrial),
        ('LUAD', cptac.Luad),
        ('HNSCC', cptac.Hnscc),
        ('LSCC', cptac.Lscc),
    ]
    if include_pdac:
        loaders.append(('PDAC', cptac.Pdac))

    cancers = [load() for _, load in loaders]
    cancer_names = [label for label, _ in loaders]
    return cancers, cancer_names
def getDataForCancer(ctype):
    """Return the loaded cptac dataset for a cancer type name.

    Args:
        ctype: Cancer type name, compared case-insensitively. Recognised
            values are ``brca``, ``ccrcc``, ``coad``, ``ovca``, ``luad`` and
            ``endometrial``.

    Returns:
        A freshly constructed cptac dataset object for the requested type.

    Note:
        Any other value calls ``exit()``, which terminates the interpreter
        instead of returning.
    """
    # Lower-cased cancer name -> name of the cptac class that loads it.
    class_by_type = {
        'brca': 'Brca',
        'ccrcc': 'Ccrcc',
        'coad': 'Colon',
        'ovca': 'Ovarian',
        'luad': 'Luad',
        'endometrial': 'Endometrial',
    }
    class_name = class_by_type.get(ctype.lower())
    if class_name is None:
        exit()
    return getattr(cptac, class_name)()
def __init__(self):
    """Download and load the endometrial, ovarian, ccrcc and colon datasets.

    Each dataset is downloaded with ``version='latest'`` and then loaded.
    The loaded objects are stored on ``self.en``, ``self.ovarian``,
    ``self.ccrcc`` and ``self.colon``, and collected in that order in
    ``self.datasets``.
    """
    # brca, gbm, hnscc and luad are intentionally left out for now (they were
    # commented out in the earlier version of this constructor).
    for dataset_name in ("endometrial", "ovarian", "ccrcc", "colon"):
        cptac.download(dataset=dataset_name, version='latest')

    self.en = cptac.Endometrial()
    self.ovarian = cptac.Ovarian()
    self.ccrcc = cptac.Ccrcc()
    self.colon = cptac.Colon()

    self.datasets = [self.en, self.ovarian, self.ccrcc, self.colon]
def cptacData():
    """Download the openly available CPTAC datasets and load each of them.

    Prints two progress messages, downloads every dataset in the list below
    and then constructs one cptac dataset object per cancer type.

    Returns:
        dict: Lower-case dataset name (``'brca'``, ``'ccrcc'``, ``'colon'``,
        ``'ovarian'``, ``'endometrial'``, ``'luad'``) mapped to the loaded
        cptac dataset object.
    """
    print("Loading cptac datasets")

    # Datasets that are available without login information; hnscc, gbm and
    # lscc are deliberately not included.
    open_datasets = ('brca', 'ccrcc', 'colon', 'ovarian', 'luad',
                     'endometrial')

    print("Downloading cptac data")
    for dataset_name in open_datasets:
        cptac.download(dataset=dataset_name)

    # Load each downloaded dataset, keyed by its name.
    loaded = {}
    loaded['brca'] = cptac.Brca()
    loaded['ccrcc'] = cptac.Ccrcc()
    loaded['colon'] = cptac.Colon()
    loaded['ovarian'] = cptac.Ovarian()
    loaded['endometrial'] = cptac.Endometrial()
    loaded['luad'] = cptac.Luad()
    return loaded
def test_get_frequently_mutated_renal_01_cutoff():
    """Check ``ut.get_frequently_mutated`` on the Ccrcc dataset with cutoff=0.01.

    Loads ``cptac.Ccrcc()``, calls ``ut.get_frequently_mutated`` with
    ``cutoff=0.01`` and passes the result to ``check_getter`` once per group
    of expected cells, together with the expected dimensions and headers.
    The overall outcome is reported through ``print_test_result`` and is a
    pass only if every group passed.
    """
    rc = cptac.Ccrcc()
    print('Running get_frequently_mutated...')
    df = ut.get_frequently_mutated(rc, cutoff=0.01)

    dimensions = (1106, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']

    # test gene names
    test_coord_names = ((11, 0), (992, 0), (1080, 0))
    test_vals_names = ('ABCC3', 'TTN', 'ZNF532')

    # Expected values below are counts divided by this number of tumors.
    total_tumors = 110

    # test no missense
    test_coord_ABCC3 = ((11, 1), (11, 2), (11, 3))
    test_vals_ABCC3 = (2 / total_tumors, 0 / total_tumors, 2 / total_tumors)

    # test no truncation and close to cutoff
    test_coord_ZNF532 = ((1080, 1), (1080, 2), (1080, 3))
    test_vals_ZNF532 = (2 / total_tumors, 2 / total_tumors, 0 / total_tumors)

    # test miss and trunc equal to unique_samples_mutated
    test_coord_NAV3 = ((611, 1), (611, 2), (611, 3))
    test_vals_NAV3 = (7 / total_tumors, 5 / total_tumors, 2 / total_tumors)

    # check that silent mutations are not counted (TTN has many silent
    # mutations) and missense and truncation not equal to
    # unique_samples_mutated
    test_coord_TTN = ((992, 1), (992, 2), (992, 3))
    test_vals_TTN = (13 / total_tumors, 10 / total_tumors, 4 / total_tumors)

    # common test and highest count
    test_coord_VHL = ((1019, 1), (1019, 2), (1019, 3))
    test_vals_VHL = (82 / total_tumors, 33 / total_tumors, 49 / total_tumors)

    test_coord_vals = [
        (test_coord_names, test_vals_names),
        (test_coord_ABCC3, test_vals_ABCC3),
        (test_coord_ZNF532, test_vals_ZNF532),
        (test_coord_NAV3, test_vals_NAV3),
        (test_coord_TTN, test_vals_TTN),
        (test_coord_VHL, test_vals_VHL),
    ]

    # Evaluate every group (no short-circuit) and pass only if all of them
    # passed, so an early failure cannot be masked by a later success.
    outcomes = [check_getter(df, dimensions, headers, coord, vals)
                for coord, vals in test_coord_vals]
    print_test_result(all(outcomes))
def test_get_frequently_mutated_renal_default_cutoff():
    """Check ``ut.get_frequently_mutated`` on the Ccrcc dataset, default cutoff.

    Loads ``cptac.Ccrcc()``, calls ``ut.get_frequently_mutated`` without a
    ``cutoff`` argument and passes the result to ``check_getter`` once per
    group of expected cells, together with the expected dimensions and
    headers. The overall outcome is reported through ``print_test_result``
    and is a pass only if every group passed.
    """
    rc = cptac.Ccrcc()
    print('Running get_frequently_mutated...')
    df = ut.get_frequently_mutated(rc)

    dimensions = (6, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']

    # test gene names
    test_coord_names = ((0, 0), (2, 0), (4, 0))
    test_vals_names = ('BAP1', 'PBRM1', 'TTN')

    # Expected values below are counts divided by this number of tumors.
    total_tumors = 110

    # test miss and trunc equal to unique_samples_mutated
    test_coord_BAP1 = ((0, 1), (0, 2), (0, 3))
    test_vals_BAP1 = (17 / total_tumors, 7 / total_tumors, 10 / total_tumors)

    # test high truncation, low missense count
    test_coord_PBRM1 = ((2, 1), (2, 2), (2, 3))
    test_vals_PBRM1 = (44 / total_tumors, 8 / total_tumors, 37 / total_tumors)

    # check that silent mutations are not counted (TTN has many silent
    # mutations) and missense and truncation not equal to
    # unique_samples_mutated
    test_coord_TTN = ((4, 1), (4, 2), (4, 3))
    test_vals_TTN = (13 / total_tumors, 10 / total_tumors, 4 / total_tumors)

    # test close to cutoff
    test_coord_SETD2 = ((3, 1), (3, 2), (3, 3))
    test_vals_SETD2 = (15 / total_tumors, 2 / total_tumors, 13 / total_tumors)

    # common test and highest count
    test_coord_VHL = ((5, 1), (5, 2), (5, 3))
    test_vals_VHL = (82 / total_tumors, 33 / total_tumors, 49 / total_tumors)

    test_coord_vals = [
        (test_coord_names, test_vals_names),
        (test_coord_BAP1, test_vals_BAP1),
        (test_coord_PBRM1, test_vals_PBRM1),
        (test_coord_TTN, test_vals_TTN),
        (test_coord_SETD2, test_vals_SETD2),
        (test_coord_VHL, test_vals_VHL),
    ]

    # Evaluate every group (no short-circuit) and pass only if all of them
    # passed, so an early failure cannot be masked by a later success.
    outcomes = [check_getter(df, dimensions, headers, coord, vals)
                for coord, vals in test_coord_vals]
    print_test_result(all(outcomes))
# Silence all warnings for the rest of this script.
warnings.filterwarnings('ignore')

# Make the Delta_Correlation module importable: take the directory that
# contains the resolved path of the script name (resolved relative to the
# working directory), go up two levels and add that directory to sys.path.
currentdir = os.path.dirname(
    os.path.realpath('Make_Cancer_Delta_Corr_and_P_Value_Dataframe'))
parentdir = os.path.dirname(currentdir)
parentdir = os.path.dirname(parentdir)
sys.path.append(parentdir)
import Delta_Correlation as dc

# Positional command-line arguments: cancer type, mutated gene and the
# number of permutations (converted to int).
input_cancer_type = sys.argv[1]
mutated_gene = sys.argv[2]
input_permutation_number = int(sys.argv[3])

# `cutoff` is 15 for every cancer type except Endometrial, which uses 10.
# NOTE(review): how the cutoff is consumed is not visible in this fragment.
cutoff = 15
if input_cancer_type == "CCRCC":
    cancer = cptac.Ccrcc()
elif input_cancer_type == "Endometrial":
    cancer = cptac.Endometrial()
    cutoff = 10
elif input_cancer_type == "LUAD":
    cancer = cptac.Luad()
elif input_cancer_type == "HNSCC":
    cancer = cptac.Hnscc()
elif input_cancer_type == "LSCC":
    cancer = cptac.Lscc()
elif input_cancer_type == "PDAC":
    cancer = cptac.Pdac()
else:
    # Stop with a clear message instead of failing below with a NameError on
    # the undefined `cancer`.
    sys.exit("Unsupported cancer type: " + input_cancer_type
             + " (expected CCRCC, Endometrial, LUAD, HNSCC, LSCC or PDAC)")

# Keep only the non-silent somatic mutations of the requested gene.
mutation_df = cancer.get_somatic_mutation()
mutation_df = mutation_df[mutation_df.Gene == mutated_gene]
mutation_df = mutation_df[mutation_df.Mutation != 'Silent']
test_coord_1 = ((index_1, 1), (index_2, 1), (index_3, 1)) # C3N-01515 test_vals_1 = ('No_Mutation', 'No_Mutation', 'No_Mutation') # Test Del test_coord_2 = ((index_4, 1), (index_5, 1), (index_6, 1)) test_vals_2 = ('Deletion', 'Deletion', 'Deletion') # Test Amp test_coord_3 = ((index_7, 1), (index_8, 1), (index_9, 1)) test_vals_3 = ('Amplification', 'Amplification', 'Amplification') test_coord_vals = [(test_coord_1, test_vals_1), (test_coord_2, test_vals_2), (test_coord_3, test_vals_3)] for coord, vals in test_coord_vals: PASS = check_getter(df, dimensions, headers, coord, vals) print_test_result(PASS) k = cptac.Ccrcc() g = cptac.Gbm() h = cptac.Hnscc() print("\nRunning tests:\n") test_genotype_ccrcc_KRAS() test_genotype_gbm_KRAS() test_genotype_hnscc_KRAS() print("Version:", cptac.version())
import numpy as np
import math
import pandas as pd
#import statistics
# import parse_correlations_dataframe as get_corr
import copy
import csv
# import get_correlations
import cptac.utils as ut
import warnings
# Silence all warnings for the rest of this script.
warnings.filterwarnings("ignore")

# The first command-line argument selects which single cancer dataset to
# load; the loaded dataset is wrapped in a one-element `cancer_list`.
input_cancer_type = sys.argv[1]
if input_cancer_type == "ccrcc":
    ccrcc = cptac.Ccrcc()
    cancer_list = [ccrcc]
elif input_cancer_type == "en":
    en = cptac.Endometrial()
    cancer_list = [en]
elif input_cancer_type == "luad":
    luad = cptac.Luad()
    cancer_list = [luad]
elif input_cancer_type == "hnscc":
    hnscc = cptac.Hnscc()
    cancer_list = [hnscc]
elif input_cancer_type == "lscc":
    lscc = cptac.Lscc()
    cancer_list = [lscc]
# NOTE(review): if none of the branches visible here match, this fragment
# never assigns `cancer_list` -- confirm whether later code handles that case.
# brca = cptac.Brca()
# ccrcc = cptac.Ccrcc()