def main(argv):
    """Runs the experiment."""

    syn = synapseclient.Synapse()
    syn.login("grzadkow")

    cdata = VariantCohort(syn, 'TCGA-OV',
                          mut_genes=['TTN'],
                          mut_levels=('Gene', 'Form', 'Exon'),
                          cv_info={'Prop': 0.8, 'Seed': argv[-1]})
    cdata.train_expr_ = cdata.train_expr_.sort_index()

    prot_data = pd.read_csv(in_path + 'PNNL-causality-formatted.txt.zip',
                            sep='\t')
    prot_vec = prot_data.loc[prot_data['ID'] == 'TTN', :]
    prot_vec = prot_vec.loc[:, prot_vec.columns.isin(cdata.train_expr_.index)]
    prot_vec = prot_vec.dropna(axis=1)

    use_indx = cdata.train_expr_.index.isin(prot_vec.columns)
    base_cor = spearmanr(
        np.array(prot_vec)[0],
        np.array(cdata.train_expr_.loc[prot_vec.columns, 'TTN']))

    mtypes = [
        MuType({('Gene', 'TTN'): {('Form', 'Missense_Mutation'): None}}),
        MuType({('Gene', 'TTN'): {('Form', 'Nonsense_Mutation'): None}}),
        ]
    mut_list = [cdata.train_mut_.status(cdata.train_expr_.index, mtype)
                for mtype in mtypes]

    clf = MKBMTL(path_keys={(((), ('controls-state-change-of', )), )})
    clf.named_steps['fit'].R = 5

    clf.fit_coh(cohort=cdata, mtypes=mtypes)
    H_cor = [spearmanr(clf.named_steps['fit'].H_mat['mu'][i, use_indx],
                       np.array(prot_vec)[0])
             for i in range(clf.named_steps['fit'].R)]

    print(clf.named_steps['fit'].bw_mat['mu'].round(2))
    print(clf.eval_coh(cohort=cdata, mtypes=mtypes))

    # saves classifier results to file
    out_file = out_path + argv[0] + '_' + argv[1] + '__run' + argv[-1] + '.p'
    print(out_file)
    out_data = {'H_cor': H_cor, 'base': base_cor}
    pickle.dump(out_data, open(out_file, 'wb'))
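
# Illustrative sketch (not part of the original script): the baseline
# correlation above aligns a proteomic abundance vector with the expression
# matrix on their shared samples before calling scipy's spearmanr. A
# minimal, self-contained version of that alignment step using toy data,
# with hypothetical sample and gene labels:

import numpy as np
import pandas as pd
from scipy.stats import spearmanr


def baseline_cor_sketch():
    # expression: samples x genes; protein: one row of per-sample abundances
    expr = pd.DataFrame(np.random.rand(5, 2),
                        index=['s1', 's2', 's3', 's4', 's5'],
                        columns=['TTN', 'TP53'])
    prot = pd.Series(np.random.rand(4), index=['s1', 's2', 's4', 's6'])

    # keep only the samples present in both datasets, in matching order
    shared = expr.index.intersection(prot.index)
    return spearmanr(prot[shared].values, expr.loc[shared, 'TTN'].values)
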
def main(argv):
    """Runs the experiment."""

    # gets the directory where output will be saved and the name of the TCGA
    # cohort under consideration, loads the list of gene sub-variants
    print(argv)
    out_dir = os.path.join(base_dir, 'output', argv[0], argv[1], argv[2])
    coh_lbl = 'TCGA-{}'.format(argv[0])

    # loads the expression data and gene mutation data for the given TCGA
    # cohort, with the training/testing cohort split defined by the
    # cross-validation id for this task
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()
    cdata = VariantCohort(cohort=coh_lbl, mut_genes=[argv[1]],
                          mut_levels=('Gene', 'Form', 'Exon',
                                      'Location', 'Protein'),
                          syn=syn, cv_seed=(int(argv[3]) + 3) * 17)

    base_mtype = MuType({('Gene', argv[1]): None})
    optim = PartitionOptim(cdata, base_mtype, eval(argv[2]),
                           ('Form', 'Exon', 'Location', 'Protein'))

    while optim.traverse_branch():
        optim_mtypes = optim.best_optim()

    # saves classifier results to file
    out_file = os.path.join(out_dir, 'results',
                            'out__cv-{}.p'.format(argv[3]))
    pickle.dump({'best': optim.best_mtypes, 'hist': optim.mtype_scores,
                 'pred': optim.pred_scores, 'optim': optim.best_optim()},
                open(out_file, 'wb'))
def main():
    """Runs the experiment."""

    parser = argparse.ArgumentParser(
        description=("Test a classifier's ability to predict the presence "
                     "of a list of sub-types.")
        )

    # positional command line arguments
    parser.add_argument('mtype_dir', type=str,
                        help='the folder where sub-types are stored')
    parser.add_argument('cohort', type=str, help='a TCGA cohort')
    parser.add_argument('classif', type=str,
                        help='a classifier in HetMan.predict.classifiers')
    parser.add_argument('base_gene', type=str,
                        help='the gene to cross sub-types against')
    parser.add_argument('cv_id', type=int,
                        help='a random seed used for cross-validation')
    parser.add_argument('task_id', type=int,
                        help='the subset of sub-types to assign to this task')

    parser.add_argument(
        '--tune_splits', type=int, default=8,
        help='how many training cohort splits to use for tuning')
    parser.add_argument(
        '--test_count', type=int, default=24,
        help='how many hyper-parameter values to test in each tuning split')
    parser.add_argument(
        '--parallel_jobs', type=int, default=12,
        help='how many parallel CPUs to allocate the tuning tests across')

    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')
    args = parser.parse_args()

    if args.verbose:
        print("Starting testing for directory\n{}\nwith "
              "cross-validation ID {} and task ID {} ...".format(
                  args.mtype_dir, args.cv_id, args.task_id))

    mtype_list = sorted(pickle.load(
        open(os.path.join(args.mtype_dir, 'tmp', 'mtype_list.p'), 'rb')))

    # loads the pipeline used for classifying variants, gets the mutated
    # genes for each variant under consideration
    mut_clf = eval(args.classif)
    use_genes = reduce(
        or_, [set(gn for gn, _ in mtype.subtype_list())
              for mtype in mtype_list]
        ) | {args.base_gene}

    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = VariantCohort(cohort=args.cohort, mut_genes=list(use_genes),
                          mut_levels=['Gene', 'Form_base', 'Exon', 'Protein'],
                          expr_source='Firehose', data_dir=firehose_dir,
                          syn=syn, cv_seed=(args.cv_id + 53) * 7,
                          cv_prop=2 / 3)

    base_mtype = MuType({('Gene', args.base_gene): None})
    base_train_samps = base_mtype.get_samples(cdata.train_mut)
    base_test_samps = base_mtype.get_samples(cdata.test_mut)

    if args.verbose:
        print("Loaded {} sub-types over {} genes which will be tested using "
              "classifier {} in cohort {} with {} samples.".format(
                  len(mtype_list), len(use_genes), args.classif,
                  args.cohort, len(cdata.samples)))

    out_acc = {mtype: {} for mtype in mtype_list}

    for i, mtype in enumerate(mtype_list):
        if (i % 10) == args.task_id:
            if args.verbose:
                print("Testing {} ...".format(mtype))

            ex_genes = set(gn for gn, _ in mtype.subtype_list())
            clf = mut_clf()
            cur_train_samps = mtype.get_samples(cdata.train_mut)
            cur_test_samps = mtype.get_samples(cdata.test_mut)

            # the baseline test: tune, fit, and evaluate the classifier on
            # the whole cohort
            clf.tune_coh(cdata, mtype, exclude_genes=ex_genes,
                         tune_splits=args.tune_splits,
                         test_count=args.test_count,
                         parallel_jobs=args.parallel_jobs)
            clf.fit_coh(cdata, mtype, exclude_genes=ex_genes)
            out_acc[mtype]['Base'] = clf.eval_coh(cdata, mtype,
                                                  exclude_genes=ex_genes)

            if (len(cur_train_samps - base_train_samps) > 3
                    and len(cur_test_samps - base_test_samps) > 3):
                print("Null test {}".format(mtype))

                clf.tune_coh(cdata, mtype, exclude_genes=ex_genes,
                             tune_splits=args.tune_splits,
                             exclude_samps=base_train_samps,
                             test_count=args.test_count,
                             parallel_jobs=args.parallel_jobs)
                clf.fit_coh(cdata, mtype, exclude_genes=ex_genes,
                            exclude_samps=base_train_samps)
                out_acc[mtype]['Null'] = clf.eval_coh(
                    cdata, mtype,
                    exclude_genes=ex_genes, exclude_samps=base_test_samps)

            if (len(cur_train_samps & base_train_samps) > 3
                    and len(cur_test_samps & base_test_samps) > 3):
                print("Mut test {}".format(mtype))

                clf.tune_coh(cdata, mtype, exclude_genes=ex_genes,
                             tune_splits=args.tune_splits,
                             include_samps=base_train_samps,
                             test_count=args.test_count,
                             parallel_jobs=args.parallel_jobs)
                clf.fit_coh(cdata, mtype, exclude_genes=ex_genes,
                            include_samps=base_train_samps)
                out_acc[mtype]['Mut'] = clf.eval_coh(
                    cdata, mtype,
                    exclude_genes=ex_genes, include_samps=base_test_samps)

            if (len(cur_train_samps - base_train_samps) > 3
                    and len(cur_test_samps & base_test_samps) > 3):
                print("Null cross {}".format(mtype))

                clf.tune_coh(cdata, mtype, exclude_genes=ex_genes,
                             tune_splits=args.tune_splits,
                             exclude_samps=base_train_samps,
                             test_count=args.test_count,
                             parallel_jobs=args.parallel_jobs)
                clf.fit_coh(cdata, mtype, exclude_genes=ex_genes,
                            exclude_samps=base_train_samps)
                out_acc[mtype]['NullX'] = clf.eval_coh(
                    cdata, mtype,
                    exclude_genes=ex_genes, include_samps=base_test_samps)

            if (len(cur_train_samps & base_train_samps) > 3
                    and len(cur_test_samps - base_test_samps) > 3):
                print("Mut cross {}".format(mtype))

                clf.tune_coh(cdata, mtype, exclude_genes=ex_genes,
                             tune_splits=args.tune_splits,
                             include_samps=base_train_samps,
                             test_count=args.test_count,
                             parallel_jobs=args.parallel_jobs)
                clf.fit_coh(cdata, mtype, exclude_genes=ex_genes,
                            include_samps=base_train_samps)
                out_acc[mtype]['MutX'] = clf.eval_coh(
                    cdata, mtype,
                    exclude_genes=ex_genes, exclude_samps=base_test_samps)

        else:
            del out_acc[mtype]

    # saves the performance measurements for each variant to file
    out_file = os.path.join(
        args.mtype_dir, 'results',
        'out__cv-{}_task-{}.p'.format(args.cv_id, args.task_id))
    pickle.dump({'Acc': out_acc,
                 'Info': {'TuneSplits': args.tune_splits,
                          'TestCount': args.test_count,
                          'ParallelJobs': args.parallel_jobs}},
                open(out_file, 'wb'))
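
# Illustrative sketch (not part of the original script): the four
# conditional tests above partition cohort samples against the set mutated
# for the base gene using plain set algebra. 'Null' trains and evaluates
# outside that set, 'Mut' inside it, and the 'X' variants train on one side
# and evaluate on the other. A toy version of the gating logic, with a
# hypothetical min_size threshold standing in for the hard-coded 3:


def cross_conditions_sketch(cur_train, cur_test, base_train, base_test,
                            min_size=3):
    conditions = {}

    # enough subtype samples outside the base gene's mutated set?
    if (len(cur_train - base_train) > min_size
            and len(cur_test - base_test) > min_size):
        conditions['Null'] = ('exclude', 'exclude')

    # enough subtype samples inside it?
    if (len(cur_train & base_train) > min_size
            and len(cur_test & base_test) > min_size):
        conditions['Mut'] = ('include', 'include')

    # train outside the base set, evaluate inside it (and vice versa)
    if (len(cur_train - base_train) > min_size
            and len(cur_test & base_test) > min_size):
        conditions['NullX'] = ('exclude', 'include')
    if (len(cur_train & base_train) > min_size
            and len(cur_test - base_test) > min_size):
        conditions['MutX'] = ('include', 'exclude')

    return conditions
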
def main():
    """Runs the experiment."""

    parser = argparse.ArgumentParser(
        description='Set up touring for sub-types to detect.'
        )
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")

    # optional command line arguments controlling the thresholds for which
    # individual mutations and how many genes' mutations are considered
    parser.add_argument('--freq_cutoff', type=float, default=0.02,
                        help='subtype sample frequency threshold')

    # optional command line arguments for what kinds of mutation sub-types to
    # look for in terms of properties and number of mutations to combine
    parser.add_argument('--mut_levels', type=str, default='Gene',
                        help='the mutation property levels to consider')

    # optional command line argument controlling verbosity
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse the command line arguments, get the directory where found
    # sub-types will be saved for future use
    args = parser.parse_args()
    out_path = os.path.join(base_dir, 'setup', args.cohort)
    os.makedirs(out_path, exist_ok=True)
    use_lvls = args.mut_levels.split('__')

    # log into Synapse using locally-stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(cohort=args.cohort, mut_genes=None,
                           mut_levels=use_lvls, expr_source='Firehose',
                           var_source='mc3', expr_dir=firehose_dir,
                           cv_prop=1.0, samp_cutoff=args.freq_cutoff,
                           syn=syn)

    if args.verbose:
        print("Found {} candidate genes with mutations in at least "
              "{:.1f}% of the samples in TCGA cohort {}.\nLooking for "
              "subtypes of these genes that are combinations of up to two "
              "mutations at annotation levels {} ...\n".format(
                  len(tuple(cdata.train_mut)), args.freq_cutoff * 100,
                  args.cohort, use_lvls))

    min_samps = args.freq_cutoff * len(cdata.samples)

    if use_lvls == ['Gene']:
        use_mtypes = {MuType({('Gene', gn): None})
                      for gn, mut in cdata.train_mut
                      if len(mut) >= min_samps}

    elif use_lvls[0] == 'Gene':
        use_lvls = use_lvls[1:]
        use_mtypes = set()
        use_sampsets = set()
        mtype_sampsets = dict()

        for gn, mut in cdata.train_mut:
            cur_mtypes = {MuType({('Gene', gn): mtype})
                          for mtype in mut.combtypes(comb_sizes=(1, 2),
                                                     sub_levels=use_lvls,
                                                     min_type_size=min_samps)}

            # finds the samples belonging to each enumerated sub-type that
            # hasn't already been found
            cur_sampsets = {
                mtype: frozenset(mtype.get_samples(cdata.train_mut))
                for mtype in cur_mtypes - use_mtypes}

            # removes the sub-types with so many mutated samples that there
            # are not enough negatively-labelled samples for classification
            mtype_sampsets.update({
                mtype: sampset for mtype, sampset in cur_sampsets.items()
                if len(sampset) <= (len(cdata.samples) - min_samps)})

        # ensures that when two sub-types have the same samples the one
        # further down the sort order gets removed
        sub_mtypes = sorted(list(mtype_sampsets))
        if args.verbose:
            print("Found {} new sub-types!\n".format(len(sub_mtypes)))

        for i, mtype in enumerate(sub_mtypes):
            if args.verbose and (i % 200) == 100:
                print("\nchecked {} sub-types\n".format(i))

            # removes each sub-type whose set of mutated samples is
            # identical to that of a sub-type that was already found
            if mtype_sampsets[mtype] in use_sampsets:
                if args.verbose:
                    print("Removing functionally duplicate MuType {}"
                          .format(mtype))
            else:
                use_mtypes.update({mtype})
                use_sampsets.update({mtype_sampsets[mtype]})

    else:
        cur_mtypes = cdata.train_mut.combtypes(comb_sizes=(1, 2),
                                               sub_levels=use_lvls,
                                               min_type_size=min_samps)
        use_mtypes = set()
        use_sampsets = set()
        mtype_sampsets = dict()

        cur_sampsets = {mtype: frozenset(mtype.get_samples(cdata.train_mut))
                        for mtype in cur_mtypes - use_mtypes}

        # removes the sub-types with so many mutated samples that there
        # are not enough negatively-labelled samples for classification
        mtype_sampsets.update({
            mtype: sampset for mtype, sampset in cur_sampsets.items()
            if len(sampset) <= (len(cdata.samples) - min_samps)})

        # ensures that when two sub-types have the same samples the one
        # further down the sort order gets removed
        sub_mtypes = sorted(list(mtype_sampsets))
        if args.verbose:
            print("Found {} new sub-types!\n".format(len(sub_mtypes)))

        for i, mtype in enumerate(sub_mtypes):
            if args.verbose and (i % 200) == 100:
                print("\nchecked {} sub-types\n".format(i))

            # removes each sub-type whose set of mutated samples is
            # identical to that of a sub-type that was already found
            if mtype_sampsets[mtype] in use_sampsets:
                if args.verbose:
                    print("Removing functionally duplicate MuType {}"
                          .format(mtype))
            else:
                use_mtypes.update({mtype})
                use_sampsets.update({mtype_sampsets[mtype]})

    if args.verbose:
        print("\nFound {} total sub-types!".format(len(use_mtypes)))

    # save the list of found non-duplicate sub-types to file
    pickle.dump(
        sorted(list(use_mtypes)),
        open(os.path.join(out_path,
                          'mtype_list__freq_{}__levels_{}.p'.format(
                              args.freq_cutoff, args.mut_levels)),
             'wb'))

    pickle.dump({'Samps': cdata.samples},
                open(os.path.join(out_path, 'cohort_info.p'), 'wb'))

    with open(os.path.join(out_path,
                           'mtype_count__freq_{}__levels_{}.txt'.format(
                               args.freq_cutoff, args.mut_levels)),
              'w') as fl:
        fl.write(str(len(use_mtypes)))
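
# Illustrative sketch (not part of the original script): the deduplication
# above keys each sub-type by the frozenset of samples it covers, so two
# sub-types that label exactly the same samples are treated as one. A toy
# version with strings standing in for MuTypes:


def dedup_by_sampset_sketch(mtype_sampsets):
    use_mtypes, use_sampsets = set(), set()

    # iterate in sorted order so the first-sorted of any duplicate pair wins
    for mtype in sorted(mtype_sampsets):
        if mtype_sampsets[mtype] not in use_sampsets:
            use_mtypes.add(mtype)
            use_sampsets.add(mtype_sampsets[mtype])

    return use_mtypes

# e.g. dedup_by_sampset_sketch({'a': frozenset({'s1'}),
#                               'b': frozenset({'s1'})}) == {'a'}
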
def main(argv):
    """Runs the experiment."""

    syn = synapseclient.Synapse()
    syn.login()

    # load drug-mutation association data,
    # filter for pan-cancer associations
    drug_mut_assoc = pd.read_csv(
        base_dir + '/../../data/drugs/ioria/drug_anova.txt.gz',
        sep='\t', comment='#')
    if patient_cohs[argv[0]] in drug_mut_assoc.columns:
        drug_mut_assoc = drug_mut_assoc.loc[
            drug_mut_assoc[patient_cohs[argv[0]]] != 0, :]
    else:
        drug_mut_assoc = drug_mut_assoc.loc[drug_mut_assoc['PANCAN'] != 0, :]

    # categorize associations by mutation type
    pnt_indx = drug_mut_assoc['FEAT'].str.contains('_mut$')
    # TODO: determine how iorio handled CNVs (they're currently ignored)
    cnv_indx = drug_mut_assoc['FEAT'].str.contains('^(?:loss|gain):')
    fus_indx = drug_mut_assoc['FEAT'].str.contains('_fusion$')

    # get list of genes affected by point mutations, load TCGA cohort
    # with corresponding set of mutations
    pnt_genes = list(set(
        x[0] for x in drug_mut_assoc['FEAT'][pnt_indx].str.split('_')))
    print(len(pnt_genes))

    # create a VariantCohort with expression only for genes which have
    # point mutations in the drug_mut_assoc dataframe
    # (cv_prop = cross-validation proportion; train on all samples here)
    # the cross-validation seed is provided as the last argument in an
    # HTCondor submit script, and the cohort name is the first (it should
    # match the cohort names as they appear in BMEG)
    tcga_var_coh = VariantCohort(
        syn, cohort="TCGA-{}".format(patient_cohs[argv[0]]),
        mut_genes=pnt_genes, mut_levels=['Gene', 'Type'],
        cv_seed=int(argv[-1]) + 1, cv_prop=1)

    tcga_back_cohs = {
        coh: VariantCohort(syn, cohort=coh, mut_genes=pnt_genes,
                           mut_levels=['Gene', 'Type'],
                           cv_seed=int(argv[-1]) + 1, cv_prop=1)
        for coh in tcga_backcohs}

    # TODO: recall why frameshifts aren't considered below
    # get the list of point mutation types and the drugs associated with
    # at least one of them
    pnt_mtypes = [MuType({('Gene', gn): {('Type', ('Frame', 'Point')): None}})
                  for gn in pnt_genes]
    pnt_muts = {(gn + '_mut'): mtype
                for gn, mtype in zip(pnt_genes, pnt_mtypes)
                # TODO: the get_samples argument should be a MuTree...right?
                if len(mtype.get_samples(tcga_var_coh.train_mut)) >= 5}
    pnt_drugs = list(set(
        drug_mut_assoc['DRUG'][pnt_indx][
            drug_mut_assoc['FEAT'][pnt_indx].isin(pnt_muts.keys())]))
    pnt_drugs.sort()
    print(len(pnt_drugs))

    # stores predicted drug responses for cell lines and TCGA samples
    ccle_response = {}
    tcga_response = {}
    back_tcga_resp = {coh: {} for coh in tcga_backcohs}

    # stores predicted drug response for the organoid sample
    patient_response = pd.Series(float('nan'), index=pnt_drugs)

    # array that stores classifier performance on held-out cell lines
    clf_perf = pd.Series(float('nan'), index=pnt_drugs)

    # stores t-test p-values for mutation state vs predicted
    # drug responses in the TCGA cohort
    tcga_ttest = pd.DataFrame(float('nan'), index=pnt_drugs,
                              columns=pnt_muts.keys())

    # stores AUC scores for mutation vs drug response in TCGA
    tcga_auc = pd.DataFrame(float('nan'), index=pnt_drugs,
                            columns=pnt_muts.keys())

    # loads patient (or patient-derived model (PDM)) RNAseq data
    patient_expr = pd.read_csv(patient_files[argv[0]], header=0, sep='\t')

    # get rid of the unnecessary info in gene_id, get Hugo symbols
    patient_expr['gene_id'] = [i.split('^')[1]
                               for i in patient_expr['gene_id']]
    annot_data = get_gencode()
    patient_expr['Symbol'] = [annot_data[gn]['gene_name']
                              if gn in annot_data else 'no_gene'
                              for gn in patient_expr['gene_id']]

    # ensure that there are no zeros in preparation for log normalization
    patient_expr.loc[:, 'FPKM'] = (
        patient_expr.loc[:, 'FPKM']
        + min(patient_expr.loc[:, 'FPKM'][patient_expr.loc[:, 'FPKM'] > 0])
        / 2)

    # log normalize the FPKM values
    patient_expr.loc[:, 'FPKM'] = np.log2(patient_expr.loc[:, 'FPKM'])

    # combine multiple entries of the same gene symbol (use their mean)
    patient_expr = patient_expr.groupby(['Symbol'])['FPKM'].mean()
    patient_expr = pd.DataFrame(patient_expr)

    for drug in pnt_drugs:
        drug_clf = eval(argv[1])()
        cell_line_drug_coh = DrugCohort(cohort='ioria', drug_names=[drug],
                                        cv_seed=int(argv[-1]))
        drug_lbl = cell_line_drug_coh.train_resp.columns[0]
        print("Testing drug {} with alias {} ...".format(drug, drug_lbl))

        # TODO: 'Symbol' --> gene_id
        # get the union of genes in all three datasets (TCGA, CCLE, and the
        # patient/PDM RNAseq)
        use_genes = (set(tcga_var_coh.genes)
                     & set(cell_line_drug_coh.genes)
                     & set(patient_expr.index)
                     & reduce(lambda x, y: x & y,
                              [coh.genes for coh in tcga_back_cohs.values()]))

        # filter patient (or PDM) RNAseq data to include only use_genes
        patient_expr_filt = patient_expr.loc[use_genes, :]
        # TODO: does patient_expr_filt need to be transposed?

        # tunes and fits the classifier on the CCLE data, and evaluates its
        # performance on the held-out samples
        pr = cProfile.Profile()
        pr.enable()
        drug_clf.tune_coh(cell_line_drug_coh, pheno=drug_lbl,
                          tune_splits=4, test_count=16,
                          include_genes=use_genes)
        drug_clf.fit_coh(cell_line_drug_coh, pheno=drug_lbl,
                         include_genes=use_genes)
        pr.disable()

        s = io.StringIO()
        sortby = 'cumulative'
        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        ps.print_stats()
        print(s.getvalue())

        print(drug_clf)
        clf_perf[drug] = drug_clf.eval_coh(cell_line_drug_coh,
                                           pheno=drug_lbl,
                                           include_genes=use_genes)

        # predicts drug response for the patient or PDM, stores the
        # classifier's predictions for later use
        ccle_response[drug] = pd.Series(
            drug_clf.predict_train(cell_line_drug_coh,
                                   include_genes=use_genes))
        tcga_response[drug] = pd.Series(
            drug_clf.predict_train(tcga_var_coh, include_genes=use_genes))
        for coh in tcga_backcohs:
            back_tcga_resp[coh][drug] = pd.Series(
                drug_clf.predict_train(tcga_back_cohs[coh],
                                       include_genes=use_genes))
        patient_response[drug] = drug_clf.predict(
            patient_expr_filt.transpose())[0]

        for gn, mtype in pnt_muts.items():
            print("Gene: {}, Drug: {}".format(gn, drug))

            # for each mutated gene, get the vector of mutation status
            # for the TCGA samples
            mut_stat = np.array(tcga_var_coh.train_pheno(mtype=mtype))

            # gets the classifier's predictions of drug response for the
            # TCGA cohort, and evaluates their concordance with mutation
            # status
            tcga_ttest.loc[drug, gn] = -log10(
                ttest_ind(tcga_response[drug][mut_stat],
                          tcga_response[drug][~mut_stat],
                          equal_var=False)[1])
            tcga_auc.loc[drug, gn] = roc_auc_score(mut_stat,
                                                   tcga_response[drug])

    # save everything to file
    out_data = {'Performance': clf_perf,
                'CCLE_Response': ccle_response,
                'TCGA_Response': tcga_response,
                'back_TCGA_Response': back_tcga_resp,
                'Patient_Response': patient_response,
                'TCGA_ttest': tcga_ttest,
                'TCGA_AUC': tcga_auc}

    out_file = ('/home/users/grzadkow/compbio/bergamot/HetMan/experiments/'
                'drug_predictions/output/mat_' + argv[0] + '_' + argv[1]
                + '__run' + argv[-1] + '.p')
    pickle.dump(out_data, open(out_file, 'wb'))
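
# Illustrative sketch (not part of the original script): the preprocessing
# above offsets FPKM values by half the smallest positive value (so zeros
# survive the log), log2-transforms them, and averages rows that share a
# gene symbol. A self-contained pandas version of those three steps, with
# toy gene symbols:

import numpy as np
import pandas as pd


def log_normalize_fpkm_sketch():
    expr = pd.DataFrame({'Symbol': ['TP53', 'TP53', 'TTN'],
                         'FPKM': [0.0, 4.0, 2.0]})

    # pseudo-count: half the minimum positive FPKM keeps log2 finite at zero
    pseudo = expr.loc[expr['FPKM'] > 0, 'FPKM'].min() / 2
    expr['FPKM'] = np.log2(expr['FPKM'] + pseudo)

    # collapse duplicate gene symbols by averaging their log-values
    return expr.groupby('Symbol')['FPKM'].mean()
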
def main():
    """Runs the experiment."""

    parser = argparse.ArgumentParser(
        description=("Test a classifier's ability to create a mutation "
                     "signature for a gene that can be transferred from a "
                     "TCGA cohort to ICGC PACA-AU.")
        )

    parser.add_argument('classif', type=str,
                        help='a classifier in HetMan.predict.classifiers')
    parser.add_argument('mtypes', type=str,
                        help='a list of mutation types to test')
    parser.add_argument('cv_id', type=int,
                        help='a random seed used for cross-validation')
    parser.add_argument('task_id', type=int,
                        help=('the subset of TCGA cohorts and mutated genes '
                              'to assign to this task'))

    parser.add_argument(
        '--tune_splits', type=int, default=4,
        help='how many training cohort splits to use for tuning'
        )
    parser.add_argument(
        '--test_count', type=int, default=24,
        help='how many hyper-parameter values to test in each tuning split'
        )
    parser.add_argument(
        '--parallel_jobs', type=int, default=8,
        help='how many parallel CPUs to allocate the tuning tests across'
        )

    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')
    args = parser.parse_args()

    if args.verbose:
        print("Starting ICGC transfer test with classifier {} on mutation "
              "type list `{}` for cross-validation ID {} and "
              "task ID {} ...".format(args.classif, args.mtypes,
                                      args.cv_id, args.task_id))

    cohort_mtypes = sorted(pickle.load(
        open(os.path.join(base_dir, 'setup',
                          'cohort_{}.p'.format(args.mtypes)),
             'rb')))

    test_count = ceil(len(cohort_mtypes) / 6)
    cohort_mtypes = [x for i, x in enumerate(cohort_mtypes)
                     if i // test_count == args.task_id]
    use_cohorts = set(coh for coh, _ in cohort_mtypes)

    mut_clf = eval(args.classif)
    out_acc = {cohort: dict() for cohort in use_cohorts}
    out_par = {cohort: dict() for cohort in use_cohorts}

    cdata_icgc = ICGCcohort('PACA-AU', icgc_data_dir, mut_genes=None,
                            samp_cutoff=[1/12, 11/12], cv_prop=1.0)

    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/mgrzad"
                                "/input-data/synapse")
    syn.login()

    for cohort in use_cohorts:
        cur_mtypes = [mtype for coh, mtype in cohort_mtypes if coh == cohort]

        if args.mtypes == 'genes':
            cur_genes = cur_mtypes.copy()
            cur_mtypes = [MuType({('Gene', gn): None}) for gn in cur_genes]
        else:
            cur_genes = reduce(
                or_, [set(gn for gn, _ in mtype.subtype_list())
                      for mtype in cur_mtypes]
                )

        tcga_cdata = TCGAcohort(
            cohort=cohort, mut_genes=cur_genes,
            mut_levels=['Gene', 'Form_base'],
            expr_source='toil', expr_dir=toil_dir, var_source='mc3',
            syn=syn, collapse_txs=True,
            cv_prop=0.75, cv_seed=(args.cv_id - 37) * 101
            )

        if args.verbose:
            print("Loaded mutations for {} genes in cohort {} with "
                  "{} samples.".format(len(cur_genes), cohort,
                                       len(tcga_cdata.samples)))

        for mtype in cur_mtypes:
            if args.verbose:
                print("Testing {} in {} ...".format(mtype, cohort))

            clf = mut_clf()
            use_genes = ((cdata_icgc.genes & tcga_cdata.genes)
                         - set(gn for gn, _ in mtype.subtype_list()))

            clf.tune_coh(tcga_cdata, mtype, include_genes=use_genes,
                         tune_splits=args.tune_splits,
                         test_count=args.test_count,
                         parallel_jobs=args.parallel_jobs)
            out_par[cohort][mtype] = {par: clf.get_params()[par]
                                      for par, _ in clf.tune_priors}

            clf.fit_coh(tcga_cdata, mtype, include_genes=use_genes)
            out_acc[cohort][mtype] = clf.eval_coh(
                cdata_icgc, mtype, include_genes=use_genes, use_train=True
                )

    out_file = os.path.join(base_dir, 'output', args.classif, args.mtypes,
                            'out__cv-{}_task-{}.p'.format(
                                args.cv_id, args.task_id)
                            )
    pickle.dump({'Acc': out_acc, 'Par': out_par,
                 'Info': {'TuneSplits': args.tune_splits,
                          'TestCount': args.test_count,
                          'ParallelJobs': args.parallel_jobs}},
                open(out_file, 'wb'))
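
# Illustrative sketch (not part of the original script): the task filtering
# above splits a sorted work list into six contiguous chunks, with each
# cluster task claiming the chunk matching its task ID. A toy version of
# that sharding scheme:

from math import ceil


def shard_work_sketch(work_items, task_id, n_tasks=6):
    chunk = ceil(len(work_items) / n_tasks)
    return [x for i, x in enumerate(sorted(work_items))
            if i // chunk == task_id]

# e.g. shard_work_sketch(list(range(10)), 0) == [0, 1]  (chunk size 2)
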
def main():
    """Runs the experiment."""

    parser = argparse.ArgumentParser(
        description='Set up searching for sub-types to detect.')

    # positional command line arguments
    parser.add_argument('cohort', type=str, help='a TCGA cohort')
    parser.add_argument('classif', type=str,
                        help='a classifier in HetMan.predict.classifiers')
    parser.add_argument('base_gene', type=str,
                        help='the gene to cross sub-types against')

    # optional command line arguments controlling the thresholds for which
    # individual mutations and how many genes' mutations are considered
    parser.add_argument('--freq_cutoff', type=int, default=20,
                        help='sub-type sample frequency threshold')
    parser.add_argument('--max_genes', type=int, default=200,
                        help='maximum number of mutated genes to consider')

    # optional command line arguments for what kinds of mutation sub-types to
    # look for in terms of properties and number of mutations to combine
    parser.add_argument(
        '--mut_levels', type=str, nargs='+',
        default=['Form_base', 'Exon', 'Protein'],
        help='the mutation property levels to consider in addition to `Gene`')
    parser.add_argument(
        '--comb_size', type=int, default=2,
        help='maximum number of individual mutations to combine '
             'when searching for mutation sub-types')

    # optional command line argument controlling verbosity
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse the command line arguments, get the directory where found
    # sub-types will be saved for future use
    args = parser.parse_args()
    out_path = os.path.join(base_dir, 'output', args.cohort, args.classif,
                            'cross', args.base_gene)

    if args.verbose:
        print("Looking for mutation sub-types in cohort {} composed of at "
              "most {} individual mutations with at least {} "
              "samples in total.\n".format(args.cohort, args.comb_size,
                                           args.freq_cutoff))

    # log into Synapse using locally-stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # load the expression matrix for the given cohort from Broad Firehose,
    # load the MC3 variant call set from Synapse, find the mutations for the
    # samples that are in both datasets
    expr_data = get_expr_firehose(args.cohort, firehose_dir)
    mc3_data = get_variants_mc3(syn)
    expr_mc3 = mc3_data.loc[mc3_data['Sample'].isin(expr_data.index), :]

    # get the genes whose mutations appear in enough samples to pass the
    # frequency threshold
    gene_counts = expr_mc3.groupby(by='Gene').Sample.nunique()
    count_cutoff = int(args.freq_cutoff / args.comb_size)
    common_genes = set(gene_counts.index[gene_counts >= count_cutoff])

    if args.verbose:
        print("Found {} candidate genes with at least {} potentially "
              "mutated samples.".format(len(common_genes), count_cutoff))

    if len(common_genes) >= args.max_genes:
        gene_counts = gene_counts[common_genes].sort_values(ascending=False)
        common_genes = set(gene_counts[:args.max_genes].index)

        if args.verbose:
            print("Too many genes found, culling list to {} genes which "
                  "each have at least {} mutated samples.".format(
                      args.max_genes, min(gene_counts[common_genes])))

    cdata = VariantCohort(cohort=args.cohort, mut_genes=common_genes,
                          mut_levels=['Gene'] + args.mut_levels,
                          expr_source='Firehose', data_dir=firehose_dir,
                          cv_prop=1.0, syn=syn)

    base_mtype = MuType({('Gene', args.base_gene): None})
    base_samps = base_mtype.get_samples(cdata.train_mut)
    with_muts = deepcopy(cdata.train_mut).subtree(base_samps)
    without_muts = deepcopy(cdata.train_mut).subtree(
        cdata.samples - base_samps)

    # initializes the list of found sub-types and the list of samples each
    # sub-type appears in
    use_mtypes = set()
    use_sampsets = set()

    search_level = 1
    break_status = False

    # while we have neither reached the limit of sub-type enumeration nor
    # run out of property level combinations to test...
    while (len(use_mtypes) < 10000 and not break_status
           and search_level <= 2 ** len(args.mut_levels)):

        # try a list of property level combinations and numbers of
        # individual variants to combine, where the complexity of the level
        # combination plus the variant count is held constant
        for lvl_combn, comb_size in zip(
                rev_powerset_slice(args.mut_levels, search_level),
                range(1, min(search_level + 1, args.comb_size + 1))):
            use_lvls = ['Gene'] + list(lvl_combn)

            if args.verbose:
                print("\nLooking for sub-types that are combinations "
                      "of {} mutation(s) at levels {}...\n".format(
                          comb_size, use_lvls))

            # enumerates the sub-types consisting of a combination of the
            # given number of individual mutations at the given property
            # levels
            sub_mtypes = with_muts.combtypes(
                comb_sizes=(comb_size, ), sub_levels=use_lvls,
                min_type_size=int(args.freq_cutoff / 2))
            sub_mtypes |= without_muts.combtypes(
                comb_sizes=(comb_size, ), sub_levels=use_lvls,
                min_type_size=int(args.freq_cutoff / 2))

            # finds the samples belonging to each enumerated sub-type that
            # hasn't already been found
            mtype_sampsets = {
                mtype: frozenset(mtype.get_samples(cdata.train_mut))
                for mtype in sub_mtypes - use_mtypes
                if (mtype & base_mtype).is_empty()}

            # removes the sub-types with so many mutated samples that there
            # are not enough negatively-labelled samples for classification
            mtype_sampsets = {
                mtype: sampset for mtype, sampset in mtype_sampsets.items()
                if len(sampset) <= (len(cdata.samples) - args.freq_cutoff)}

            sub_mtypes = sorted(list(mtype_sampsets))
            if args.verbose:
                print("Found {} new sub-types!\n".format(len(sub_mtypes)))

            # if the list of remaining sub-types isn't too long...
            if len(sub_mtypes) < 8000:
                add_mtypes = set()

                for i, mtype in enumerate(sub_mtypes):
                    if args.verbose and (i % 200) == 100:
                        print("\nchecked {} sub-types\n".format(i))

                    # ...we remove each one whose set of mutated samples is
                    # identical to that of a sub-type that was already found
                    if mtype_sampsets[mtype] in use_sampsets:
                        if args.verbose:
                            print("Removing functionally duplicate "
                                  "MuType {}".format(mtype))
                    else:
                        add_mtypes.update({mtype})
                        use_sampsets.update({mtype_sampsets[mtype]})

                use_mtypes |= add_mtypes

            elif len(sub_mtypes) > 100000:
                break_status = True

        search_level += 1

    if args.verbose:
        print("\nFound {} total sub-types!".format(len(use_mtypes)))

    # save the list of found non-duplicate sub-types to file
    pickle.dump(sorted(list(use_mtypes)),
                open(os.path.join(out_path, 'tmp/mtype_list.p'), 'wb'))
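
# Illustrative sketch (not part of the original script): rev_powerset_slice
# is a repository helper whose implementation isn't shown here; the loop
# above pairs level combinations with variant counts so that their combined
# complexity stays constant as search_level grows. One plausible reading of
# that pairing, written with itertools — an assumption about the helper's
# semantics, not the repository's actual implementation:

from itertools import combinations


def powerset_slice_sketch(levels, search_level):
    # yield (level combination, comb_size) pairs where the combination
    # shrinks as comb_size grows, keeping their total complexity constant
    for comb_size in range(1, search_level + 1):
        lvl_size = search_level - comb_size + 1
        if lvl_size <= len(levels):
            for lvl_combn in combinations(levels, lvl_size):
                yield list(lvl_combn), comb_size
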