def main():
    """Plot the clustering of a TCGA cohort by an unsupervised method.

    Command-line arguments:
        out_dir: directory handed to ``merge_cohort_data``.
        transform: key into ``clust_algs`` naming the method to apply.
        --use_seed: seed passed to ``np.random.seed`` (default 1301).

    Raises:
        ValueError: if the cohort's source (the part of ``cdata.cohort``
            before the first underscore) is absent from the ``DISEASE``
            column of the subtypes file.
    """
    # The first positional parameter of ArgumentParser is `prog`, so the
    # description has to be given by keyword to show up as help text.
    parser = argparse.ArgumentParser(
        description=(
            "Plots the clustering done by an unsupervised learning method "
            "on a TCGA cohort with molecular subtypes highlighted."
        )
    )

    parser.add_argument('out_dir', type=str)
    parser.add_argument('transform', type=str,
                        choices=list(clust_algs.keys()),
                        help='an unsupervised learning method')
    parser.add_argument('--use_seed', type=int, default=1301)

    args = parser.parse_args()
    np.random.seed(args.use_seed)

    cdata = merge_cohort_data(args.out_dir)
    type_data = pd.read_csv(type_file, sep='\t', index_col=0, comment='#')

    # str.split returns the whole string as its first element when the
    # separator is absent, so no separate branch is needed for cohort
    # names without an underscore.
    use_cohort = cdata.cohort.split('_')[0]

    if use_cohort not in type_data.DISEASE.values:
        raise ValueError("The source of this cohort ({}) does not "
                         "match those present in the TCGA subtypes "
                         "file!".format(use_cohort))

    type_data = type_data[type_data.DISEASE == use_cohort]
    trans_expr = clust_algs[args.transform].fit_transform_coh(cdata)

    plot_clustering(trans_expr.copy(), args, cdata, type_data)
def main():
    """Plot performance and tuning characteristics of a Stan model.

    Command-line arguments:
        cohort: which TCGA cohort was used.
        gene: a mutated gene.
        model_name: which mutation classifier was tested; the part before
            the first ``'__'`` names the plot sub-directory.

    The pickled experiment output is read from
    ``<base_dir>/<cohort>__<gene>/out-data__<model_name>.p``.
    """
    # The first positional parameter of ArgumentParser is `prog`, so the
    # description has to be given by keyword to show up as help text.
    parser = argparse.ArgumentParser(
        description=(
            "Plots the performance and tuning characteristics of a Stan "
            "model in classifying the mutation status of the genes in a "
            "given cohort."
        )
    )

    parser.add_argument('cohort', type=str,
                        help="which TCGA cohort was used")
    parser.add_argument('gene', type=str, help="a mutated gene")
    parser.add_argument('model_name', type=str,
                        help="which mutation classifier was tested")

    args = parser.parse_args()
    out_tag = "{}__{}".format(args.cohort, args.gene)

    # create the directory where the plots will be saved
    os.makedirs(
        os.path.join(plot_dir, out_tag, args.model_name.split('__')[0]),
        exist_ok=True
    )

    cdata = merge_cohort_data(os.path.join(base_dir, out_tag))
    out_file = os.path.join(base_dir, out_tag,
                            "out-data__{}.p".format(args.model_name))

    with open(out_file, 'rb') as fl:
        out_dict = pickle.load(fl)

    plot_auc_distribution(out_dict['Fit']['Acc'], args)
    plot_generalization_error(out_dict['Fit']['Acc'], args)
    plot_tuning_profile(out_dict['Tune']['Acc'], args, cdata)
def main():
    """Plot general information about one run of the experiment.

    Command-line arguments:
        expr_source: which TCGA expression data source was used.
        cohort: which TCGA cohort was used.
        samp_cutoff: minimum number of mutated samples needed to test a
            gene.
        model_name: which mutation classifier was tested; the part before
            the first ``'__'`` names the plot sub-directory.

    The experiment output is read from
    ``<base_dir>/<expr_source>__<cohort>__samps-<samp_cutoff>/``; note that
    the ``out-data__*.p.gz`` file is opened with ``bz2`` despite its suffix.
    """
    # The first positional parameter of ArgumentParser is `prog`, so the
    # description has to be given by keyword to show up as help text.
    parser = argparse.ArgumentParser(
        description=("Plots general information about a "
                     "particular run of the experiment.")
    )

    parser.add_argument('expr_source', type=str,
                        help="which TCGA expression data source was used")
    parser.add_argument('cohort', type=str,
                        help="which TCGA cohort was used")
    parser.add_argument(
        'samp_cutoff', type=int,
        help="minimum number of mutated samples needed to test a gene"
    )
    parser.add_argument('model_name', type=str,
                        help="which mutation classifier was tested")

    args = parser.parse_args()
    out_tag = "{}__{}__samps-{}".format(
        args.expr_source, args.cohort, args.samp_cutoff)

    # create the directory where the plots will be saved
    os.makedirs(
        os.path.join(plot_dir, args.expr_source,
                     "{}__samps-{}".format(args.cohort, args.samp_cutoff),
                     args.model_name.split('__')[0]),
        exist_ok=True
    )

    cdata = merge_cohort_data(os.path.join(base_dir, out_tag))
    out_file = os.path.join(base_dir, out_tag,
                            "out-data__{}.p.gz".format(args.model_name))

    with bz2.BZ2File(out_file, 'r') as fl:
        out_dict = pickle.load(fl)

    test_aucs = out_dict['Fit']['test'].AUC

    plot_label_stability(out_dict['Scores'], test_aucs, args, cdata)
    plot_label_correlation(out_dict['Scores'], test_aucs, args, cdata)
    plot_auc_distribution(test_aucs, args)
    plot_acc_quartiles(test_aucs, out_dict['Fit']['test'].AUPR, args, cdata)

    plot_tuning_mtype(out_dict['Params'], test_aucs,
                      out_dict['Clf'], args, cdata)

    # the grid plot is only drawn when more than one parameter was tuned
    if len(out_dict['Clf'].tune_priors) > 1:
        plot_tuning_mtype_grid(out_dict['Params'], test_aucs,
                               out_dict['Clf'], args, cdata)
def main():
    """Plot how well every tested model predicted a cohort's mutations.

    Command-line arguments:
        cohort: which TCGA cohort was used.

    For each (model, expression source) pair found under ``base_dir`` the
    output directory with the lowest sample cutoff is used.
    """
    # The first positional parameter of ArgumentParser is `prog`, so the
    # description has to be given by keyword to show up as help text.
    parser = argparse.ArgumentParser(
        description=(
            "Plots the success of all models tested in predicting the "
            "presence of the mutations in a given cohort."
        )
    )

    # parse command-line arguments, create directory to store the plots
    parser.add_argument('cohort', type=str,
                        help="which TCGA cohort was used")
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)

    # search for experiment output directories corresponding to this cohort
    out_datas = [
        out_file.parts[-2:]
        for out_file in Path(base_dir).glob(
            "*__{}__samps-*/out-data__*.p.gz".format(args.cohort))
    ]

    # get the experiment output directory for each combination of input
    # expression source and algorithm with the lowest sample incidence cutoff
    out_df = pd.DataFrame([
        {'Source': '__'.join(out_data[0].split('__')[:-2]),
         'Samps': int(out_data[0].split('__samps-')[1]),
         'Model': out_data[1].split('out-data__')[1].split('.p')[0]}
        for out_data in out_datas
    ])

    min_samps = out_df.groupby(['Model', 'Source'])['Samps'].min()
    out_use = min_samps.reset_index('Model').set_index('Samps', append=True)

    # load the cohort expression and mutation data for each combination of
    # expression source and sample cutoff
    cdata_dict = {
        (src, ctf): merge_cohort_data(
            os.path.join(base_dir,
                         "{}__{}__samps-{}".format(src, args.cohort, ctf))
        )
        for src, ctf in set(out_use.index)
    }

    # load the experiment output for each combination of source and cutoff;
    # each file is opened in a `with` block so its handle is closed promptly
    out_dict = dict()
    for (src, ctf), mdl in out_use.iterrows():
        mdl_name = mdl.values[0]
        out_file = os.path.join(
            base_dir, "{}__{}__samps-{}".format(src, args.cohort, ctf),
            "out-data__{}.p.gz".format(mdl_name)
        )

        with bz2.BZ2File(out_file, 'r') as fl:
            out_dict[src, mdl_name] = pickle.load(fl)

    # create the plots
    plot_auc_highlights(out_dict.copy(), args, cdata_dict)
    plot_aupr_time(out_dict.copy(), args)
def main():
    """Plot performance and tuning of a copy number score model.

    Command-line arguments:
        expr_source: which TCGA expression data source was used.
        cohort: which TCGA cohort was used.
        samp_cutoff: minimum number of mutated samples needed to test a
            gene.
        model_name: which mutation classifier was tested; the part before
            the first ``'__'`` names the plot sub-directory.

    The experiment output is read from
    ``<base_dir>/<expr_source>__<cohort>__samps-<samp_cutoff>/``; note that
    the ``out-data__*.p.gz`` file is opened with ``bz2`` despite its suffix.
    """
    # The first positional parameter of ArgumentParser is `prog`, so the
    # description has to be given by keyword to show up as help text.
    parser = argparse.ArgumentParser(
        description=(
            "Plots the performance and tuning characteristics of a model "
            "in classifying the copy number scores of the genes in a "
            "given cohort."
        )
    )

    parser.add_argument('expr_source', type=str,
                        help="which TCGA expression data source was used")
    parser.add_argument('cohort', type=str,
                        help="which TCGA cohort was used")
    parser.add_argument(
        'samp_cutoff', type=int,
        help="minimum number of mutated samples needed to test a gene"
    )
    parser.add_argument('model_name', type=str,
                        help="which mutation classifier was tested")

    args = parser.parse_args()
    out_tag = "{}__{}__samps-{}".format(
        args.expr_source, args.cohort, args.samp_cutoff)

    # create the directory where the plots will be saved
    os.makedirs(
        os.path.join(plot_dir, args.expr_source,
                     "{}__samps-{}".format(args.cohort, args.samp_cutoff),
                     args.model_name.split('__')[0]),
        exist_ok=True
    )

    cdata = merge_cohort_data(os.path.join(base_dir, out_tag))
    out_file = os.path.join(base_dir, out_tag,
                            "out-data__{}.p.gz".format(args.model_name))

    with bz2.BZ2File(out_file, 'r') as fl:
        out_dict = pickle.load(fl)

    test_cors = out_dict['Fit']['test'].Cor

    plot_label_stability(out_dict['Scores'], test_cors, args, cdata)
    plot_label_correlation(out_dict['Scores'], test_cors,
                           args, cdata, plot_dir)
    plot_cor_distribution(test_cors, args)

    plot_tuning_gene(out_dict['Params'], test_cors,
                     out_dict['Clf'], args, cdata)

    # the grid plot is only drawn when more than one parameter was tuned
    if len(out_dict['Clf'].tune_priors) > 1:
        plot_tuning_gene_grid(out_dict['Params'], test_cors,
                              out_dict['Clf'], args, cdata)
def main():
    """Plot relationships between outputs of mutation prediction models.

    Command-line arguments:
        expr_source: which TCGA expression data source was used.
        cohort: which TCGA cohort was used.

    For each model found under ``base_dir`` for this source and cohort, the
    output directory with the lowest sample cutoff is used.
    """
    # The first positional parameter of ArgumentParser is `prog`, so the
    # description has to be given by keyword to show up as help text.
    parser = argparse.ArgumentParser(
        description=("Plots the relationships between the "
                     "outputs of mutation prediction models "
                     "tested in a given cohort's dataset.")
    )

    parser.add_argument('expr_source', type=str,
                        help="which TCGA expression data source was used")
    parser.add_argument('cohort', type=str,
                        help="which TCGA cohort was used")

    args = parser.parse_args()
    os.makedirs(os.path.join(plot_dir, args.expr_source), exist_ok=True)

    # find the experiment outputs available for this source and cohort
    out_datas = [
        out_file.parts[-2:]
        for out_file in Path(base_dir).glob(
            "{}__{}__samps-*/out-data__*.p.gz".format(
                args.expr_source, args.cohort)
        )
    ]

    # for each model, the lowest sample cutoff it was run with
    out_use = pd.DataFrame([
        {'Samps': int(out_data[0].split('__samps-')[1]),
         'Model': out_data[1].split('out-data__')[1].split('.p')[0]}
        for out_data in out_datas
    ]).groupby(['Model'])['Samps'].min()

    # iterating a Series yields its values, so this is one entry per cutoff
    cdata_dict = {
        ctf: merge_cohort_data(
            os.path.join(base_dir,
                         "{}__{}__samps-{}".format(
                             args.expr_source, args.cohort, ctf))
        )
        for ctf in set(out_use)
    }

    # Series.iteritems() was removed in pandas 2.0; .items() yields the same
    # (index, value) pairs. Each file is opened in a `with` block so that
    # its handle is closed promptly.
    out_dict = dict()
    for mdl, ctf in out_use.items():
        out_file = os.path.join(
            base_dir,
            "{}__{}__samps-{}".format(args.expr_source, args.cohort, ctf),
            "out-data__{}.p.gz".format(mdl)
        )

        with bz2.BZ2File(out_file, 'r') as fl:
            out_dict[mdl] = pickle.load(fl)

    # create the plots
    plot_model_correlation(out_dict.copy(), args, cdata_dict)
def main():
    """Plot tuning characteristics of a copy number score model.

    Command-line arguments:
        expr_source: which TCGA expression data source was used.
        cohort: which TCGA cohort was used.
        model_name: which mutation classifier was tested; the part before
            the first ``'__'`` names the plot sub-directory.

    The output directory with the lowest sample cutoff available for this
    source, cohort and model is used.

    Raises:
        ValueError: if no matching experiment output exists in ``base_dir``.
    """
    # The first positional parameter of ArgumentParser is `prog`, so the
    # description has to be given by keyword to show up as help text.
    parser = argparse.ArgumentParser(
        description=(
            "Plots the performance and tuning characteristics of a model "
            "in classifying the copy number scores of the genes in a "
            "given cohort."
        )
    )

    parser.add_argument('expr_source', type=str,
                        help="which TCGA expression data source was used")
    parser.add_argument('cohort', type=str,
                        help="which TCGA cohort was used")
    parser.add_argument('model_name', type=str,
                        help="which mutation classifier was tested")

    args = parser.parse_args()

    # create the directory where the plots will be saved
    os.makedirs(
        os.path.join(plot_dir, '__'.join([args.expr_source, args.cohort]),
                     args.model_name.split('__')[0]),
        exist_ok=True
    )

    # sample cutoffs of every output directory holding this model's results
    ctf_list = [
        int(out_file.parts[-2].split('__samps-')[1])
        for out_file in Path(base_dir).glob(
            "{}__{}__samps-*/out-data__{}.p.gz".format(
                args.expr_source, args.cohort, args.model_name)
        )
    ]

    # min() on an empty sequence also raises ValueError, but with a message
    # that does not say what is missing
    if not ctf_list:
        raise ValueError(
            "No experiment output found for source {}, cohort {} and "
            "model {}!".format(args.expr_source, args.cohort,
                               args.model_name)
        )

    use_ctf = min(ctf_list)
    out_tag = "{}__{}__samps-{}".format(
        args.expr_source, args.cohort, use_ctf)
    cdata = merge_cohort_data(os.path.join(base_dir, out_tag))

    out_file = os.path.join(base_dir, out_tag,
                            "out-data__{}.p.gz".format(args.model_name))
    with bz2.BZ2File(out_file, 'r') as fl:
        out_dict = pickle.load(fl)

    plot_generalization_error(out_dict['Fit']['train'].Cor,
                              out_dict['Fit']['test'].Cor, args)
    plot_tuning_distribution(out_dict['Params'],
                             out_dict['Fit']['test'].Cor,
                             out_dict['Clf'], args, cdata)
    plot_tuning_profile(out_dict['Tune']['Acc'],
                        out_dict['Clf'], args, cdata)

    # the grid plot is only drawn for exactly two tuned parameters
    if len(out_dict['Clf'].tune_priors) == 2:
        plot_tuning_profile_grid(out_dict['Tune']['Acc'],
                                 out_dict['Clf'], args, cdata)
def main():
    """Plot label distributions from a copy number score regressor.

    Command-line arguments:
        expr_source: which TCGA expression data source was used.
        cohort: which TCGA cohort was used.
        samp_cutoff: minimum number of mutated samples needed to test a
            gene.
        model_name: which mutation classifier was tested.

    A plot is drawn for every gene whose first-quartile test ``Cor`` value
    is above the 80th percentile of those values across genes.
    """
    # The first positional parameter of ArgumentParser is `prog`, so the
    # description has to be given by keyword to show up as help text.
    parser = argparse.ArgumentParser(
        description=(
            "Plots the distributions of the labels assigned by a copy "
            "number alteration score regressor for a set of genetic "
            "features."
        )
    )

    parser.add_argument('expr_source', type=str,
                        help="which TCGA expression data source was used")
    parser.add_argument('cohort', type=str,
                        help="which TCGA cohort was used")
    parser.add_argument(
        'samp_cutoff', type=int,
        help="minimum number of mutated samples needed to test a gene"
    )
    parser.add_argument('model_name', type=str,
                        help="which mutation classifier was tested")

    args = parser.parse_args()
    out_tag = "{}__{}__samps-{}".format(
        args.expr_source, args.cohort, args.samp_cutoff)

    # create the directory where the plots will be saved; here the full
    # model name is used for the sub-directory
    os.makedirs(
        os.path.join(plot_dir, args.expr_source,
                     "{}__samps-{}".format(args.cohort, args.samp_cutoff),
                     args.model_name),
        exist_ok=True
    )

    cdata = merge_cohort_data(os.path.join(base_dir, out_tag))
    out_file = os.path.join(base_dir, out_tag,
                            "out-data__{}.p.gz".format(args.model_name))

    with bz2.BZ2File(out_file, 'r') as fl:
        out_dict = pickle.load(fl)

    # per-gene first quartile of the test scores, then keep the top fifth
    auc_vals = out_dict['Fit']['test']['Cor'].quantile(q=0.25, axis=1)
    top_genes = auc_vals.index[auc_vals > auc_vals.quantile(q=0.8)]

    for gene in top_genes:
        plot_label_distribution(gene, out_dict['Scores'], args, cdata)
def main():
    """Plot an example of mutation overlap affecting classification.

    Command-line arguments:
        cohort: a TCGA cohort.
        classif: a mutation classifier.

    The output directory with the lowest sample cutoff available for this
    cohort and classifier is used.

    Raises:
        ValueError: if no matching experiment output exists in ``base_dir``.
    """
    # The first positional parameter of ArgumentParser is `prog`, so the
    # description has to be given by keyword to show up as help text.
    parser = argparse.ArgumentParser(
        description=(
            "Plot an example diagram showing how overlap with other types "
            "of mutations can affect a mutation classification task."
        )
    )

    # parse command line arguments, create directory where plots will be saved
    parser.add_argument('cohort', help='a TCGA cohort')
    parser.add_argument('classif', help='a mutation classifier')
    args = parser.parse_args()
    os.makedirs(os.path.join(plot_dir, args.cohort), exist_ok=True)

    # search for experiment output directories corresponding to this cohort
    out_datas = [
        out_file.parts[-2:]
        for out_file in Path(base_dir).glob(
            "{}__samps-*/out-data__{}.p".format(args.cohort, args.classif))
    ]

    # np.argmin on an empty list also raised ValueError, but with a message
    # that does not say what is missing
    if not out_datas:
        raise ValueError(
            "No experiment output found for cohort {} and classifier "
            "{}!".format(args.cohort, args.classif)
        )

    # use the directory with the lowest sample cutoff; like np.argmin,
    # min() returns the first entry in case of ties
    use_dir = min(
        out_datas,
        key=lambda out_data: int(out_data[0].split('__samps-')[1])
    )[0]
    cdata = merge_cohort_data(os.path.join(base_dir, use_dir), use_seed=671)

    # load inferred mutation relationship metrics generated by the experiment
    simil_file = os.path.join(base_dir, use_dir,
                              "out-simil__{}.p".format(args.classif))
    with open(simil_file, 'rb') as f:
        stat_dict, auc_dict, mutex_dict, siml_dict = pickle.load(f)

    # keep the genes marked 'Yes' in at least three of the four gene lists
    gene_df = pd.read_csv(gene_list, sep='\t', skiprows=1, index_col=0)
    list_cols = ['Vogelstein', 'SANGER CGC(05/30/2017)',
                 'FOUNDATION ONE', 'MSK-IMPACT']
    use_genes = gene_df.index[
        (gene_df.loc[:, list_cols] == 'Yes').sum(axis=1) >= 3]

    # find mutation pairs for which the classifier was able to successfully
    # predict the presence of each mutation in isolation from the other
    auc_df = (pd.DataFrame(auc_dict) >= 0.8).all(axis=0)
    use_mtypes = [
        (mtype1, mtype2) for (mtype1, mtype2) in auc_df.index[auc_df]
        if (mtype1.subtype_list()[0][0] in use_genes
            and mtype2.subtype_list()[0][0] in use_genes
            and (mtype1.subtype_list()[0][0]
                 != mtype2.subtype_list()[0][0]))
    ]

    siml_df = pd.DataFrame({
        'Occur': pd.Series(mutex_dict)[use_mtypes],

        'SimilMean': pd.Series({
            mtypes: siml_dict[mtypes].loc['Other'].mean()
            for mtypes in use_mtypes
        }),
        'SimilDiff': pd.Series({
            mtypes: np.abs(siml_dict[mtypes].loc['Other'].diff()[1])
            for mtypes in use_mtypes
        }),

        'SynerMean': pd.Series({
            mtypes: siml_dict[mtypes].loc['Both'].mean()
            for mtypes in use_mtypes
        }),
        'SynerDiff': pd.Series({
            mtypes: np.abs(siml_dict[mtypes].loc['Both'].diff()[1])
            for mtypes in use_mtypes
        }),
    })

    # rank the mutation pairs by two scores built from the similarity metrics
    good_exs = {
        'Conv': (siml_df.Occur * siml_df.SimilMean
                 + siml_df.SimilDiff).sort_values(),
        'Divr': (siml_df.Occur + siml_df.SimilMean
                 - siml_df.SimilDiff).sort_values()
    }

    data_file = os.path.join(base_dir, use_dir,
                             "out-data__{}.p".format(args.classif))
    with open(data_file, 'rb') as f:
        out_infer = pickle.load(f)['Infer'].loc[use_mtypes]

    plot_base_classification(good_exs, stat_dict, out_infer,
                             auc_dict, cdata, args)
def main():
    """Plot a model's performance on cohorts it was not trained on.

    Command-line arguments:
        expr_source: which TCGA expression data source was used.
        cohort: which TCGA cohort was used.
        model_name: which mutation classifier was tested; the part before
            the first ``'__'`` names the plot sub-directory.

    The output directory with the lowest sample cutoff available for this
    source, cohort and model is used. Per-cohort data are read from under
    the directory named by the ``TEMPDIR`` environment variable.

    Raises:
        ValueError: if no matching experiment output exists in ``base_dir``.
        KeyError: if the ``TEMPDIR`` environment variable is not set.
    """
    # The first positional parameter of ArgumentParser is `prog`, so the
    # description has to be given by keyword to show up as help text.
    parser = argparse.ArgumentParser(
        description=(
            "Plots the performance of a model in predicting the presence "
            "of mutations in cohorts other than the one it was trained on."
        )
    )

    parser.add_argument('expr_source', type=str,
                        help="which TCGA expression data source was used")
    parser.add_argument('cohort', type=str,
                        help="which TCGA cohort was used")
    parser.add_argument('model_name', type=str,
                        help="which mutation classifier was tested")

    args = parser.parse_args()

    # create the directory where the plots will be saved
    os.makedirs(
        os.path.join(plot_dir, '__'.join([args.expr_source, args.cohort]),
                     args.model_name.split('__')[0]),
        exist_ok=True
    )

    # sample cutoffs of every output directory holding this model's results
    ctf_list = [
        int(out_file.parts[-2].split('__samps-')[1])
        for out_file in Path(base_dir).glob(
            "{}__{}__samps-*/out-data__{}.p.gz".format(
                args.expr_source, args.cohort, args.model_name)
        )
    ]

    # min() on an empty sequence also raises ValueError, but with a message
    # that does not say what is missing
    if not ctf_list:
        raise ValueError(
            "No experiment output found for source {}, cohort {} and "
            "model {}!".format(args.expr_source, args.cohort,
                               args.model_name)
        )

    use_ctf = min(ctf_list)
    out_tag = "{}__{}__samps-{}".format(
        args.expr_source, args.cohort, use_ctf)
    cdata = merge_cohort_data(os.path.join(base_dir, out_tag))

    out_file = os.path.join(base_dir, out_tag,
                            "out-data__{}.p.gz".format(args.model_name))
    with bz2.BZ2File(out_file, 'r') as fl:
        out_dict = pickle.load(fl)

    # only consider mutations whose first-quartile test AUC is at least 0.7
    auc_vals = out_dict['Fit']['test'].AUC.quantile(q=0.25, axis=1)
    use_mtypes = auc_vals[auc_vals >= 0.7].index
    stat_dict = dict()

    # Iterate over a snapshot: entries of out_dict['Trnsf'] are replaced and
    # deleted inside the loop, and removing a key from a dict while
    # iterating over its live view raises RuntimeError.
    for coh, trnsf_df in list(out_dict['Trnsf'].items()):
        stat_dict[coh] = dict()

        coh_file = os.path.join(os.environ['TEMPDIR'], 'HetMan',
                                'variant_baseline', args.expr_source,
                                'setup', "{}__cohort-data.p".format(coh))
        with open(coh_file, 'rb') as f:
            trnsf_cdata = pickle.load(f)

        # NOTE(review): this is a substring test on the cohort name;
        # confirm that is intended rather than an equality check.
        if coh in args.cohort:
            # flag the transfer samples that were also used for training
            train_samps = cdata.get_train_samples()
            sub_stat = np.array([
                smp in train_samps
                for smp in trnsf_cdata.get_train_samples()
            ])

            if (~sub_stat).any():
                # keep only the samples not used for training
                out_dict['Trnsf'][coh] = trnsf_df.iloc[~sub_stat, :]

                for mtype in use_mtypes:
                    trnsf_stat = np.array(trnsf_cdata.train_pheno(mtype))
                    stat_dict[coh][mtype] = trnsf_stat[~sub_stat]

                    # sanity check: the overlapping samples must carry the
                    # same label counts as the training cohort
                    assert (np.bincount(trnsf_stat[sub_stat])
                            == np.bincount(cdata.train_pheno(mtype))).all(), (
                        "{} cohort used for transfer learning does "
                        "not match the one used for primary "
                        "learning!".format(coh)
                    )

            else:
                # every sample was used for training, nothing left to test
                del out_dict['Trnsf'][coh]

        else:
            for mtype in use_mtypes:
                stat_dict[coh][mtype] = np.array(
                    trnsf_cdata.train_pheno(mtype))

    # Spearman correlations between the columns of the transferred values
    # for mutations present in at least twenty samples of a cohort
    corr_df = pd.DataFrame.from_records({
        coh: {
            mtype: {
                'random': trnsf_vals[mtype].iloc[:, :25].corr(
                    method='spearman'),
                'fivefold': trnsf_vals[mtype].iloc[:, 25:50].corr(
                    method='spearman'),
                'all': trnsf_vals[mtype].corr(method='spearman'),
            }
            for mtype, mut_stat in stat_dict[coh].items()
            if mut_stat.sum() >= 20
        }
        for coh, trnsf_vals in out_dict['Trnsf'].items()
    })

    # AUC per cohort and mutation: share of (mutated, wild-type) value pairs
    # where the mutated value is greater, with ties counted as one half
    auc_df = pd.DataFrame.from_records({
        coh: {
            mtype: (
                np.greater.outer(
                    trnsf_vals[mtype].iloc[mut_stat, :-1],
                    trnsf_vals[mtype].iloc[~mut_stat, :-1]
                ).mean()
                + np.equal.outer(
                    trnsf_vals[mtype].iloc[mut_stat, :-1],
                    trnsf_vals[mtype].iloc[~mut_stat, :-1]
                ).mean() / 2
            )
            for mtype, mut_stat in stat_dict[coh].items()
            if mut_stat.sum() >= 20
        }
        for coh, trnsf_vals in out_dict['Trnsf'].items()
    })

    # drop the columns that are entirely missing, then add a pooled column
    auc_df = auc_df.iloc[:, ~auc_df.isna().all().values]
    auc_df['All'] = -1.

    for mtype in auc_df.index:
        mut_arr = [
            trnsf_vals[mtype].iloc[stat_dict[coh][mtype], :-1]
            for coh, trnsf_vals in out_dict['Trnsf'].items()
        ]
        mut_vals = np.concatenate([vals.values.flatten()
                                   for vals in mut_arr])

        wt_arr = [
            trnsf_vals[mtype].iloc[~stat_dict[coh][mtype], :-1]
            for coh, trnsf_vals in out_dict['Trnsf'].items()
        ]
        wt_vals = np.concatenate([vals.values.flatten()
                                  for vals in wt_arr])

        auc_df.loc[mtype, 'All'] = np.greater.outer(
            mut_vals, wt_vals).mean()
        auc_df.loc[mtype, 'All'] += np.equal.outer(
            mut_vals, wt_vals).mean() / 2

    plot_transfer_aucs(auc_df, auc_vals, stat_dict, args)
    plot_label_stability(corr_df, auc_df, auc_vals, stat_dict, args)
    plot_auc_comparison(out_dict, stat_dict, auc_vals, args)