def get_cohort_data(cohort, use_gene, cv_seed=None, test_prop=0):
    """Load the cohort object holding data for one gene in one cohort.

    Logs into Synapse, then builds a `BeatAmlCohort` when `cohort` is
    'beatAML' and a `MutationCohort` otherwise.

    Args:
        cohort: cohort label; for non-beatAML cohorts the part before the
            first underscore is used as the cohort name and the whole label
            is handed to `parse_subtypes`.
        use_gene: the single gene whose mutations are loaded.
        cv_seed: forwarded unchanged to the cohort constructor.
        test_prop: forwarded unchanged to the cohort constructor.

    Returns:
        The constructed cohort object.
    """
    # log into Synapse, using the module-level cache root
    syn_client = synapseclient.Synapse()
    syn_client.cache.cache_root_dir = syn_root
    syn_client.login()

    # keyword arguments that both cohort constructors receive
    shared_args = dict(mut_genes=[use_gene], syn=syn_client,
                       annot_file=annot_file,
                       cv_seed=cv_seed, test_prop=test_prop)

    if cohort == 'beatAML':
        return BeatAmlCohort(mut_levels=['Form', 'Exon', 'Protein'],
                             expr_file=beatAML_files['expr'],
                             samp_file=beatAML_files['samps'],
                             **shared_args)

    return MutationCohort(cohort=cohort.split('_')[0],
                          mut_levels=['Form_base', 'Protein'],
                          expr_source='Firehose', var_source='mc3',
                          copy_source='Firehose', type_file=type_file,
                          expr_dir=expr_dir, copy_dir=copy_dir,
                          annot_fields=['transcript'],
                          use_types=parse_subtypes(cohort),
                          **shared_args)
def get_cohort_data(expr_source, cohort, samp_cutoff,
                    cv_prop=1.0, cv_seed=None):
    """Load a `MutationCohort` for the genes flagged by all four panels.

    Args:
        expr_source: expression source label, optionally suffixed with
            '__txs'; the part before '__' is passed as the cohort's
            expression source, while the full label indexes `expr_sources`.
        cohort: cohort name passed to `MutationCohort`.
        samp_cutoff: accepted but not used inside this function.
        cv_prop: forwarded unchanged to `MutationCohort`.
        cv_seed: forwarded unchanged to `MutationCohort`.

    Returns:
        The constructed `MutationCohort`.
    """
    # log into Synapse, using the module-level cache root
    syn_client = synapseclient.Synapse()
    syn_client.cache.cache_root_dir = syn_root
    syn_client.login()

    # keep the genes marked 'Yes' in every one of the four panel columns
    gene_df = pd.read_csv(gene_list, sep='\t', skiprows=1, index_col=0)
    panel_cols = ['Vogelstein', 'Sanger CGC', 'Foundation One', 'MSK-IMPACT']
    yes_counts = (gene_df.loc[:, panel_cols] == 'Yes').sum(axis=1)
    use_genes = gene_df.index[yes_counts == 4]

    # transcripts are collapsed unless the label carries a 'txs' suffix
    source_info = expr_source.split('__')
    source_base = source_info[0]
    collapse_txs = len(source_info) <= 1 or source_info[1] != 'txs'

    cohort_args = dict(
        cohort=cohort, mut_genes=use_genes.tolist(),
        mut_levels=['Gene', 'Form_base', 'Exon', 'Protein'],
        expr_source=source_base, var_source='mc3', copy_source='Firehose',
        annot_file=annot_file, expr_dir=expr_sources[expr_source],
        copy_dir=copy_dir, collapse_txs=collapse_txs, syn=syn_client,
        cv_prop=cv_prop, cv_seed=cv_seed,
    )

    return MutationCohort(**cohort_args)
def main():
    """Plot tuning results of a transform for one gene in one cohort.

    Parses the transform, cohort and gene from the command line, loads the
    cohort's Firehose expression data and hands it to `plot_tuning_gene`
    together with a fixed grid of tuning parameters.
    """
    arg_parser = argparse.ArgumentParser()
    for arg_name, arg_opts in (('transform', dict(type=str)),
                               ('cohort', dict(type=str,
                                               help='a cohort in TCGA')),
                               ('gene', dict(type=str))):
        arg_parser.add_argument(arg_name, **arg_opts)

    args = arg_parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn_client = synapseclient.Synapse()
    syn_client.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                       "mgrzad/input-data/synapse")
    syn_client.login()

    cohort_args = dict(cohort=args.cohort, mut_genes=[args.gene],
                       mut_levels=['Gene'], expr_source='Firehose',
                       expr_dir=firehose_dir, cv_prop=1.0, syn=syn_client)
    cdata = MutationCohort(**cohort_args)

    # parameter grid passed to the plotting routine; an alternative grid
    # that was previously used is kept below for reference
    neighbor_grid = ('fit__n_neighbors', (5, 10, 15))
    metric_grid = ('fit__metric', ('euclidean', 'correlation', 'cosine',
                                   'manhattan', 'chebyshev'))
    tune_params = (neighbor_grid, metric_grid, ('lbl', 'base2'))

    # tune_params = (('fit__learning_rate', (50, 200, 750)),
    #                ('fit__perplexity', (5, 15, 30, 40, 50)),
    #                ('lbl', 'base'))

    plot_tuning_gene(cdata, args, tune_params)
def main():
    """Plot CNA classification performance across CNA status cutoffs.

    Parses a cohort, gene and classifier from the command line, loads the
    cohort with continuous copy number data, computes loss and gain AUCs
    from the saved inference output and plots them.
    """
    # the summary text is passed as `description`; given positionally it
    # would be taken as the program name shown in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot the success of classifying a gene's CNA status in a given "
            "cohort using different cutoffs for determining CNA status."
        )
    )

    parser.add_argument('cohort', help='a TCGA cohort')
    parser.add_argument('gene', help='a mutated gene')
    parser.add_argument('classif', help='a mutation classifier')

    # parse command-line arguments, create directory where plots will be saved
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(
        cohort=args.cohort, mut_genes=[args.gene], mut_levels=['Gene'],
        expr_source='Firehose', var_source='mc3', expr_dir=firehose_dir,
        copy_source='Firehose', copy_dir=copy_dir, copy_discrete=False,
        syn=syn, cv_prop=1.0
    )

    # load the inferred scores saved for this cohort, gene and classifier
    infer_dir = os.path.join(base_dir, 'output',
                             args.cohort, args.gene, args.classif)
    loss_df, gain_df = get_aucs(load_infer_output(infer_dir), args, cdata)

    plot_cutoff_aucs(loss_df, gain_df, args, cdata)
def get_cohort_data(syn, expr_source, cohort, samp_cutoff,
                    cv_prop=1.0, cv_seed=None):
    """Load a `MutationCohort` for the genes flagged by all four panels.

    Args:
        syn: Synapse client object forwarded to `MutationCohort`.
        expr_source: expression source label; also the key used to look up
            the expression directory in `expr_sources`.
        cohort: cohort name passed to `MutationCohort`.
        samp_cutoff: accepted but not used inside this function.
        cv_prop: forwarded unchanged to `MutationCohort`.
        cv_seed: forwarded unchanged to `MutationCohort`.

    Returns:
        The constructed `MutationCohort`.
    """
    gene_df = pd.read_csv(gene_list, sep='\t', skiprows=1, index_col=0)

    # keep the genes marked 'Yes' in every one of the panel columns
    panel_cols = ['Vogelstein', 'Sanger CGC', 'Foundation One', 'MSK-IMPACT']
    in_all_panels = (gene_df.loc[:, panel_cols] == 'Yes').all(axis=1)
    use_genes = gene_df.index[in_all_panels]

    cohort_args = dict(
        cohort=cohort, mut_genes=use_genes.tolist(),
        mut_levels=['Gene', 'Form_base', 'Protein'],
        expr_source=expr_source, var_source='mc3', copy_source='Firehose',
        annot_file=annot_file, expr_dir=expr_sources[expr_source],
        copy_dir=copy_dir, syn=syn, cv_prop=cv_prop, cv_seed=cv_seed,
    )

    return MutationCohort(**cohort_args)
def main():
    """Plot the ordering and clustering of subtypes of a module of genes.

    Parses the experiment coordinates from the command line, loads the
    matching cohort and saved inference output, ranks the subtypes by
    their similarity scores and produces three plots.
    """
    # the summary text is passed as `description`; given positionally it
    # would be taken as the program name shown in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot the ordering of the subtypes of a module of genes in a "
            "given cohort based on how their isolated expression signatures "
            "classify one another."
        )
    )

    parser.add_argument('cohort', help='a TCGA cohort')
    parser.add_argument('classif', help='a mutation classifier')
    parser.add_argument('mut_levels', type=str,
                        help='a set of mutation annotation levels')
    parser.add_argument('genes', type=str, nargs='+',
                        help='a list of mutated genes')
    parser.add_argument('--samp_cutoff', type=int, default=20)

    # parse command-line arguments, create directory where plots will be saved
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = syn_root
    syn.login()

    cdata = MutationCohort(cohort=args.cohort, mut_genes=args.genes,
                           mut_levels=['Gene'] + args.mut_levels.split('__'),
                           expr_source='Firehose', expr_dir=expr_dir,
                           var_source='mc3', copy_source='Firehose',
                           domain_dir=domain_dir, annot_file=annot_file,
                           syn=syn, cv_prop=1.0)

    infer_dir = os.path.join(base_dir, 'output', args.cohort,
                             '_'.join(sorted(args.genes)), args.classif,
                             'samps_{}'.format(args.samp_cutoff),
                             args.mut_levels)
    pheno_dict, auc_list, simil_df = compare_scores(
        load_infer_output(infer_dir), cdata)

    # rank each subtype by the difference between its row and column means
    simil_rank = simil_df.mean(axis=1) - simil_df.mean(axis=0)

    # `Series.items` replaces `Series.iteritems`, which was removed in
    # pandas 2.0; entries are sorted first by a key taken from the first
    # element of each index label and then by rank value
    simil_order = [
        mtypes for mtypes, _ in sorted(
            simil_rank.items(),
            key=lambda k: (k[0][0].subtype_list()[0][0], k[1])
        )
    ]
    simil_df = simil_df.loc[simil_order, simil_order[::-1]]

    # each plotting routine gets its own copies of the inputs
    plot_singleton_ordering(simil_df.copy(), auc_list.copy(),
                            pheno_dict.copy(), args)
    plot_singleton_clustering(simil_df.copy(), auc_list.copy(),
                              pheno_dict.copy(), args)
    plot_all_clustering(simil_df.copy(), auc_list.copy(), args)
def get_cohorts(expr_source, cohorts, mut_levels, cv_prop=1.0, cv_seed=9078):
    """Load each cohort on its own as well as one concatenated cohort.

    Args:
        expr_source: expression source label, optionally suffixed with
            '__txs'; the part before '__' is passed as the cohorts'
            expression source, while the full label indexes `expr_sources`.
        cohorts: cohort labels; the part of each label before the first
            underscore is the cohort name, and the whole label is handed to
            `parse_subtypes`.
        mut_levels: list of annotation levels appended after 'Gene'.
        cv_prop: forwarded unchanged to the cohort constructors.
        cv_seed: forwarded unchanged to the cohort constructors.

    Returns:
        A tuple of the `MutationConcatCohort` and a dictionary mapping each
        cohort label to its own `MutationCohort`.
    """
    # log into Synapse, using the module-level cache root
    syn_client = synapseclient.Synapse()
    syn_client.cache.cache_root_dir = syn_root
    syn_client.login()

    # keep the genes marked 'Yes' in at least one of the panel columns
    gene_df = pd.read_csv(gene_list, sep='\t', skiprows=1, index_col=0)
    panel_cols = ['Vogelstein', 'SANGER CGC(05/30/2017)',
                  'FOUNDATION ONE', 'MSK-IMPACT']
    panel_hits = (gene_df.loc[:, panel_cols] == 'Yes').sum(axis=1)
    use_genes = gene_df.index[panel_hits >= 1]

    # transcripts are collapsed unless the label carries a 'txs' suffix
    source_info = expr_source.split('__')
    source_base = source_info[0]
    collapse_txs = len(source_info) <= 1 or source_info[1] != 'txs'

    def common_args():
        # built afresh for every constructor call so that no list objects
        # are shared between the cohorts
        return dict(
            mut_genes=use_genes.tolist(), mut_levels=['Gene'] + mut_levels,
            expr_source=source_base, var_source='mc3',
            copy_source='Firehose', annot_file=annot_file,
            type_file=type_file, expr_dir=expr_sources[expr_source],
            copy_dir=copy_dir, collapse_txs=collapse_txs, syn=syn_client,
            cv_prop=cv_prop, cv_seed=cv_seed, annot_fields=['transcript'],
        )

    cohorts_base = {cohort: cohort.split('_')[0] for cohort in cohorts}

    cdata_dict = dict()
    for cohort in cohorts:
        cdata_dict[cohort] = MutationCohort(
            cohort=cohorts_base[cohort],
            use_types=parse_subtypes(cohort), **common_args()
        )

    concat_types = {cohorts_base[cohort]: parse_subtypes(cohort)
                    for cohort in cohorts}
    cdata = MutationConcatCohort(cohorts=list(cohorts_base.values()),
                                 use_types=concat_types, **common_args())

    return cdata, cdata_dict
def main():
    """Plot perturbation score distributions by mutation subtype.

    Parses the Stan model and classification task from the command line,
    loads the inferred scores and the cohort, and then draws violin and
    stability plots for every non-empty combination of the given mutation
    annotation levels.
    """
    # the summary text is passed as `description`; given positionally it
    # would be taken as the program name shown in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot the distributions of perturbation scores separated by "
            "mutation subtype status as inferred by a Stan mutation "
            "classifier trained on a gene in a given TCGA cohort."
        )
    )

    # positional command-line arguments regarding the Stan model used to
    # obtain the sample mutation scores
    parser.add_argument('model_name', type=str, help="label of a Stan model")
    parser.add_argument('solve_method', type=str,
                        help=("method used to obtain estimates for the "
                              "parameters of the model"))

    # positional command line arguments regarding the samples and the mutation
    # classification task on which the model was trained
    parser.add_argument('cohort', type=str, help="a TCGA cohort")
    parser.add_argument('gene', type=str, help="a mutated gene")
    parser.add_argument('mut_levels', nargs='*',
                        default=['Form_base', 'Exon'],
                        help="which mutation annotation levels to consider")

    # parse command line arguments, ensure directory where plots will be saved
    # exists, load inferred mutation scores from each cross-validation run
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)
    infer_mat = load_output(args.model_name, args.solve_method,
                            args.cohort, args.gene)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ('/home/exacloud/lustre1/CompBio'
                                '/mgrzad/input-data/synapse')
    syn.login()

    cdata = MutationCohort(
        cohort=args.cohort, mut_genes=[args.gene], mut_levels=args.mut_levels,
        expr_source='Firehose', expr_dir=firehose_dir, var_source='mc3',
        syn=syn, cv_prop=1.0
    )

    # enumerate every non-empty subset of the annotation levels, smallest
    # subsets first
    level_combs = chain.from_iterable(
        combinations(args.mut_levels, r)
        for r in range(1, len(args.mut_levels) + 1)
    )

    for use_levels in level_combs:
        plot_subtype_violins(infer_mat, args, cdata, use_levels)
        plot_subtype_stability(infer_mat, args, cdata, use_levels)
def main():
    """Plot the ordering of subtypes based on cross-classification scores.

    Parses the experiment coordinates from the command line, loads the
    matching cohort and saved inference output, orders the subtypes by
    their similarity rank and produces two ordering plots.
    """
    # the summary text is passed as `description`; given positionally it
    # would be taken as the program name shown in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot the ordering of a gene's subtypes in a given cohort based "
            "on how their isolated expression signatures classify one "
            "another."
        )
    )

    parser.add_argument('cohort', help='a TCGA cohort')
    parser.add_argument('classif', help='a mutation classifier')
    parser.add_argument('mut_levels', type=str,
                        help='a set of mutation annotation levels')
    parser.add_argument('genes', type=str, nargs='+',
                        help='a list of mutated genes')
    parser.add_argument('--samp_cutoff', type=int, default=25)

    # parse command-line arguments, create directory where plots will be saved
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(cohort=args.cohort, mut_genes=args.genes,
                           mut_levels=['Gene'] + args.mut_levels.split('__'),
                           expr_source='Firehose', expr_dir=firehose_dir,
                           syn=syn, cv_prop=1.0)

    infer_dir = os.path.join(base_dir, 'output', args.cohort,
                             '_'.join(sorted(args.genes)), args.classif,
                             'samps_{}'.format(args.samp_cutoff),
                             args.mut_levels)
    simil_df, auc_list = get_similarities(load_infer_output(infer_dir),
                                          args.genes, cdata)
    print(simil_df.shape)

    # rank each subtype by the difference between its row and column means
    simil_rank = simil_df.mean(axis=1) - simil_df.mean(axis=0)
    simil_order = simil_rank.sort_values().index

    # a reversed Index is used for the columns rather than a one-shot
    # `reversed()` iterator, so `.loc` receives a concrete list-like
    simil_df = simil_df.loc[simil_order, simil_order[::-1]]

    plot_singleton_ordering(simil_df.copy(), auc_list.copy(), args, cdata)
    plot_all_ordering(simil_df.copy(), auc_list.copy(), args, cdata)
def get_cohort_data(cohort, expr_source, cv_seed=None):
    """Load the cohort object for the genes flagged by several panels.

    Args:
        cohort: cohort label; 'beatAML' selects a `BeatAmlCohort`, any other
            label yields a `MutationCohort` whose name is the part of the
            label before the first underscore.
        expr_source: expression source label, optionally suffixed with
            '__txs'; must be 'toil__gns' for the beatAML cohort.
        cv_seed: forwarded unchanged to the cohort constructor.

    Returns:
        The constructed cohort object.

    Raises:
        ValueError: if `cohort` is 'beatAML' and `expr_source` is anything
            other than 'toil__gns'.
    """
    # log into Synapse, using the module-level cache root
    syn_client = synapseclient.Synapse()
    syn_client.cache.cache_root_dir = syn_root
    syn_client.login()

    # keep the genes marked 'Yes' in more than one of the panel columns
    gene_df = pd.read_csv(gene_list, sep='\t', skiprows=1, index_col=0)
    panel_cols = ['Vogelstein', 'SANGER CGC(05/30/2017)',
                  'FOUNDATION ONE', 'MSK-IMPACT']
    panel_hits = (gene_df.loc[:, panel_cols] == 'Yes').sum(axis=1)
    use_genes = gene_df.index[panel_hits > 1]

    # keyword arguments that both cohort constructors receive
    shared_args = dict(mut_levels=['Gene', 'Form_base', 'Protein'],
                       mut_genes=use_genes.tolist(), syn=syn_client,
                       annot_file=annot_file, cv_seed=cv_seed, test_prop=0)

    if cohort == 'beatAML':
        if expr_source != 'toil__gns':
            raise ValueError("Only gene-level Kallisto calls are available "
                             "for the beatAML cohort!")

        return BeatAmlCohort(expr_source=expr_source,
                             expr_file=beatAML_files['expr'],
                             samp_file=beatAML_files['samps'],
                             **shared_args)

    # transcripts are collapsed unless the label carries a 'txs' suffix
    source_info = expr_source.split('__')
    source_base = source_info[0]
    collapse_txs = len(source_info) <= 1 or source_info[1] != 'txs'

    # NOTE(review): `expr_sources` is indexed by the base label here, not by
    # the full `expr_source` label -- confirm this matches the keys in use
    return MutationCohort(cohort=cohort.split('_')[0],
                          expr_source=source_base, var_source='mc3',
                          copy_source='Firehose', type_file=type_file,
                          expr_dir=expr_sources[source_base],
                          copy_dir=copy_dir, collapse_txs=collapse_txs,
                          annot_fields=['transcript'],
                          use_types=parse_subtypes(cohort),
                          **shared_args)
def get_cohort_data(expr_source, syn_root, cohort, samp_cutoff):
    """Load a gene-level `MutationCohort` for an expression source.

    Args:
        expr_source: expression source label; used both as the cohort's
            expression source and as the row label looked up in the
            'expr_sources.txt' table under `base_dir`.
        syn_root: directory used as the Synapse cache root.
        cohort: cohort name passed to `MutationCohort`.
        samp_cutoff: forwarded unchanged to `MutationCohort`.

    Returns:
        The constructed `MutationCohort`.

    Raises:
        KeyError: if `expr_source` is not a row label of the source table.
    """
    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = syn_root
    syn.login()

    # the table is read straight from its path so that pandas opens and
    # closes the file itself; previously an `open()` handle was passed in
    # and never closed
    source_table = pd.read_csv(os.path.join(base_dir, 'expr_sources.txt'),
                               sep='\t', header=None, index_col=0)

    # first column of the row labelled with this expression source
    expr_dir = source_table.loc[expr_source].iloc[0]

    cdata = MutationCohort(
        cohort=cohort, mut_genes=None, mut_levels=['Gene'], cv_prop=1.0,
        expr_source=expr_source, expr_dir=expr_dir, var_source='mc3',
        syn=syn, samp_cutoff=samp_cutoff
    )

    return cdata
def main():
    """Plot the label distributions of mutation subtypes.

    Parses the Stan model and classification task from the command line,
    loads the inferred scores and the cohort, and then draws an expression
    plot for every non-empty combination of the given annotation levels.
    """
    # the summary text is passed as `description`; given positionally it
    # would be taken as the program name shown in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot the distribution of labels by mutation subtype returned "
            "by a Stan classifier trained to predict all the mutations for "
            "a given gene in a TCGA cohort."
        )
    )

    parser.add_argument('model_name', type=str, help="label of a Stan model")
    parser.add_argument('solve_method', type=str,
                        help=("method used to obtain estimates for the "
                              "parameters of the model"))
    parser.add_argument('cohort', type=str, help="a TCGA cohort")
    parser.add_argument('gene', type=str, help="a mutated gene")
    parser.add_argument('mut_levels', nargs='*',
                        default=['Form_base', 'Exon'],
                        help="which mutation annotation levels to consider")

    # parse command line arguments, ensure the plot directory exists and
    # load the inferred mutation scores
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)
    infer_mat = load_output(args.model_name, args.solve_method,
                            args.cohort, args.gene)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ('/home/exacloud/lustre1/CompBio'
                                '/mgrzad/input-data/synapse')
    syn.login()

    cdata = MutationCohort(cohort=args.cohort, mut_genes=[args.gene],
                           mut_levels=args.mut_levels,
                           expr_source='Firehose', expr_dir=firehose_dir,
                           var_source='mc3', syn=syn, cv_prop=1.0)

    # enumerate every non-empty subset of the annotation levels, smallest
    # subsets first
    level_combs = chain.from_iterable(
        combinations(args.mut_levels, r)
        for r in range(1, len(args.mut_levels) + 1)
    )

    for use_levels in level_combs:
        plot_subtype_expression(infer_mat, args, cdata, use_levels)
def get_cohort_data(cohort):
    """Load the cohort object with all six mutation annotation levels.

    Args:
        cohort: cohort label; 'beatAML' selects a `BeatAmlCohort`, any other
            label yields a `MutationCohort` whose name is the part of the
            label before the first underscore.

    Returns:
        The constructed cohort object.
    """
    # log into Synapse, using the module-level cache root
    syn_client = synapseclient.Synapse()
    syn_client.cache.cache_root_dir = syn_root
    syn_client.login()

    # keep the genes marked 'Yes' in at least one of the panel columns
    gene_df = pd.read_csv(gene_list, sep='\t', skiprows=1, index_col=0)
    panel_cols = ['Vogelstein', 'SANGER CGC(05/30/2017)',
                  'FOUNDATION ONE', 'MSK-IMPACT']
    panel_hits = (gene_df.loc[:, panel_cols] == 'Yes').sum(axis=1)
    use_genes = gene_df.index[panel_hits >= 1]

    # keyword arguments that both cohort constructors receive
    shared_args = dict(
        mut_levels=['Gene', 'Form_base', 'Form',
                    'Exon', 'Location', 'Protein'],
        mut_genes=use_genes.tolist(), syn=syn_client,
        annot_file=annot_file, cv_seed=671, test_prop=0,
    )

    if cohort == 'beatAML':
        return BeatAmlCohort(expr_source='toil__gns',
                             expr_file=beatAML_files['expr'],
                             samp_file=beatAML_files['samps'],
                             **shared_args)

    return MutationCohort(cohort=cohort.split('_')[0],
                          expr_source='Firehose', var_source='mc3',
                          copy_source='Firehose', type_file=type_file,
                          expr_dir=expr_dir, copy_dir=copy_dir,
                          annot_fields=['transcript'],
                          use_types=parse_subtypes(cohort),
                          **shared_args)
def main():
    """Plot sample positions inferred for pairs of mutation subtypes.

    Parses the experiment coordinates from the command line, loads the
    saved multi-task inference output and the cohort, and draws one plot
    for every row of the output table.
    """
    # the summary text is passed as `description`; given positionally it
    # would be taken as the program name shown in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot the positions predicted for each sample in a given cohort "
            "by a multi-task model trained on pairs of mutation subtypes of "
            "a gene in two-dimensional inferred label space."
        )
    )

    parser.add_argument('cohort', help='a TCGA cohort')
    parser.add_argument('gene', help='a mutated gene')
    parser.add_argument('mut_levels',
                        help='a set of mutation annotation levels')
    parser.add_argument('model_name', help='a Stan multi-task learning model')
    parser.add_argument('solve_method',
                        choices=['optim', 'variat', 'sampl'],
                        help='method used to obtain Stan parameter estimates')

    # parse command-line arguments, create directory where plots will be saved
    args = parser.parse_args()
    os.makedirs(
        os.path.join(plot_dir, args.cohort, args.gene, args.mut_levels),
        exist_ok=True
    )

    multi_df = load_infer_output(os.path.join(
        base_dir, 'output', args.cohort, args.gene, args.mut_levels,
        args.model_name, args.solve_method
    ))

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(cohort=args.cohort, mut_genes=[args.gene],
                           mut_levels=['Gene'] + args.mut_levels.split('__'),
                           expr_source='Firehose', expr_dir=firehose_dir,
                           syn=syn, cv_prop=1.0)

    # each row label of the output table unpacks into a pair of subtypes
    for (mtype1, mtype2), infer_vals in multi_df.iterrows():
        plot_position(infer_vals, args, cdata, mtype1, mtype2)
def main():
    """Plot the inferred positions of samples for singleton subtypes.

    Parses the experiment coordinates from the command line, loads the
    saved inference output (averaging the values stored in each cell) and
    the cohort, and draws one plot for every subtype in the output whose
    `subkeys()` has exactly one entry.
    """
    parser = argparse.ArgumentParser(
        description='Plot experiment results for given mutation classifier.')

    parser.add_argument('cohort', help='a TCGA cohort')
    parser.add_argument('gene', help='a mutated gene')
    parser.add_argument('classif', help='a mutation classifier')

    # NOTE(review): without `nargs='?'` a positional argument is always
    # required, so this default never takes effect
    parser.add_argument('mut_levels', default='Form_base__Exon')

    # parsed as an integer so the value can be used as a sample count as
    # well as in the output directory name
    parser.add_argument('--samp_cutoff', type=int, default=20)

    # parse command-line arguments, create directory where plots will be saved
    args = parser.parse_args()
    os.makedirs(os.path.join(plot_dir, args.cohort, args.gene),
                exist_ok=True)

    infer_dir = os.path.join(base_dir, 'output',
                             args.cohort, args.gene, args.classif,
                             'samps_{}'.format(args.samp_cutoff),
                             args.mut_levels)
    prob_df = load_infer_output(infer_dir).applymap(np.mean)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # the cohort is filtered with the same cutoff that selects the output
    # directory; this was previously hard-coded to 20, which ignored the
    # `--samp_cutoff` option
    cdata = MutationCohort(cohort=args.cohort, mut_genes=None,
                           samp_cutoff=args.samp_cutoff,
                           mut_levels=['Gene'] + args.mut_levels.split('__'),
                           expr_source='Firehose', expr_dir=firehose_dir,
                           syn=syn, cv_prop=1.0)

    singl_mtypes = [mtype for mtype in prob_df.index
                    if len(mtype.subkeys()) == 1]

    for singl_mtype in singl_mtypes:
        plot_mtype_positions(prob_df.loc[singl_mtype, :], args, cdata)
def main():
    """Plot inferred CNA scores against actual CNA scores.

    Parses a cohort, gene and classifier from the command line, loads the
    cohort with continuous copy number data and the saved inference
    output, and plots the scores for three rows of the output selected by
    the AUC tables returned by `get_aucs`.
    """
    # the summary text is passed as `description`; given positionally it
    # would be taken as the program name shown in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot the inferred CNA scores for a cohort's samples against "
            "their actual CNA scores for a given set of cutoffs."
        )
    )

    parser.add_argument('cohort', help='a TCGA cohort')
    parser.add_argument('gene', help='a mutated gene')
    parser.add_argument('classif', help='a mutation classifier')

    # parse command-line arguments, create directory where plots will be saved
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(cohort=args.cohort, mut_genes=[args.gene],
                           mut_levels=['Gene'], expr_source='Firehose',
                           var_source='mc3', expr_dir=firehose_dir,
                           copy_source='Firehose', copy_dir=copy_dir,
                           copy_discrete=False, syn=syn, cv_prop=1.0)

    iso_df = load_infer_output(
        os.path.join(base_dir, 'output',
                     args.cohort, args.gene, args.classif)
    )
    loss_df, gain_df = get_aucs(iso_df, args, cdata)

    # rows with the highest 'CNA' value among the losses and the gains, and
    # the loss row where 'CNA' exceeds 'Mut' by the largest margin
    plot_labels = (
        loss_df['CNA'].idxmax(),
        gain_df['CNA'].idxmax(),
        (loss_df['CNA'] - loss_df['Mut']).idxmax(),
    )

    for plot_label in plot_labels:
        plot_cna_scores(iso_df.loc[plot_label, :], args, cdata)
def main():
    """Plot the gene weight coefficients inferred by a Stan classifier.

    Parses the Stan model and classification task from the command line,
    loads the saved model variables and the cohort, and plots the inferred
    gene weights.

    Raises:
        ValueError: if the loaded variables do not include 'gn_wghts'.
    """
    # the summary text is passed as `description`; given positionally it
    # would be taken as the program name shown in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot the distributions of gene weight coefficients inferred by "
            "a given Stan classifier trained to predict the mutation status "
            "of a gene in a given TCGA cohort."
        )
    )

    parser.add_argument('model_name', type=str, help="label of a Stan model")
    parser.add_argument('solve_method', type=str,
                        help=("method used to obtain estimates for the "
                              "parameters of the model"))
    parser.add_argument('cohort', type=str, help="a TCGA cohort")
    parser.add_argument('gene', type=str, help="a mutated gene")

    # parse command line arguments, ensure the plot directory exists and
    # load the variables saved for the model
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)
    vars_dict = load_vars(args.model_name, args.solve_method,
                          args.cohort, args.gene)

    if 'gn_wghts' not in vars_dict:
        raise ValueError("Can only plot inferred gene weights for a model "
                         "that includes them as variables!")

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ('/home/exacloud/lustre1/CompBio'
                                '/mgrzad/input-data/synapse')
    syn.login()

    cdata = MutationCohort(
        cohort=args.cohort, mut_genes=[args.gene], mut_levels=['Gene'],
        expr_source='Firehose', expr_dir=firehose_dir, var_source='mc3',
        syn=syn, cv_prop=1.0
    )

    # weights are labelled with the cohort's genes in sorted order, leaving
    # out the gene whose mutations were predicted
    wghts_df = pd.DataFrame(vars_dict['gn_wghts'],
                            index=sorted(cdata.genes - {args.gene}))

    plot_weights_cov(wghts_df, args, cdata)
def main():
    """Plot an unsupervised clustering with gene subtypes highlighted.

    Parses the cohort, transform and mutation levels from the command
    line, loads the cohort, fits the named transform to it and draws one
    clustering plot per requested gene.
    """
    # the summary text is passed as `description`; given positionally it
    # would be taken as the program name shown in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plots the clustering done by an unsupervised learning method "
            "on a TCGA cohort with subtypes of particular genes highlighted."
        )
    )

    parser.add_argument('cohort', type=str, help='a cohort in TCGA')
    parser.add_argument('transform', type=str,
                        help='an unsupervised learning method')
    parser.add_argument('mut_levels', type=str,
                        help='a set of mutation annotation levels')
    parser.add_argument('--genes', type=str, nargs='+', default=['TP53'],
                        help='a list of mutated genes')

    # parse command-line arguments, create directory where plots will be saved
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(cohort=args.cohort, mut_genes=args.genes,
                           mut_levels=['Gene'] + args.mut_levels.split('__'),
                           expr_source='Firehose', expr_dir=firehose_dir,
                           cv_prop=1.0, syn=syn)

    # NOTE(review): `eval` executes whatever expression is given on the
    # command line; this is only acceptable while the script is run by
    # trusted users -- consider resolving the name through an explicit
    # lookup table of the supported transforms instead
    mut_trans = eval(args.transform)()
    trans_expr = mut_trans.fit_transform_coh(cdata)

    # each plot gets its own copy of the transformed expression data
    for gene in args.genes:
        plot_subtype_clustering(trans_expr.copy(), args, cdata, gene)
def main():
    """Plot how well expression signatures separate isolated subtypes.

    Parses the experiment coordinates from the command line, loads the
    cohort and the saved inference output, computes the separation
    statistics with `get_separation` and plots them.
    """
    # the summary text is passed as `description`; given positionally it
    # would be taken as the program name shown in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot how well expression signatures separate isolated mutation "
            "subtypes from non-mutated samples relative to how they "
            "separate mutated samples not belonging to the subtype."
        )
    )

    parser.add_argument('cohort', help='a TCGA cohort')
    parser.add_argument('gene', help='a mutated gene')
    parser.add_argument('classif', help='a mutation classifier')

    # NOTE(review): without `nargs='?'` a positional argument is always
    # required, so this default never takes effect
    parser.add_argument('mut_levels', default='Form_base__Exon',
                        help='a set of mutation annotation levels')
    parser.add_argument('--samp_cutoff', type=int, default=20)

    # parse command-line arguments, create directory where plots will be saved
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = syn_root
    syn.login()

    cdata = MutationCohort(cohort=args.cohort, mut_genes=[args.gene],
                           mut_levels=args.mut_levels.split('__'),
                           expr_source='Firehose', expr_dir=firehose_dir,
                           syn=syn, cv_prop=1.0)

    infer_dir = os.path.join(base_dir, 'output',
                             args.cohort, args.gene, args.classif,
                             'samps_{}'.format(args.samp_cutoff),
                             args.mut_levels)
    infer_df = load_infer_output(infer_dir)

    auc_vals, sep_vals, prop_vals = get_separation(infer_df, args, cdata)
    plot_separation(auc_vals, sep_vals, prop_vals, args, cdata)
def _drop_duplicate_mtypes(mtype_sampsets, verbose=False):
    """Keep one sub-type for every distinct set of mutated samples.

    Sub-types are visited in sorted order, so when several of them share
    the same sample set the one that sorts first is the one kept.

    Args:
        mtype_sampsets: dictionary mapping each sub-type to the frozenset
            of its mutated samples.
        verbose: whether to print diagnostic messages.

    Returns:
        The set of sub-types kept.
    """
    use_mtypes = set()
    use_sampsets = set()

    sub_mtypes = sorted(mtype_sampsets)
    if verbose:
        print("Found {} new sub-types!\n".format(len(sub_mtypes)))

    for i, mtype in enumerate(sub_mtypes):
        if verbose and (i % 200) == 100:
            print("\nchecked {} sub-types\n".format(i))

        # a sub-type is dropped when its set of mutated samples is
        # identical to that of a sub-type that was already kept
        if mtype_sampsets[mtype] in use_sampsets:
            if verbose:
                print("Removing functionally duplicate MuType {}".format(
                    mtype))

        else:
            use_mtypes.add(mtype)
            use_sampsets.add(mtype_sampsets[mtype])

    return use_mtypes


def main():
    """Enumerate the mutation sub-types to test in a cohort.

    Parses the cohort, frequency cutoff and annotation levels from the
    command line, loads the cohort, enumerates sub-types that are neither
    too rare nor too common, removes those with duplicate sample sets and
    saves the result under the 'setup' directory for the cohort.
    """
    parser = argparse.ArgumentParser(
        description='Set up touring for sub-types to detect.'
    )
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")

    # optional command line arguments controlling the thresholds for which
    # individual mutations and how many genes' mutations are considered
    parser.add_argument('--freq_cutoff', type=float, default=0.02,
                        help='subtype sample frequency threshold')

    # optional command line arguments for what kinds of mutation sub-types to
    # look for in terms of properties and number of mutations to combine
    parser.add_argument('--mut_levels', type=str, default='Gene',
                        help='the mutation property levels to consider')

    # optional command line argument controlling verbosity
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse the command line arguments, get the directory where found
    # sub-types will be saved for future use
    args = parser.parse_args()
    out_path = os.path.join(base_dir, 'setup', args.cohort)
    os.makedirs(out_path, exist_ok=True)
    use_lvls = args.mut_levels.split('__')

    # log into Synapse using locally-stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(
        cohort=args.cohort, mut_genes=None, mut_levels=use_lvls,
        expr_source='Firehose', var_source='mc3', expr_dir=firehose_dir,
        cv_prop=1.0, samp_cutoff=args.freq_cutoff, syn=syn
    )

    if args.verbose:
        print("Found {} candidate genes with mutations in at least "
              "{:.1f}% of the samples in TCGA cohort {}.\nLooking for "
              "subtypes of these genes that are combinations of up to two "
              "mutations at annotation levels {} ...\n".format(
                  len(tuple(cdata.train_mut)), args.freq_cutoff * 100,
                  args.cohort, use_lvls
              ))

    # smallest number of samples a sub-type may have, which is also the
    # smallest number of samples that must remain outside of it
    min_samps = args.freq_cutoff * len(cdata.samples)
    max_samps = len(cdata.samples) - min_samps

    if use_lvls == ['Gene']:
        use_mtypes = {MuType({('Gene', gn): None})
                      for gn, mut in cdata.train_mut
                      if len(mut) >= min_samps}

    else:
        if use_lvls[0] == 'Gene':
            use_lvls = use_lvls[1:]

            # enumerate the sub-types of each gene separately
            cur_mtypes = set()
            for gn, mut in cdata.train_mut:
                cur_mtypes |= {
                    MuType({('Gene', gn): mtype})
                    for mtype in mut.combtypes(comb_sizes=(1, 2),
                                               sub_levels=use_lvls,
                                               min_type_size=min_samps)
                }

        else:
            cur_mtypes = cdata.train_mut.combtypes(comb_sizes=(1, 2),
                                                   sub_levels=use_lvls,
                                                   min_type_size=min_samps)

        # finds the samples belonging to each enumerated sub-type
        cur_sampsets = {mtype: frozenset(mtype.get_samples(cdata.train_mut))
                        for mtype in cur_mtypes}

        # removes the sub-types with so many mutated samples that there
        # are not enough negatively-labelled samples for classification
        mtype_sampsets = {mtype: sampset
                          for mtype, sampset in cur_sampsets.items()
                          if len(sampset) <= max_samps}

        # NOTE(review): the source this was recovered from had lost its
        # indentation; the de-duplication pass is assumed to run once over
        # the sub-types of all genes rather than once per gene -- confirm
        # against the intended behaviour
        use_mtypes = _drop_duplicate_mtypes(mtype_sampsets, args.verbose)

    if args.verbose:
        print("\nFound {} total sub-types!".format(len(use_mtypes)))

    # save the list of found non-duplicate sub-types to file; the files
    # are opened in `with` blocks so that they are closed once written
    list_file = os.path.join(
        out_path, 'mtype_list__freq_{}__levels_{}.p'.format(
            args.freq_cutoff, args.mut_levels)
    )
    with open(list_file, 'wb') as fl:
        pickle.dump(sorted(use_mtypes), fl)

    with open(os.path.join(out_path, 'cohort_info.p'), 'wb') as fl:
        pickle.dump({'Samps': cdata.samples}, fl)

    count_file = os.path.join(
        out_path, 'mtype_count__freq_{}__levels_{}.txt'.format(
            args.freq_cutoff, args.mut_levels)
    )
    with open(count_file, 'w') as fl:
        fl.write(str(len(use_mtypes)))
def main():
    """Enumerate the subtypes of a gene to test in an isolation experiment.

    Parses the cohort, gene and annotation levels from the command line,
    loads the cohort, finds the subtypes present in enough samples, drops
    those leaving too few samples outside of them and saves the result
    under the 'setup' directory for the cohort and gene.
    """
    # the summary text is passed as `description`; given positionally it
    # would be taken as the program name shown in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Set up the gene subtype expression effect isolation experiment "
            "by enumerating the subtypes to be tested."
        )
    )

    # create positional command line arguments
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")
    parser.add_argument('gene', type=str, help="which gene to consider")
    parser.add_argument('mut_levels', type=str,
                        help="the mutation property levels to consider")

    # create optional command line arguments
    parser.add_argument('--samp_cutoff', type=int, default=20,
                        help='subtype sample frequency threshold')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse command line arguments, create directory where found subtypes
    # will be stored
    args = parser.parse_args()
    use_lvls = args.mut_levels.split('__')
    out_path = os.path.join(base_dir, 'setup', args.cohort, args.gene)
    os.makedirs(out_path, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # load expression and variant call data for the given TCGA cohort
    cdata = MutationCohort(cohort=args.cohort, mut_genes=[args.gene],
                           mut_levels=use_lvls, expr_source='Firehose',
                           var_source='mc3', expr_dir=firehose_dir,
                           cv_prop=1.0, syn=syn)

    if args.verbose:
        print("Looking for combinations of subtypes of mutations in gene {} "
              "present in at least {} of the samples in TCGA cohort {} at "
              "annotation levels {}.\n".format(args.gene, args.samp_cutoff,
                                               args.cohort, use_lvls))

    # find mutation subtypes present in enough samples in the TCGA cohort
    iso_mtypes = cdata.train_mut.find_unique_subtypes(
        max_types=1000, max_combs=5, verbose=2,
        sub_levels=use_lvls, min_type_size=args.samp_cutoff
    )

    # filter out the subtypes that appear in too many samples for there to
    # be a wild-type class of sufficient size for classification
    max_samps = len(cdata.samples) - args.samp_cutoff
    use_mtypes = {mtype for mtype in iso_mtypes
                  if len(mtype.get_samples(cdata.train_mut)) <= max_samps}

    if args.verbose:
        print("\nFound {} total sub-types to isolate!".format(
            len(use_mtypes)))

    # save the list of found non-duplicate subtypes to file; the file is
    # opened in a `with` block so that it is closed once written
    list_file = os.path.join(
        out_path, 'mtypes_list__samps_{}__levels_{}.p'.format(
            args.samp_cutoff, args.mut_levels)
    )
    with open(list_file, 'wb') as fl:
        pickle.dump(sorted(use_mtypes), fl)

    # save the number of found subtypes to file
    count_file = os.path.join(
        out_path, 'mtypes_count__samps_{}__levels_{}.txt'.format(
            args.samp_cutoff, args.mut_levels)
    )
    with open(count_file, 'w') as fl:
        fl.write(str(len(use_mtypes)))
def main():
    """Test a classifier's ability to predict per-gene mutation status.

    Loads the list of genes saved by the setup step, builds a cohort with
    a 75%/25% train/test split seeded by `--cv_id`, and for each gene
    assigned to this task (gene index modulo `--task_count` equal to
    `--task_id`) tunes and fits the classifier, then records the test
    AUC, AUPR, tuned hyper-parameter values and fitting time. Results are
    pickled to `out__cv-<cv_id>_task-<task_id>.p` in the output directory.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('expr_source', type=str,
                        choices=['Firehose', 'toil', 'toil_tx'],
                        help='which TCGA expression data source to use')
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")

    parser.add_argument(
        'syn_root', type=str,
        help="the root cache directory for data downloaded from Synapse"
        )
    parser.add_argument(
        'samp_cutoff', type=int,
        help="minimum number of mutated samples needed to test a gene"
        )
    parser.add_argument('classif', type=str,
                        help='the name of a mutation classifier')

    parser.add_argument(
        '--cv_id', type=int, default=6732,
        help='the random seed to use for cross-validation draws'
        )
    parser.add_argument(
        '--task_count', type=int, default=10,
        help='how many parallel tasks the list of types to test is split into'
        )
    parser.add_argument('--task_id', type=int, default=0,
                        help='the subset of subtypes to assign to this task')

    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse command-line arguments, find where results are to be saved
    args = parser.parse_args()
    out_path = os.path.join(
        base_dir, 'output', args.expr_source,
        '{}__samps-{}'.format(args.cohort, args.samp_cutoff), args.classif
        )

    # load the list of genes enumerated by the setup step
    gene_file = os.path.join(
        base_dir, "setup", "genes-list_{}__{}__samps-{}.p".format(
            args.expr_source, args.cohort, args.samp_cutoff)
        )
    with open(gene_file, 'rb') as fl:
        gene_list = pickle.load(fl)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = args.syn_root
    syn.login()

    # look up the directory holding the chosen expression source in the
    # tab-separated `expr_sources.txt` table (first column is the key)
    with open(os.path.join(base_dir, 'expr_sources.txt'), 'r') as fl:
        expr_dir = pd.read_csv(
            fl, sep='\t', header=None, index_col=0
            ).loc[args.expr_source].iloc[0]

    cdata = MutationCohort(
        cohort=args.cohort, mut_genes=gene_list, mut_levels=['Gene'],
        expr_source=args.expr_source, expr_dir=expr_dir, var_source='mc3',
        syn=syn, cv_prop=0.75, cv_seed=2079 + 57 * args.cv_id
        )

    # `classif` has the form <module>__<class>; the class name is
    # capitalized before being looked up in the module
    clf_info = args.classif.split('__')
    clf_module = import_module(
        'HetMan.experiments.gene_baseline.models.{}'.format(clf_info[0]))
    mut_clf = getattr(clf_module, clf_info[1].capitalize())

    out_auc = {mut_gene: None for mut_gene in gene_list}
    out_aupr = {mut_gene: None for mut_gene in gene_list}
    out_params = {mut_gene: None for mut_gene in gene_list}
    out_time = {mut_gene: None for mut_gene in gene_list}

    for i, mut_gene in enumerate(gene_list):

        # genes assigned to other tasks are dropped from the output
        if (i % args.task_count) != args.task_id:
            del out_auc[mut_gene]
            del out_aupr[mut_gene]
            del out_params[mut_gene]
            del out_time[mut_gene]
            continue

        if args.verbose:
            print("Testing {} ...".format(mut_gene))

        clf = mut_clf()
        mtype = MuType({('Gene', mut_gene): None})

        clf.tune_coh(cdata, mtype, exclude_genes={mut_gene},
                     tune_splits=4, test_count=24, parallel_jobs=16)
        tuned_params = clf.get_params()
        out_params[mut_gene] = {par: tuned_params[par]
                                for par, _ in mut_clf.tune_priors}

        # only the fitting step is timed
        t_start = time.time()
        clf.fit_coh(cdata, mtype, exclude_genes={mut_gene})
        t_end = time.time()
        out_time[mut_gene] = t_end - t_start

        test_omics, test_pheno = cdata.test_data(
            mtype, exclude_genes={mut_gene})
        pred_scores = clf.predict_omic(test_omics)

        if len(set(test_pheno)) == 2:
            out_auc[mut_gene] = roc_auc_score(test_pheno, pred_scores)
            out_aupr[mut_gene] = average_precision_score(
                test_pheno, pred_scores)

        # when the testing cohort contains a single class the scores are
        # undefined, so fall back to 0.5 for AUC and to the proportion of
        # mutated training samples for AUPR
        else:
            out_auc[mut_gene] = 0.5
            out_aupr[mut_gene] = len(mtype.get_samples(cdata.train_mut))
            out_aupr[mut_gene] /= len(cdata.train_samps)

    # make sure the output directory exists so that the results of the
    # run are not lost to a missing directory at the very last step
    os.makedirs(out_path, exist_ok=True)
    out_file = os.path.join(
        out_path, 'out__cv-{}_task-{}.p'.format(args.cv_id, args.task_id))

    with open(out_file, 'wb') as fl:
        pickle.dump({'AUC': out_auc, 'AUPR': out_aupr, 'Clf': mut_clf,
                     'Params': out_params, 'Time': out_time}, fl)
def main():
    """Enumerate the pairs of subtypes of one gene to cross-isolate.

    Finds the subtypes of the given gene that are present in at least
    `--samp_cutoff` samples, then keeps every pair of them in which each
    subtype has at least `--samp_cutoff` samples not carrying the other,
    their union leaves at least `--samp_cutoff` samples, and the two
    subtypes do not overlap. The sorted list of pairs (pickle) and the
    number of pairs (text) are written under
    `<base_dir>/setup/<cohort>/<gene>/`.
    """
    # the description has to be passed by keyword: the first positional
    # parameter of ArgumentParser is `prog`, not `description`
    parser = argparse.ArgumentParser(
        description=("Set up the gene subtype expression effect "
                     "cross-isolation "
                     "experiment by enumerating the pairs of subtypes to be "
                     "tested.")
        )

    # create positional command line arguments
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")
    parser.add_argument('gene', type=str, help="which gene to consider")
    parser.add_argument('mut_levels', type=str,
                        help='the mutation property levels to consider')

    # create optional command line arguments
    parser.add_argument('--samp_cutoff', type=int, default=25,
                        help='subtype sample frequency threshold')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse command line arguments, create directory where found subtypes
    # will be stored
    args = parser.parse_args()
    use_lvls = args.mut_levels.split('__')
    out_path = os.path.join(base_dir, 'setup', args.cohort, args.gene)
    os.makedirs(out_path, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(cohort=args.cohort, mut_genes=[args.gene],
                           mut_levels=use_lvls, expr_source='Firehose',
                           var_source='mc3', expr_dir=firehose_dir,
                           cv_prop=1.0, syn=syn)

    if args.verbose:
        print("Looking for combinations of subtypes of mutations in gene {} "
              "present in at least {} of the samples in TCGA cohort {} at "
              "annotation levels {}.\n".format(args.gene, args.samp_cutoff,
                                               args.cohort, use_lvls))

    cross_mtypes = cdata.train_mut.find_unique_subtypes(
        max_types=100, max_combs=10, verbose=2,
        sub_levels=use_lvls, min_type_size=args.samp_cutoff
        )

    # cache the samples carrying each subtype, drop the subtypes that do
    # not leave enough samples without the subtype
    mtype_samps = {mtype: mtype.get_samples(cdata.train_mut)
                   for mtype in cross_mtypes}
    max_samps = len(cdata.samples) - args.samp_cutoff
    cross_mtypes = {mtype for mtype in cross_mtypes
                    if len(mtype_samps[mtype]) <= max_samps}

    if args.verbose:
        print("\nFound {} total sub-types to cross!".format(
            len(cross_mtypes)))

    # keep the pairs where each subtype has enough samples exclusive of
    # the other, enough samples carry neither, and the subtypes share no
    # mutation types
    use_pairs = {
        (mtype1, mtype2) for mtype1, mtype2 in combn(cross_mtypes, 2)
        if (len(mtype_samps[mtype1] - mtype_samps[mtype2])
            >= args.samp_cutoff
            and len(mtype_samps[mtype2] - mtype_samps[mtype1])
            >= args.samp_cutoff
            and len(mtype_samps[mtype1] | mtype_samps[mtype2]) <= max_samps
            and (mtype1 & mtype2).is_empty())
        }

    if args.verbose:
        print("\nFound {} non-overlapping sub-type pairs!".format(
            len(use_pairs)))

    # save the list of found non-duplicate sub-types to file; the list is
    # sorted before the file is opened so that a failure while sorting
    # does not leave an empty file behind
    sorted_pairs = sorted(use_pairs)
    list_file = os.path.join(
        out_path, 'pairs_list__samps_{}__levels_{}.p'.format(
            args.samp_cutoff, args.mut_levels)
        )
    with open(list_file, 'wb') as fl:
        pickle.dump(sorted_pairs, fl)

    count_file = os.path.join(
        out_path, 'pairs_count__samps_{}__levels_{}.txt'.format(
            args.samp_cutoff, args.mut_levels)
        )
    with open(count_file, 'w') as fl:
        fl.write(str(len(use_pairs)))
def main():
    """Enumerate the subtypes to isolate across a list of mutated genes.

    For each given gene, finds point-mutation and copy-number subtypes,
    then keeps the single subtypes and the non-overlapping pairs of
    subtypes that have at least `--samp_cutoff` samples once samples
    carrying other mutations of the gene, or mutations of the other
    listed genes, are removed. Gene-level combinations are added when the
    number of samples exclusive to them lies within the cutoff bounds.
    The sorted list of subtype tuples (pickle) and their count (text) are
    written under `<base_dir>/setup/<cohort>/<gene1>_<gene2>.../`.
    """
    # the description has to be passed by keyword: the first positional
    # parameter of ArgumentParser is `prog`, not `description`
    parser = argparse.ArgumentParser(
        description=("Set up the paired-gene subtype expression effect "
                     "isolation "
                     "experiment by enumerating the subtypes to be tested.")
        )

    # create positional command line arguments
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")
    parser.add_argument('mut_levels', type=str,
                        help="the mutation property levels to consider")
    parser.add_argument('genes', type=str, nargs='+',
                        help="a list of mutated genes")

    # create optional command line arguments
    parser.add_argument('--samp_cutoff', type=int, default=20,
                        help='subtype sample frequency threshold')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse command line arguments, create directory where found subtypes
    # will be stored
    args = parser.parse_args()
    use_lvls = args.mut_levels.split('__')
    out_path = os.path.join(base_dir, 'setup', args.cohort,
                            '_'.join(args.genes))
    os.makedirs(out_path, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = syn_root
    syn.login()

    cdata = MutationCohort(cohort=args.cohort, mut_genes=args.genes,
                           mut_levels=['Gene'] + use_lvls,
                           expr_source='Firehose', var_source='mc3',
                           copy_source='Firehose', annot_file=annot_file,
                           expr_dir=expr_dir, domain_dir=domain_dir,
                           cv_prop=1.0, syn=syn)

    max_samps = len(cdata.samples) - args.samp_cutoff
    iso_mtypes = set()

    for gene in args.genes:
        gene_muts = cdata.train_mut[gene]

        # samples carrying a mutation of any of the other listed genes;
        # the empty initial set keeps this from failing when only one
        # gene was given on the command line
        other_samps = reduce(
            or_,
            [cdata.train_mut[other_gn].get_samples()
             for other_gn in set(args.genes) - {gene}],
            set()
            )

        if args.verbose:
            print("Looking for combinations of subtypes of mutations in gene "
                  "{} present in at least {} of the samples in TCGA cohort "
                  "{} at annotation levels {}.\n".format(
                      gene, args.samp_cutoff, args.cohort, use_lvls))

        pnt_mtypes = gene_muts['Point'].find_unique_subtypes(
            max_types=500, max_combs=2, verbose=2,
            sub_levels=use_lvls, min_type_size=args.samp_cutoff
            )

        # filter out the subtypes that appear in too many samples for
        # there to be a wild-type class of sufficient size for
        # classification, and always include the all-point-mutations type
        pnt_mtypes = {
            MuType({('Scale', 'Point'): mtype}) for mtype in pnt_mtypes
            if len(mtype.get_samples(gene_muts['Point'])) <= max_samps
            }
        pnt_mtypes |= {MuType({('Scale', 'Point'): None})}

        # copy number subtypes, with all gains and all losses added as
        # candidate types before the same size filter is applied
        cna_mtypes = gene_muts['Copy'].branchtypes(min_size=args.samp_cutoff)
        cna_mtypes |= {MuType({('Copy', ('HetGain', 'HomGain')): None})}
        cna_mtypes |= {MuType({('Copy', ('HetDel', 'HomDel')): None})}

        cna_mtypes = {
            MuType({('Scale', 'Copy'): mtype}) for mtype in cna_mtypes
            if len(mtype.get_samples(gene_muts['Copy'])) <= max_samps
            }

        all_mtype = MuType(gene_muts.allkey())
        use_mtypes = pnt_mtypes | cna_mtypes

        # look up the samples of each candidate subtype only once
        mtype_samps = {mtype: mtype.get_samples(gene_muts)
                       for mtype in use_mtypes}

        # subtypes with enough samples that carry no other mutation of
        # this gene and no mutation of the other genes
        only_mtypes = {
            (MuType({('Gene', gene): mtype}), ) for mtype in use_mtypes
            if len(mtype_samps[mtype]
                   - (all_mtype - mtype).get_samples(gene_muts)
                   - other_samps) >= args.samp_cutoff
            }

        # pairs of disjoint subtypes with enough samples carrying both
        # and nothing else; NOTE(review): for plain sets, subtracting the
        # symmetric difference from the intersection removes nothing
        comb_mtypes = {
            (MuType({('Gene', gene): mtype1}),
             MuType({('Gene', gene): mtype2}))
            for mtype1, mtype2 in combn(use_mtypes, 2)
            if ((mtype1 & mtype2).is_empty()
                and len((mtype_samps[mtype1] & mtype_samps[mtype2])
                        - (mtype_samps[mtype1] ^ mtype_samps[mtype2])
                        - (all_mtype - mtype1 - mtype2).get_samples(gene_muts)
                        - other_samps) >= args.samp_cutoff)
            }

        iso_mtypes |= only_mtypes | comb_mtypes
        if args.verbose:
            print(
                "\nFound {} exclusive sub-types and {} combination sub-types "
                "to isolate!".format(len(only_mtypes), len(comb_mtypes))
                )

    # add every proper, non-empty subset of the genes whose exclusive
    # samples are neither too few nor too many
    for cur_genes in chain.from_iterable(
            combn(args.genes, r) for r in range(1, len(args.genes))):
        gene_mtype = MuType({('Gene', cur_genes): None})
        rest_mtype = MuType({
            ('Gene', tuple(set(args.genes) - set(cur_genes))): None})

        excl_count = len(gene_mtype.get_samples(cdata.train_mut)
                         - rest_mtype.get_samples(cdata.train_mut))
        if args.samp_cutoff <= excl_count <= max_samps:
            iso_mtypes |= {(gene_mtype, )}

    if args.verbose:
        print("\nFound {} total sub-types to isolate!".format(
            len(iso_mtypes)))

    # save the list of found non-duplicate sub-types to file; the list is
    # sorted before the file is opened so that a failure while sorting
    # does not leave an empty file behind
    sorted_mtypes = sorted(iso_mtypes)
    list_file = os.path.join(
        out_path, 'mtypes_list__samps_{}__levels_{}.p'.format(
            args.samp_cutoff, args.mut_levels)
        )
    with open(list_file, 'wb') as fl:
        pickle.dump(sorted_mtypes, fl)

    count_file = os.path.join(
        out_path, 'mtypes_count__samps_{}__levels_{}.txt'.format(
            args.samp_cutoff, args.mut_levels)
        )
    with open(count_file, 'w') as fl:
        fl.write(str(len(iso_mtypes)))
def main():
    """Runs the experiment.

    Loads a pickled list of mutation sub-types, builds a cohort with a
    2/3 training split seeded by the cross-validation id, and for each
    sub-type assigned to this task (index modulo `--task_count` equal to
    `task_id`) tunes, fits and evaluates the given classifier. The
    performances and tuned hyper-parameter values are pickled to
    `<out_dir>/out__cv-<cv_id>_task-<task_id>.p`.
    """
    parser = argparse.ArgumentParser(
        description=("Test a classifier's ability to predict the presence "
                     "of a list of sub-types.")
        )

    # positional command line arguments for where input data and output
    # data is to be stored
    parser.add_argument('mtype_file', type=str,
                        help='the pickle file where sub-types are stored')
    parser.add_argument('out_dir', type=str,
                        help='where to save the output of testing sub-types')

    # positional arguments for which cohort of samples and which mutation
    # classifier to use for testing
    parser.add_argument('cohort', type=str, help='a TCGA cohort')
    parser.add_argument('classif', type=str,
                        help='a classifier in HetMan.predict.classifiers')

    # positional arguments controlling CV and task selection
    parser.add_argument('cv_id', type=int,
                        help='a random seed used for cross-validation')
    parser.add_argument('task_id', type=int,
                        help='the subset of sub-types to assign to this task')

    parser.add_argument(
        '--task_count', type=int, default=10,
        help='how many parallel tasks the list of types to test is split into'
        )

    # optional arguments controlling how classifier tuning is to be performed
    parser.add_argument(
        '--tune_splits', type=int, default=4,
        help='how many training cohort splits to use for tuning'
        )
    parser.add_argument(
        '--test_count', type=int, default=16,
        help='how many hyper-parameter values to test in each tuning split'
        )
    parser.add_argument(
        '--parallel_jobs', type=int, default=8,
        help='how many parallel CPUs to allocate the tuning tests across'
        )

    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    args = parser.parse_args()
    if args.verbose:
        print("Starting testing for sub-types in\n{}\nwith "
              "cross-validation ID {} and task ID {} ...".format(
                  args.mtype_file, args.cv_id, args.task_id))

    with open(args.mtype_file, 'rb') as fl:
        mtype_list = sorted(pickle.load(fl))

    out_file = os.path.join(
        args.out_dir,
        'out__cv-{}_task-{}.p'.format(args.cv_id, args.task_id)
        )

    # loads the pipeline used for classifying variants, gets the mutated
    # genes for each variant under consideration
    # NOTE(review): `eval` runs whatever expression is given on the
    # command line; only pass trusted classifier names
    mut_clf = eval(args.classif)
    use_genes = reduce(
        or_, [set(gn for gn, _ in mtype.subtype_list())
              for mtype in mtype_list]
        )

    # logs into Synapse using locally-stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # loads the expression data and gene mutation data for the given TCGA
    # cohort, with the training/testing cohort split defined by the
    # cross-validation id for this task
    cdata = MutationCohort(
        cohort=args.cohort, mut_genes=list(use_genes),
        mut_levels=['Gene', 'Form_base', 'Exon', 'Protein'],
        expr_source='Firehose', expr_dir=firehose_dir, syn=syn,
        cv_seed=(args.cv_id + 3) * 19, cv_prop=2.0 / 3
        )

    if args.verbose:
        print("Loaded {} sub-types over {} genes which will be tested using "
              "classifier {} in cohort {} with {} samples.".format(
                  len(mtype_list), len(use_genes), args.classif,
                  args.cohort, len(cdata.samples)))

    # initialize the dictionaries that will store classification
    # performances and hyper-parameter values
    out_acc = {mtype: -1 for mtype in mtype_list}
    out_par = {mtype: None for mtype in mtype_list}

    # for each sub-variant, check if it has been assigned to this task
    for i, mtype in enumerate(mtype_list):

        # sub-types assigned to other tasks are dropped from the output
        if (i % args.task_count) != args.task_id:
            del out_acc[mtype]
            del out_par[mtype]
            continue

        if args.verbose:
            print("Testing {} ...".format(mtype))

        # gets the genes that this variant mutates, initializes the
        # classification pipeline
        ex_genes = set(gn for gn, _ in mtype.subtype_list())
        clf = mut_clf()

        # tunes the classifier using the training cohort
        clf.tune_coh(cdata, mtype, exclude_genes=ex_genes,
                     tune_splits=args.tune_splits,
                     test_count=args.test_count,
                     parallel_jobs=args.parallel_jobs)

        tuned_params = clf.get_params()
        out_par[mtype] = {par: tuned_params[par]
                          for par, _ in clf.tune_priors}

        # fits the tuned classifier on the training cohort, evaluates its
        # performance on the testing cohort and saves the results
        clf.fit_coh(cdata, mtype, exclude_genes=ex_genes)
        out_acc[mtype] = clf.eval_coh(cdata, mtype, exclude_genes=ex_genes)

    # saves the performance measurements and tuned hyper-parameter values
    # for each sub-type to file
    out_data = {
        'Acc': out_acc, 'Par': out_par,
        'Info': {'TuneSplits': args.tune_splits,
                 'TestCount': args.test_count,
                 'ParallelJobs': args.parallel_jobs}
        }
    with open(out_file, 'wb') as fl:
        pickle.dump(out_data, fl)
def main():
    """Runs the experiment.

    Loads a pickled list of pairs of mutation sub-types and, for each
    pair assigned to this task (index modulo `--task_count` equal to
    `--task_id`), tunes the classifier for each sub-type with the other
    sub-type's samples excluded and infers scores with those samples
    forced into the testing set. Results for both orientations of each
    pair are pickled to `<out_dir>/out__task-<task_id>.p`; orientations
    that do not meet the sample count requirements keep a `None` value.
    """
    # the description has to be passed by keyword: the first positional
    # parameter of ArgumentParser is `prog`, not `description`
    parser = argparse.ArgumentParser(
        description=("Isolate the expression signatures of pairs of "
                     "mutation subtypes "
                     "against one another from their parent gene(s)' "
                     "signature or that of "
                     "a list of genes in a given TCGA cohort.")
        )

    # positional command line arguments for where input data and output
    # data is to be stored
    parser.add_argument('mtype_file', type=str,
                        help='the pickle file where sub-types are stored')
    parser.add_argument('out_dir', type=str,
                        help='where to save the output of testing sub-types')

    # positional arguments for which cohort of samples and which mutation
    # classifier to use for testing
    parser.add_argument('cohort', type=str, help='a TCGA cohort')
    parser.add_argument('classif', type=str,
                        help='a classifier in HetMan.predict.classifiers')

    # NOTE(review): `--cv_id` is parsed but not used below; the cohort is
    # built with a fixed `cv_seed`
    parser.add_argument(
        '--cv_id', type=int, default=4309,
        help='the random seed to use for cross-validation draws'
        )
    parser.add_argument(
        '--task_count', type=int, default=10,
        help='how many parallel tasks the list of types to test is split into'
        )
    parser.add_argument('--task_id', type=int, default=0,
                        help='the subset of subtypes to assign to this task')

    # optional arguments controlling how classifier tuning is to be performed
    parser.add_argument(
        '--tune_splits', type=int, default=4,
        help='how many training cohort splits to use for tuning'
        )
    parser.add_argument(
        '--test_count', type=int, default=16,
        help='how many hyper-parameter values to test in each tuning split'
        )

    parser.add_argument(
        '--infer_splits', type=int, default=20,
        help='how many cohort splits to use for inference bootstrapping'
        )
    parser.add_argument(
        '--infer_folds', type=int, default=4,
        help=('how many parts to split the cohort into in each inference '
              'cross-validation run')
        )

    parser.add_argument(
        '--parallel_jobs', type=int, default=4,
        help='how many parallel CPUs to allocate the tuning tests across'
        )
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    args = parser.parse_args()
    out_file = os.path.join(args.out_dir,
                            'out__task-{}.p'.format(args.task_id))

    with open(args.mtype_file, 'rb') as fl:
        pair_list = pickle.load(fl)

    # collect the annotation levels used by any of the pairs, keeping
    # each level once in order of first appearance
    use_lvls = []
    for lvls in reduce(or_, [{(mtype1 | mtype2).get_sorted_levels()}
                             for mtype1, mtype2 in pair_list]):
        for lvl in lvls:
            if lvl not in use_lvls:
                use_lvls.append(lvl)

    if args.verbose:
        print("Starting paired isolation for sub-types in\n{}\n at "
              "annotation levels {}, the results of which will be stored "
              "in\n{}\nin cohort {} with classifier <{}>.".format(
                  args.mtype_file, use_lvls, args.out_dir,
                  args.cohort, args.classif
                  ))

    use_genes = reduce(
        or_, [(set(gn for gn, _ in mtype1.subtype_list())
               | set(gn for gn, _ in mtype2.subtype_list()))
              for mtype1, mtype2 in pair_list]
        )

    # classifiers named `Stan__<model>` are loaded from the Stan model
    # modules, anything else is looked up by name
    if args.classif[:6] == 'Stan__':
        use_module = import_module('HetMan.experiments.utilities'
                                   '.stan_models.{}'.format(
                                       args.classif.split('Stan__')[1]))
        mut_clf = getattr(use_module, 'UsePipe')

    else:
        # NOTE(review): `eval` runs whatever expression is given on the
        # command line; only pass trusted classifier names
        mut_clf = eval(args.classif)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # loads the expression data and gene mutation data for the given TCGA
    # cohort; every sample is placed in the training cohort
    cdata = MutationCohort(
        cohort=args.cohort, mut_genes=list(use_genes), mut_levels=use_lvls,
        expr_source='Firehose', expr_dir=firehose_dir, syn=syn,
        cv_seed=9999, cv_prop=1.0
        )

    if args.verbose:
        print("Loaded {} pairs of subtypes of which roughly {} will be "
              "isolated in cohort {} with {} samples.".format(
                  len(pair_list), len(pair_list) // args.task_count,
                  args.cohort, len(cdata.samples)
                  ))

    # results are stored for both orientations of each pair
    out_cross = {(mtype1, mtype2): None for mtype1, mtype2 in pair_list}
    out_cross.update({(mtype2, mtype1): None
                      for mtype1, mtype2 in pair_list})

    # for each pair, check if it has been assigned to this task
    for i, (mtype1, mtype2) in enumerate(pair_list):

        # pairs assigned to other tasks are dropped from the output
        if (i % args.task_count) != args.task_id:
            del out_cross[(mtype1, mtype2)]
            del out_cross[(mtype2, mtype1)]
            continue

        clf = mut_clf()
        if args.verbose:
            print("Pairing {} and {} ...".format(mtype1, mtype2))

        samps1 = mtype1.get_samples(cdata.train_mut)
        samps2 = mtype2.get_samples(cdata.train_mut)
        ex_genes = set(gn for gn, _ in mtype1.subtype_list())
        ex_genes |= set(gn for gn, _ in mtype2.subtype_list())

        # at least ten samples must carry neither sub-type
        if len(samps1 | samps2) > (len(cdata.samples) - 10):
            continue

        # each orientation needs at least ten samples carrying the first
        # sub-type but not the second
        if 10 <= len(samps1 - samps2):
            clf.tune_coh(cdata, mtype1, exclude_genes=ex_genes,
                         exclude_samps=samps2,
                         tune_splits=args.tune_splits,
                         test_count=args.test_count,
                         parallel_jobs=args.parallel_jobs)

            out_cross[(mtype1, mtype2)] = clf.infer_coh(
                cdata, mtype1, exclude_genes=ex_genes,
                force_test_samps=samps2, infer_splits=args.infer_splits,
                infer_folds=args.infer_folds,
                parallel_jobs=args.parallel_jobs
                )

        if 10 <= len(samps2 - samps1):
            clf.tune_coh(cdata, mtype2, exclude_genes=ex_genes,
                         exclude_samps=samps1,
                         tune_splits=args.tune_splits,
                         test_count=args.test_count,
                         parallel_jobs=args.parallel_jobs)

            out_cross[(mtype2, mtype1)] = clf.infer_coh(
                cdata, mtype2, exclude_genes=ex_genes,
                force_test_samps=samps1, infer_splits=args.infer_splits,
                infer_folds=args.infer_folds,
                parallel_jobs=args.parallel_jobs
                )

    out_data = {'Infer': out_cross,
                'Info': {'TunePriors': mut_clf.tune_priors,
                         'TuneSplits': args.tune_splits,
                         'TestCount': args.test_count}}
    with open(out_file, 'wb') as fl:
        pickle.dump(out_data, fl)
def main():
    """Runs the experiment.

    Finds the sub-types of two genes in a TCGA cohort and keeps the pairs
    (one sub-type from each gene) in which each sub-type has at least
    `--samp_cutoff` samples that do not carry the other. Saves under
    `<base_dir>/setup/<cohort>/<gene1>_<gene2>/` the sorted list of
    pairs, the result of `cdata.mutex_test` for each pair, the cohort's
    samples, and the number of pairs.
    """
    parser = argparse.ArgumentParser(
        description='Set up touring for sub-types to detect.')

    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")
    parser.add_argument('gene1', type=str, help="which gene to consider")
    parser.add_argument('gene2', type=str, help="which gene to consider")
    parser.add_argument(
        'mut_levels', type=str,
        help='the mutation property levels to consider, in addition to `Gene`'
        )

    parser.add_argument('--samp_cutoff', type=int, default=20,
                        help='subtype sample frequency threshold')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse the command line arguments, get the directory where found
    # sub-types will be saved for future use
    args = parser.parse_args()
    out_path = os.path.join(base_dir, 'setup', args.cohort,
                            '{}_{}'.format(args.gene1, args.gene2))
    os.makedirs(out_path, exist_ok=True)
    use_lvls = args.mut_levels.split('__')

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(cohort=args.cohort,
                           mut_genes=[args.gene1, args.gene2],
                           mut_levels=['Gene'] + use_lvls,
                           expr_source='Firehose', var_source='mc3',
                           expr_dir=firehose_dir, cv_prop=1.0, syn=syn)

    # find the sub-types of each gene separately
    cross_mtypes1 = cdata.train_mut[args.gene1].find_unique_subtypes(
        max_types=40, max_combs=50, verbose=2,
        sub_levels=use_lvls, min_type_size=args.samp_cutoff
        )
    cross_mtypes2 = cdata.train_mut[args.gene2].find_unique_subtypes(
        max_types=40, max_combs=50, verbose=2,
        sub_levels=use_lvls, min_type_size=args.samp_cutoff
        )

    if args.verbose:
        print("Found {} sub-types of {} and {} sub-types of {} "
              "to cross!".format(len(cross_mtypes1), args.gene1,
                                 len(cross_mtypes2), args.gene2))

    # nest each sub-type under its gene so that it can be looked up in
    # the mutations of the whole cohort
    cross_mtypes1 = {MuType({('Gene', args.gene1): mtype})
                     for mtype in cross_mtypes1}
    cross_mtypes2 = {MuType({('Gene', args.gene2): mtype})
                     for mtype in cross_mtypes2}

    samps1 = {mtype: mtype.get_samples(cdata.train_mut)
              for mtype in cross_mtypes1}
    samps2 = {mtype: mtype.get_samples(cdata.train_mut)
              for mtype in cross_mtypes2}

    # keep the pairs where each sub-type has enough samples that do not
    # carry the other sub-type
    use_pairs = sorted(
        (mtype1, mtype2)
        for mtype1, mtype2 in product(cross_mtypes1, cross_mtypes2)
        if (len(samps1[mtype1] - samps2[mtype2]) >= args.samp_cutoff
            and len(samps2[mtype2] - samps1[mtype1]) >= args.samp_cutoff)
        )

    if args.verbose:
        print("\nSaving {} pairs with sufficient "
              "exclusivity...".format(len(use_pairs)))

    list_file = os.path.join(
        out_path, 'pairs_list__samps_{}__levels_{}.p'.format(
            args.samp_cutoff, args.mut_levels)
        )
    with open(list_file, 'wb') as fl:
        pickle.dump(use_pairs, fl)

    # the tests are run before the file is opened so that a failure in
    # one of them does not leave an empty file behind
    mutex_data = {(mtype1, mtype2): cdata.mutex_test(mtype1, mtype2)
                  for mtype1, mtype2 in use_pairs}
    mutex_file = os.path.join(
        out_path, 'pairs_mutex__samps_{}__levels_{}.p'.format(
            args.samp_cutoff, args.mut_levels)
        )
    with open(mutex_file, 'wb') as fl:
        pickle.dump(mutex_data, fl)

    with open(os.path.join(out_path, 'cohort_info.p'), 'wb') as fl:
        pickle.dump({'Samps': cdata.samples}, fl)

    count_file = os.path.join(
        out_path, 'pairs_count__samps_{}__levels_{}.txt'.format(
            args.samp_cutoff, args.mut_levels)
        )
    with open(count_file, 'w') as fl:
        fl.write(str(len(use_pairs)))
def main():
    """Tune, fit and run inference with a Stan model for one mutated gene.

    Builds a cohort for the given gene, tunes and fits the model on it,
    infers scores through cross-validation, and pickles the tuned
    parameters, the inferred scores and the fitted model's variable means
    to `<base_dir>/output/<model_name>/<solve_method>/<cohort>/<gene>/
    out__cv-<cv_id>.p`.

    Raises:
        ValueError: if `solve_method` is not one of `optim`, `variat`
            or `sampl`.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('model_name', type=str,
                        help='the name of a Stan model')
    parser.add_argument(
        'solve_method', type=str,
        help='the method used for optimizing the parameters of the Stan model'
        )

    parser.add_argument('cohort', type=str, help='a TCGA cohort')
    parser.add_argument('gene', type=str, help='a gene with mutated samples')
    parser.add_argument('cv_id', type=int,
                        help='a random seed used for cross-validation')

    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    args = parser.parse_args()
    out_path = os.path.join(base_dir, 'output', args.model_name,
                            args.solve_method, args.cohort, args.gene)

    # the gene is listed before the cohort here to match the order of
    # the placeholders in the message
    if args.verbose:
        print("Starting distribution testing for Stan model {} using "
              "optimization method {} on mutated gene {} in TCGA cohort {} "
              "for cross-validation ID {} ...".format(
                  args.model_name, args.solve_method,
                  args.gene, args.cohort, args.cv_id
                  ))

    use_module = import_module('HetMan.experiments.stan_test'
                               '.distr.models.{}'.format(args.model_name))
    UsePipe = getattr(use_module, 'UsePipe')

    # each solving method maps to a class in the model's module
    solver_names = {'optim': 'UseOptimizing',
                    'variat': 'UseVariational',
                    'sampl': 'UseSampling'}
    if args.solve_method not in solver_names:
        raise ValueError("Unrecognized <solve_method> argument!")

    use_solver = getattr(use_module, solver_names[args.solve_method])
    clf_stan = UsePipe(use_solver(
        model_code=getattr(use_module, 'use_model')))

    # a gene given as <gene>_<label> selects a sub-type of the gene
    # NOTE(review): `mtype_list` is not defined in this function; it is
    # presumably a module-level mapping of labels to sub-types
    if '_' in args.gene:
        mut_info = args.gene.split('_')
        use_mtype = MuType({('Gene', mut_info[0]): mtype_list[mut_info[1]]})
    else:
        use_mtype = MuType({('Gene', args.gene): None})

    # NOTE(review): this replaces the pipeline built from `solve_method`
    # above, which therefore only validates the argument — confirm which
    # of the two is intended. `model_dict` is indexed directly instead of
    # through `eval` on a string built from a command-line argument; it
    # is not defined in this function and is presumably module-level.
    clf_stan = model_dict[args.model_name]

    # NOTE(review): `syn` is not defined in this function and must exist
    # at module level; when the gene has the <gene>_<label> form the full
    # string is used for `mut_genes` and `exclude_genes` — confirm that
    # this is intended
    cdata = MutationCohort(
        cohort=args.cohort, mut_genes=[args.gene], mut_levels=['Gene'],
        expr_source='Firehose', expr_dir=firehose_dir, var_source='mc3',
        syn=syn, cv_prop=1.0, cv_seed=1298 + 93 * args.cv_id
        )

    clf_stan.tune_coh(cdata, use_mtype, exclude_genes={args.gene},
                      tune_splits=4, test_count=24, parallel_jobs=12)
    clf_stan.fit_coh(cdata, use_mtype, exclude_genes={args.gene})

    # tuned parameters are only saved for models that have tuning priors
    if clf_stan.tune_priors:
        clf_params = clf_stan.get_params()
    else:
        clf_params = None

    infer_mat = clf_stan.infer_coh(
        cdata, use_mtype, exclude_genes={args.gene},
        infer_splits=12, infer_folds=4, parallel_jobs=12
        )

    out_data = {'Params': clf_params, 'Infer': infer_mat,
                'Vars': clf_stan.named_steps['fit'].get_var_means()}

    # make sure the output directory exists so that the results of the
    # run are not lost to a missing directory at the very last step
    os.makedirs(out_path, exist_ok=True)
    out_file = os.path.join(out_path, 'out__cv-{}.p'.format(args.cv_id))
    with open(out_file, 'wb') as fl:
        pickle.dump(out_data, fl)
def main():
    """Enumerate the copy number score cutoff pairs to test for a gene.

    Takes quantiles of the gene's negative (loss) and positive (gain)
    copy number scores among samples without a mutation, and keeps each
    pair of cutoffs for which both the altered class and the wild-type
    class contain at least twenty samples. The sorted list of cutoff
    pairs is pickled to `<base_dir>/setup/ctf_lists/<cohort>_<gene>.p`
    and their count is written to
    `<base_dir>/setup/ctf_counts/<cohort>_<gene>.txt`.
    """
    # the description has to be passed by keyword: the first positional
    # parameter of ArgumentParser is `prog`, not `description`
    parser = argparse.ArgumentParser(
        description=("Set up the copy number alteration expression effect "
                     "isolation "
                     "experiment by enumerating alteration score thresholds "
                     "to be tested.")
        )

    # create command line arguments
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")
    parser.add_argument('gene', type=str, help="which gene to consider")
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse command line arguments, create directory where found thresholds
    # and threshold counts will be stored
    args = parser.parse_args()
    os.makedirs(os.path.join(base_dir, 'setup', 'ctf_lists'), exist_ok=True)
    os.makedirs(os.path.join(base_dir, 'setup', 'ctf_counts'), exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # load expression, variant call, and copy number alteration data for
    # the given TCGA cohort and mutated gene
    cdata = MutationCohort(cohort=args.cohort, mut_genes=[args.gene],
                           mut_levels=['Gene'], expr_source='Firehose',
                           var_source='mc3', expr_dir=firehose_dir,
                           copy_source='Firehose', copy_dir=copy_dir,
                           copy_discrete=False, cv_prop=1.0, syn=syn)

    ctf_list = []
    mut_stat = np.array(cdata.train_mut.status(cdata.copy_data.index))
    mut_pheno = np.array(
        cdata.train_pheno(MuType({('Gene', args.gene): None})))

    # copy number scores of the gene in samples without a mutation
    copy_vals = cdata.copy_data.loc[~mut_stat, args.gene]
    loss_vals = copy_vals[copy_vals < 0]
    gain_vals = copy_vals[copy_vals > 0]

    def quantile_cutoffs(vals):
        """Gets unique quantiles spaced roughly twenty samples apart."""
        # without any scores there are no cutoffs to find, and the step
        # below would divide by zero
        if len(vals) == 0:
            return np.array([])

        step = 20 / len(vals)
        return np.unique(vals.quantile(np.arange(step, 1, step)))

    loss_ctfs = quantile_cutoffs(loss_vals)
    gain_ctfs = quantile_cutoffs(gain_vals)[::-1]

    # loss cutoffs are in ascending order, so in each pair the first
    # cutoff is the more extreme one
    for low_ctf, high_ctf in combn(loss_ctfs, 2):
        cna_stat = (~mut_pheno
                    & cdata.train_pheno({'Gene': args.gene, 'CNA': 'Loss',
                                         'Cutoff': low_ctf}))

        wt_stat = (~mut_pheno
                   & ~cdata.train_pheno({'Gene': args.gene, 'CNA': 'Range',
                                         'Cutoff': (low_ctf, high_ctf)})
                   & ~cdata.train_pheno({'Gene': args.gene, 'CNA': 'Gain',
                                         'Cutoff': -high_ctf}))

        if np.sum(cna_stat) >= 20 and np.sum(wt_stat) >= 20:
            ctf_list += [(low_ctf, high_ctf)]

    # gain cutoffs are in descending order, so in each pair the first
    # cutoff is the more extreme one
    for high_ctf, low_ctf in combn(gain_ctfs, 2):
        cna_stat = (~mut_pheno
                    & cdata.train_pheno({'Gene': args.gene, 'CNA': 'Gain',
                                         'Cutoff': high_ctf}))

        wt_stat = (~mut_pheno
                   & ~cdata.train_pheno({'Gene': args.gene, 'CNA': 'Range',
                                         'Cutoff': (low_ctf, high_ctf)})
                   & ~cdata.train_pheno({'Gene': args.gene, 'CNA': 'Loss',
                                         'Cutoff': -low_ctf}))

        if np.sum(cna_stat) >= 20 and np.sum(wt_stat) >= 20:
            ctf_list += [(low_ctf, high_ctf)]

    # save the list of found cutoff pairs to file; the list is sorted
    # before the file is opened so that a failure while sorting does not
    # leave an empty file behind
    sorted_ctfs = sorted(ctf_list)
    list_file = os.path.join(base_dir, 'setup', 'ctf_lists',
                             '{}_{}.p'.format(args.cohort, args.gene))
    with open(list_file, 'wb') as fl:
        pickle.dump(sorted_ctfs, fl)

    count_file = os.path.join(base_dir, 'setup', 'ctf_counts',
                              '{}_{}.txt'.format(args.cohort, args.gene))
    with open(count_file, 'w') as fl:
        fl.write(str(len(ctf_list)))
def main():
    """Enumerate the pairs of mutated genes to isolate in a TCGA cohort.

    Loads the cohort's gene-level mutations and keeps every pair of genes
    in which each gene has at least `--samp_cutoff` mutated samples not
    shared with the other, and at least `--samp_cutoff` samples carry
    neither. The sorted list of pairs (pickle) and the number of pairs
    (text) are written under `<base_dir>/setup/<cohort>/`.
    """
    # the description has to be passed by keyword: the first positional
    # parameter of ArgumentParser is `prog`, not `description`
    parser = argparse.ArgumentParser(
        description=("Set up the paired gene expression effect isolation "
                     "experiment by "
                     "enumerating the dyads of genes to be tested.")
        )

    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")
    parser.add_argument('--samp_cutoff', type=int, default=40,
                        help='subtype sample frequency threshold')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse command line arguments, create directory where found pairs
    # will be stored
    args = parser.parse_args()
    out_path = os.path.join(base_dir, 'setup', args.cohort)
    os.makedirs(out_path, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(cohort=args.cohort, mut_genes=None,
                           mut_levels=['Gene'], expr_source='Firehose',
                           var_source='mc3', expr_dir=firehose_dir,
                           samp_cutoff=args.samp_cutoff,
                           cv_prop=1.0, syn=syn)

    if args.verbose:
        print("Looking for pairs of mutated genes present in at least {} of "
              "the samples in TCGA cohort {} with {} total samples.".format(
                  args.samp_cutoff, args.cohort, len(cdata.samples)))

    # iterating over the cohort's mutations is expected to yield
    # (gene, mutations) pairs, as unpacked below
    max_samps = len(cdata.samples) - args.samp_cutoff
    gene_pairs = {
        (MuType({('Gene', gn1): None}), MuType({('Gene', gn2): None}))
        for (gn1, muts1), (gn2, muts2) in combn(cdata.train_mut, r=2)
        if (len(muts1 - muts2) >= args.samp_cutoff
            and len(muts2 - muts1) >= args.samp_cutoff
            and len(muts1 | muts2) <= max_samps)
        }

    if args.verbose:
        print("Found {} pairs of genes to isolate!".format(len(gene_pairs)))

    # the list is sorted before the file is opened so that a failure
    # while sorting does not leave an empty file behind
    sorted_pairs = sorted(gene_pairs)
    list_file = os.path.join(
        out_path, 'pairs_list__samps_{}.p'.format(args.samp_cutoff))
    with open(list_file, 'wb') as fl:
        pickle.dump(sorted_pairs, fl)

    count_file = os.path.join(
        out_path, 'pairs_count__samps_{}.txt'.format(args.samp_cutoff))
    with open(count_file, 'w') as fl:
        fl.write(str(len(gene_pairs)))