Пример #1
0
def get_cohort_data(cohort, use_gene, cv_seed=None, test_prop=0):
    """Build the cohort object holding the data for one gene.

    Logs into Synapse using locally stored credentials, with the download
    cache rooted at the module-level `syn_root`, then constructs either a
    `BeatAmlCohort` (when `cohort` is 'beatAML') or a `MutationCohort`.

    Args:
        cohort: Cohort label. For non-beatAML cohorts only the text before
            the first underscore is used as the cohort name, while the full
            label is handed to `parse_subtypes`.
        use_gene: The single gene whose mutations are loaded.
        cv_seed: Forwarded unchanged to the cohort constructor.
        test_prop: Forwarded unchanged to the cohort constructor.

    Returns:
        The constructed cohort object.
    """
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = syn_root
    syn.login()

    # the beatAML cohort has its own loader and input files
    if cohort == 'beatAML':
        return BeatAmlCohort(
            mut_levels=['Form', 'Exon', 'Protein'], mut_genes=[use_gene],
            expr_file=beatAML_files['expr'],
            samp_file=beatAML_files['samps'],
            syn=syn, annot_file=annot_file,
            cv_seed=cv_seed, test_prop=test_prop
            )

    # every other label is split into a cohort name and an optional subtype
    # specification that `parse_subtypes` interprets
    return MutationCohort(
        cohort=cohort.split('_')[0],
        mut_levels=['Form_base', 'Protein'], mut_genes=[use_gene],
        expr_source='Firehose', var_source='mc3', copy_source='Firehose',
        annot_file=annot_file, type_file=type_file,
        expr_dir=expr_dir, copy_dir=copy_dir, syn=syn,
        cv_seed=cv_seed, test_prop=test_prop,
        annot_fields=['transcript'], use_types=parse_subtypes(cohort)
        )
Пример #2
0
def get_cohort_data(expr_source,
                    cohort,
                    samp_cutoff,
                    cv_prop=1.0,
                    cv_seed=None):
    """Load a mutation cohort restricted to genes present in every gene list.

    Args:
        expr_source: Expression source label of the form ``base`` or
            ``base__tag``; it is also the key used to look up the data
            directory in `expr_sources`.
        cohort: Cohort name passed to `MutationCohort`.
        samp_cutoff: Accepted for interface compatibility; not used here.
        cv_prop: Forwarded unchanged to `MutationCohort`.
        cv_seed: Forwarded unchanged to `MutationCohort`.

    Returns:
        The constructed `MutationCohort`.
    """
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = syn_root
    syn.login()

    # keep the genes marked 'Yes' in all of the listed columns
    list_cols = ['Vogelstein', 'Sanger CGC', 'Foundation One', 'MSK-IMPACT']
    gene_df = pd.read_csv(gene_list, sep='\t', skiprows=1, index_col=0)
    in_all_lists = (
        (gene_df.loc[:, list_cols] == 'Yes').sum(axis=1) == len(list_cols))
    use_genes = gene_df.index[in_all_lists]

    # the part before '__' names the source itself; transcripts are only
    # left uncollapsed when the first tag after it is 'txs'
    source_base, *source_tags = expr_source.split('__')
    collapse_txs = not (source_tags and source_tags[0] == 'txs')

    return MutationCohort(
        cohort=cohort, mut_genes=use_genes.tolist(),
        mut_levels=['Gene', 'Form_base', 'Exon', 'Protein'],
        expr_source=source_base, var_source='mc3', copy_source='Firehose',
        annot_file=annot_file, expr_dir=expr_sources[expr_source],
        copy_dir=copy_dir, collapse_txs=collapse_txs, syn=syn,
        cv_prop=cv_prop, cv_seed=cv_seed
        )
Пример #3
0
def main():
    """Plot the tuning of a transform over a grid of parameter values.

    Reads the transform, cohort, and gene from the command line, loads the
    cohort's Firehose expression data for that gene, and hands the cohort
    together with the tuning grid to `plot_tuning_gene`.
    """
    parser = argparse.ArgumentParser()
    positionals = (('transform', None),
                   ('cohort', 'a cohort in TCGA'),
                   ('gene', None))
    for arg_name, arg_help in positionals:
        parser.add_argument(arg_name, type=str, help=arg_help)

    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(
        cohort=args.cohort, mut_genes=[args.gene], mut_levels=['Gene'],
        expr_source='Firehose', expr_dir=firehose_dir, cv_prop=1.0, syn=syn
        )

    # grid currently in use; an alternative grid that was tried instead is
    #   (('fit__learning_rate', (50, 200, 750)),
    #    ('fit__perplexity', (5, 15, 30, 40, 50)),
    #    ('lbl', 'base'))
    neighbor_counts = (5, 10, 15)
    dist_metrics = ('euclidean', 'correlation', 'cosine',
                    'manhattan', 'chebyshev')
    tune_params = (
        ('fit__n_neighbors', neighbor_counts),
        ('fit__metric', dist_metrics),
        ('lbl', 'base2'),
        )

    plot_tuning_gene(cdata, args, tune_params)
Пример #4
0
def main():
    """Plot CNA classification performance across CNA-calling cutoffs.

    Reads the cohort, gene, and classifier from the command line, loads the
    cohort with `copy_discrete=False`, computes loss and gain AUCs from the
    saved inference output with `get_aucs`, and plots them.
    """
    # the help text has to be given as `description`: the first positional
    # parameter of ArgumentParser is `prog`, which would put this text in
    # place of the program name in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot the success of classifying a gene's CNA status in a given "
            "cohort using different cutoffs for determining CNA status."
            )
        )

    parser.add_argument('cohort', help='a TCGA cohort')
    parser.add_argument('gene', help='a mutated gene')
    parser.add_argument('classif', help='a mutation classifier')

    # parse command-line arguments, create directory where plots will be saved
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(
        cohort=args.cohort, mut_genes=[args.gene], mut_levels=['Gene'],
        expr_source='Firehose', var_source='mc3', expr_dir=firehose_dir,
        copy_source='Firehose', copy_dir=copy_dir, copy_discrete=False,
        syn=syn, cv_prop=1.0
        )

    # inference output is stored under output/<cohort>/<gene>/<classif>
    loss_df, gain_df = get_aucs(
        load_infer_output(os.path.join(base_dir, 'output',
                                       args.cohort, args.gene, args.classif)),
        args, cdata
        )

    plot_cutoff_aucs(loss_df, gain_df, args, cdata)
Пример #5
0
def get_cohort_data(syn,
                    expr_source,
                    cohort,
                    samp_cutoff,
                    cv_prop=1.0,
                    cv_seed=None):
    """Load a mutation cohort restricted to genes present in every gene list.

    Args:
        syn: Synapse client object forwarded to `MutationCohort`.
        expr_source: Expression source label; also the key used to look up
            the data directory in `expr_sources`.
        cohort: Cohort name passed to `MutationCohort`.
        samp_cutoff: Accepted for interface compatibility; not used here.
        cv_prop: Forwarded unchanged to `MutationCohort`.
        cv_seed: Forwarded unchanged to `MutationCohort`.

    Returns:
        The constructed `MutationCohort`.
    """
    # keep the genes marked 'Yes' in all of the listed columns
    list_cols = ['Vogelstein', 'Sanger CGC', 'Foundation One', 'MSK-IMPACT']
    gene_df = pd.read_csv(gene_list, sep='\t', skiprows=1, index_col=0)
    in_all_lists = (gene_df.loc[:, list_cols] == 'Yes').all(axis=1)
    use_genes = gene_df.index[in_all_lists]

    return MutationCohort(
        cohort=cohort, mut_genes=use_genes.tolist(),
        mut_levels=['Gene', 'Form_base', 'Protein'],
        expr_source=expr_source, var_source='mc3', copy_source='Firehose',
        annot_file=annot_file, expr_dir=expr_sources[expr_source],
        copy_dir=copy_dir, syn=syn, cv_prop=cv_prop, cv_seed=cv_seed
        )
Пример #6
0
def main():
    """Plot the ordering and clustering of subtypes of a module of genes.

    Reads the cohort, classifier, mutation levels, and genes from the command
    line, loads the cohort, compares the saved inference scores with
    `compare_scores`, orders the resulting similarity matrix, and produces
    the ordering and clustering plots.
    """
    # the help text has to be given as `description`: the first positional
    # parameter of ArgumentParser is `prog`, which would put this text in
    # place of the program name in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot the ordering of the subtypes of a module of genes in a "
            "given cohort based on how their isolated expression signatures "
            "classify one another."
            )
        )

    parser.add_argument('cohort', help='a TCGA cohort')
    parser.add_argument('classif', help='a mutation classifier')
    parser.add_argument('mut_levels',
                        type=str,
                        help='a set of mutation annotation levels')
    parser.add_argument('genes',
                        type=str,
                        nargs='+',
                        help='a list of mutated genes')
    parser.add_argument('--samp_cutoff', type=int, default=20)

    # parse command-line arguments, create directory where plots will be saved
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = syn_root
    syn.login()

    # mutation levels arrive as one '__'-joined string on the command line
    cdata = MutationCohort(cohort=args.cohort,
                           mut_genes=args.genes,
                           mut_levels=['Gene'] + args.mut_levels.split('__'),
                           expr_source='Firehose',
                           expr_dir=expr_dir,
                           var_source='mc3',
                           copy_source='Firehose',
                           domain_dir=domain_dir,
                           annot_file=annot_file,
                           syn=syn,
                           cv_prop=1.0)

    pheno_dict, auc_list, simil_df = compare_scores(
        load_infer_output(
            os.path.join(base_dir, 'output', args.cohort,
                         '_'.join(sorted(args.genes)), args.classif,
                         'samps_{}'.format(args.samp_cutoff),
                         args.mut_levels)), cdata)

    # rank each entry by the difference between its row mean and column mean
    simil_rank = simil_df.mean(axis=1) - simil_df.mean(axis=0)

    # `Series.items()` replaces `iteritems()`, which pandas 2.0 removed;
    # entries are sorted first by the leading element of the first subtype of
    # the first mutation type in each label, then by rank
    simil_order = [
        mtypes for mtypes, _ in sorted(
            simil_rank.items(),
            key=lambda k: (k[0][0].subtype_list()[0][0], k[1])
            )
    ]

    simil_df = simil_df.loc[simil_order, simil_order[::-1]]
    plot_singleton_ordering(simil_df.copy(), auc_list.copy(),
                            pheno_dict.copy(), args)
    plot_singleton_clustering(simil_df.copy(), auc_list.copy(),
                              pheno_dict.copy(), args)
    plot_all_clustering(simil_df.copy(), auc_list.copy(), args)
Пример #7
0
def get_cohorts(expr_source, cohorts, mut_levels, cv_prop=1.0, cv_seed=9078):
    """Load each cohort on its own as well as one concatenated cohort.

    Args:
        expr_source: Expression source label of the form ``base`` or
            ``base__tag``; it is also the key used to look up the data
            directory in `expr_sources`.
        cohorts: Cohort labels. The text before the first underscore of each
            label is the cohort name; the full label goes to
            `parse_subtypes`.
        mut_levels: List of mutation levels appended after 'Gene'.
        cv_prop: Forwarded unchanged to the cohort constructors.
        cv_seed: Forwarded unchanged to the cohort constructors.

    Returns:
        A tuple ``(cdata, cdata_dict)`` of the `MutationConcatCohort` and a
        dict mapping each cohort label to its own `MutationCohort`.
    """
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = syn_root
    syn.login()

    # keep the genes marked 'Yes' in at least one of the listed columns
    list_cols = ['Vogelstein', 'SANGER CGC(05/30/2017)',
                 'FOUNDATION ONE', 'MSK-IMPACT']
    gene_df = pd.read_csv(gene_list, sep='\t', skiprows=1, index_col=0)
    in_some_list = (gene_df.loc[:, list_cols] == 'Yes').sum(axis=1) >= 1
    use_genes = gene_df.index[in_some_list]

    # the part before '__' names the source itself; transcripts are only
    # left uncollapsed when the first tag after it is 'txs'
    source_base, *source_tags = expr_source.split('__')
    collapse_txs = not (source_tags and source_tags[0] == 'txs')
    cohorts_base = {cohort: cohort.split('_')[0] for cohort in cohorts}

    # one stand-alone cohort per requested label
    cdata_dict = dict()
    for cohort in cohorts:
        cdata_dict[cohort] = MutationCohort(
            cohort=cohorts_base[cohort], mut_genes=use_genes.tolist(),
            mut_levels=['Gene'] + mut_levels,
            expr_source=source_base, var_source='mc3',
            copy_source='Firehose',
            annot_file=annot_file, type_file=type_file,
            expr_dir=expr_sources[expr_source], copy_dir=copy_dir,
            collapse_txs=collapse_txs, syn=syn,
            cv_prop=cv_prop, cv_seed=cv_seed,
            annot_fields=['transcript'], use_types=parse_subtypes(cohort)
            )

    # all of the cohorts combined, with subtypes keyed by base cohort name
    cdata = MutationConcatCohort(
        cohorts=list(cohorts_base.values()), mut_genes=use_genes.tolist(),
        mut_levels=['Gene'] + mut_levels,
        expr_source=source_base, var_source='mc3', copy_source='Firehose',
        annot_file=annot_file, type_file=type_file,
        expr_dir=expr_sources[expr_source], copy_dir=copy_dir,
        collapse_txs=collapse_txs, syn=syn,
        cv_prop=cv_prop, cv_seed=cv_seed,
        annot_fields=['transcript'],
        use_types={cohorts_base[cohort]: parse_subtypes(cohort)
                   for cohort in cohorts}
        )

    return cdata, cdata_dict
Пример #8
0
def main():
    """Plot perturbation score distributions by mutation subtype status.

    Reads the Stan model, solve method, cohort, gene, and mutation levels
    from the command line, loads the saved inference output and the cohort,
    and draws the violin and stability plots for every non-empty combination
    of the requested mutation levels.
    """
    # the help text has to be given as `description`: the first positional
    # parameter of ArgumentParser is `prog`, which would put this text in
    # place of the program name in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot the distributions of perturbation scores separated by "
            "mutation subtype status as inferred by a Stan mutation "
            "classifier trained on a gene in a given TCGA cohort."
            )
        )

    # positional command-line arguments regarding the Stan model used to
    # obtain the sample mutation scores
    parser.add_argument('model_name', type=str, help="label of a Stan model")
    parser.add_argument('solve_method', type=str,
                        help=("method used to obtain estimates for the "
                              "parameters of the model"))

    # positional command line arguments regarding the samples and the mutation
    # classification task on which the model was trained
    parser.add_argument('cohort', type=str, help="a TCGA cohort")
    parser.add_argument('gene', type=str, help="a mutated gene")

    parser.add_argument('mut_levels', nargs='*',
                        default=['Form_base', 'Exon'],
                        help="which mutation annotation levels to consider")

    # parse command line arguments, ensure directory where plots will be saved
    # exists, load inferred mutation scores from each cross-validation run
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)
    infer_mat = load_output(args.model_name, args.solve_method,
                            args.cohort, args.gene)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ('/home/exacloud/lustre1/CompBio'
                                '/mgrzad/input-data/synapse')
    syn.login()

    cdata = MutationCohort(
        cohort=args.cohort, mut_genes=[args.gene], mut_levels=args.mut_levels,
        expr_source='Firehose', expr_dir=firehose_dir, var_source='mc3',
        syn=syn, cv_prop=1.0
        )

    # iterate over every non-empty subset of the levels, smallest first
    for use_levels in chain.from_iterable(
            combinations(args.mut_levels, r)
            for r in range(1, len(args.mut_levels) + 1)
            ):

        plot_subtype_violins(infer_mat, args, cdata, use_levels)
        plot_subtype_stability(infer_mat, args, cdata, use_levels)
Пример #9
0
def main():
    """Plot the ordering of a gene's subtypes by mutual classification.

    Reads the cohort, classifier, mutation levels, and genes from the command
    line, loads the cohort, computes subtype similarities from the saved
    inference output with `get_similarities`, orders the similarity matrix
    by rank, and produces the ordering plots.
    """
    # the help text has to be given as `description`: the first positional
    # parameter of ArgumentParser is `prog`, which would put this text in
    # place of the program name in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot the ordering of a gene's subtypes in a given cohort based "
            "on how their isolated expression signatures classify one "
            "another."
            )
        )

    parser.add_argument('cohort', help='a TCGA cohort')
    parser.add_argument('classif', help='a mutation classifier')
    parser.add_argument('mut_levels',
                        type=str,
                        help='a set of mutation annotation levels')
    parser.add_argument('genes',
                        type=str,
                        nargs='+',
                        help='a list of mutated genes')
    parser.add_argument('--samp_cutoff', type=int, default=25)

    # parse command-line arguments, create directory where plots will be saved
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # mutation levels arrive as one '__'-joined string on the command line
    cdata = MutationCohort(cohort=args.cohort,
                           mut_genes=args.genes,
                           mut_levels=['Gene'] + args.mut_levels.split('__'),
                           expr_source='Firehose',
                           expr_dir=firehose_dir,
                           syn=syn,
                           cv_prop=1.0)

    simil_df, auc_list = get_similarities(
        load_infer_output(
            os.path.join(base_dir, 'output', args.cohort,
                         '_'.join(sorted(args.genes)), args.classif,
                         'samps_{}'.format(args.samp_cutoff),
                         args.mut_levels)), args.genes, cdata)
    print(simil_df.shape)

    # rank each entry by the difference between its row mean and column
    # mean; rows follow ascending rank and columns the reverse order
    simil_rank = simil_df.mean(axis=1) - simil_df.mean(axis=0)
    simil_order = simil_rank.sort_values().index
    simil_df = simil_df.loc[simil_order, reversed(simil_order)]

    plot_singleton_ordering(simil_df.copy(), auc_list.copy(), args, cdata)
    plot_all_ordering(simil_df.copy(), auc_list.copy(), args, cdata)
Пример #10
0
def get_cohort_data(cohort, expr_source, cv_seed=None):
    """Build the cohort object for the genes found in several gene lists.

    Args:
        cohort: Cohort label; 'beatAML' selects `BeatAmlCohort`. For any
            other label the text before the first underscore is the cohort
            name and the full label is handed to `parse_subtypes`.
        expr_source: Expression source label of the form ``base`` or
            ``base__tag``.
        cv_seed: Forwarded unchanged to the cohort constructor.

    Returns:
        The constructed cohort object.

    Raises:
        ValueError: If `cohort` is 'beatAML' and `expr_source` is anything
            other than 'toil__gns'.
    """
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = syn_root
    syn.login()

    # keep the genes marked 'Yes' in more than one of the listed columns
    list_cols = ['Vogelstein', 'SANGER CGC(05/30/2017)',
                 'FOUNDATION ONE', 'MSK-IMPACT']
    gene_df = pd.read_csv(gene_list, sep='\t', skiprows=1, index_col=0)
    in_many_lists = (gene_df.loc[:, list_cols] == 'Yes').sum(axis=1) > 1
    use_genes = gene_df.index[in_many_lists]

    if cohort == 'beatAML':
        if expr_source != 'toil__gns':
            raise ValueError("Only gene-level Kallisto calls are available "
                             "for the beatAML cohort!")

        return BeatAmlCohort(
            mut_levels=['Gene', 'Form_base', 'Protein'],
            mut_genes=use_genes.tolist(), expr_source=expr_source,
            expr_file=beatAML_files['expr'],
            samp_file=beatAML_files['samps'],
            syn=syn, annot_file=annot_file, cv_seed=cv_seed, test_prop=0
            )

    # the part before '__' names the source itself; transcripts are only
    # left uncollapsed when the first tag after it is 'txs'
    source_base, *source_tags = expr_source.split('__')
    collapse_txs = not (source_tags and source_tags[0] == 'txs')

    # NOTE(review): the data directory is looked up by the base source name
    # here, not the full `expr_source` label -- confirm this matches the
    # keys of `expr_sources`
    return MutationCohort(
        cohort=cohort.split('_')[0],
        mut_levels=['Gene', 'Form_base', 'Protein'],
        mut_genes=use_genes.tolist(),
        expr_source=source_base, var_source='mc3', copy_source='Firehose',
        annot_file=annot_file, type_file=type_file,
        expr_dir=expr_sources[source_base], copy_dir=copy_dir,
        collapse_txs=collapse_txs, syn=syn, cv_seed=cv_seed, test_prop=0,
        annot_fields=['transcript'], use_types=parse_subtypes(cohort)
        )
Пример #11
0
def get_cohort_data(expr_source, syn_root, cohort, samp_cutoff):
    """Load a gene-level mutation cohort for the given expression source.

    Args:
        expr_source: Expression source label; used both as the row to look
            up in ``expr_sources.txt`` and as the cohort's `expr_source`.
        syn_root: Directory used as the root of the Synapse download cache.
        cohort: Cohort name passed to `MutationCohort`.
        samp_cutoff: Forwarded unchanged to `MutationCohort`.

    Returns:
        The constructed `MutationCohort`.

    Raises:
        KeyError: If `expr_source` is not a row label in the table.
    """
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = syn_root
    syn.login()

    # the table is tab-separated with no header; its first column holds the
    # source labels and the first remaining column is taken as the directory.
    # A context manager makes sure the file handle is closed once it is read.
    with open(os.path.join(base_dir, 'expr_sources.txt'), 'r') as src_file:
        expr_dir = pd.read_csv(
            src_file, sep='\t', header=None, index_col=0
            ).loc[expr_source].iloc[0]

    cdata = MutationCohort(
        cohort=cohort, mut_genes=None, mut_levels=['Gene'], cv_prop=1.0,
        expr_source=expr_source, expr_dir=expr_dir, var_source='mc3',
        syn=syn, samp_cutoff=samp_cutoff
        )

    return cdata
Пример #12
0
def main():
    """Plot the distribution of inferred labels by mutation subtype.

    Reads the Stan model, solve method, cohort, gene, and mutation levels
    from the command line, loads the saved inference output and the cohort,
    and draws the expression plot for every non-empty combination of the
    requested mutation levels.
    """
    # the help text has to be given as `description`: the first positional
    # parameter of ArgumentParser is `prog`, which would put this text in
    # place of the program name in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot the distribution of labels by mutation subtype returned "
            "by a Stan classifier trained to predict all the mutations for "
            "a given gene in a TCGA cohort."
            )
        )

    parser.add_argument('model_name', type=str, help="label of a Stan model")
    parser.add_argument('solve_method',
                        type=str,
                        help=("method used to obtain estimates for the "
                              "parameters of the model"))

    parser.add_argument('cohort', type=str, help="a TCGA cohort")
    parser.add_argument('gene', type=str, help="a mutated gene")

    parser.add_argument('mut_levels',
                        nargs='*',
                        default=['Form_base', 'Exon'],
                        help="which mutation annotation levels to consider")

    # parse command-line arguments, create directory where plots will be
    # saved, load the saved inference output
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)
    infer_mat = load_output(args.model_name, args.solve_method, args.cohort,
                            args.gene)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ('/home/exacloud/lustre1/CompBio'
                                '/mgrzad/input-data/synapse')
    syn.login()

    cdata = MutationCohort(cohort=args.cohort,
                           mut_genes=[args.gene],
                           mut_levels=args.mut_levels,
                           expr_source='Firehose',
                           expr_dir=firehose_dir,
                           var_source='mc3',
                           syn=syn,
                           cv_prop=1.0)

    # iterate over every non-empty subset of the levels, smallest first
    for use_levels in chain.from_iterable(
            combinations(args.mut_levels, r)
            for r in range(1, len(args.mut_levels) + 1)):
        plot_subtype_expression(infer_mat, args, cdata, use_levels)
Пример #13
0
def get_cohort_data(cohort):
    """Build the cohort object for the genes found in any of the gene lists.

    Args:
        cohort: Cohort label; 'beatAML' selects `BeatAmlCohort`. For any
            other label the text before the first underscore is the cohort
            name and the full label is handed to `parse_subtypes`.

    Returns:
        The constructed cohort object.
    """
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = syn_root
    syn.login()

    # keep the genes marked 'Yes' in at least one of the listed columns
    list_cols = ['Vogelstein', 'SANGER CGC(05/30/2017)',
                 'FOUNDATION ONE', 'MSK-IMPACT']
    gene_df = pd.read_csv(gene_list, sep='\t', skiprows=1, index_col=0)
    in_some_list = (gene_df.loc[:, list_cols] == 'Yes').sum(axis=1) >= 1
    use_genes = gene_df.index[in_some_list]

    if cohort == 'beatAML':
        return BeatAmlCohort(
            mut_levels=['Gene', 'Form_base', 'Form',
                        'Exon', 'Location', 'Protein'],
            mut_genes=use_genes.tolist(), expr_source='toil__gns',
            expr_file=beatAML_files['expr'],
            samp_file=beatAML_files['samps'],
            syn=syn, annot_file=annot_file, cv_seed=671, test_prop=0
            )

    return MutationCohort(
        cohort=cohort.split('_')[0],
        mut_levels=['Gene', 'Form_base', 'Form',
                    'Exon', 'Location', 'Protein'],
        mut_genes=use_genes.tolist(),
        expr_source='Firehose', var_source='mc3', copy_source='Firehose',
        annot_file=annot_file, type_file=type_file,
        expr_dir=expr_dir, copy_dir=copy_dir, syn=syn,
        cv_seed=671, test_prop=0,
        annot_fields=['transcript'], use_types=parse_subtypes(cohort)
        )
Пример #14
0
def main():
    """Plot sample positions inferred by a multi-task model.

    Reads the cohort, gene, mutation levels, Stan model, and solve method
    from the command line, loads the saved inference output and the cohort,
    and calls `plot_position` once per row of the output, each row being
    indexed by a pair of mutation types.
    """
    # the help text has to be given as `description`: the first positional
    # parameter of ArgumentParser is `prog`, which would put this text in
    # place of the program name in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot the positions predicted for each sample in a given cohort "
            "by a multi-task model trained on pairs of mutation subtypes of "
            "a gene in two-dimensional inferred label space."
            )
        )

    parser.add_argument('cohort', help='a TCGA cohort')
    parser.add_argument('gene', help='a mutated gene')
    parser.add_argument('mut_levels',
                        help='a set of mutation annotation levels')

    parser.add_argument('model_name', help='a Stan multi-task learning model')
    parser.add_argument('solve_method', choices=['optim', 'variat', 'sampl'],
                        help='method used to obtain Stan parameter estimates')

    # parse command-line arguments, create directory where plots will be saved
    args = parser.parse_args()
    os.makedirs(
        os.path.join(plot_dir, args.cohort, args.gene, args.mut_levels),
        exist_ok=True
        )

    multi_df = load_infer_output(os.path.join(
        base_dir, 'output', args.cohort, args.gene, args.mut_levels,
        args.model_name, args.solve_method
        ))

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # mutation levels arrive as one '__'-joined string on the command line
    cdata = MutationCohort(cohort=args.cohort, mut_genes=[args.gene],
                           mut_levels=['Gene'] + args.mut_levels.split('__'),
                           expr_source='Firehose', expr_dir=firehose_dir,
                           syn=syn, cv_prop=1.0)

    for (mtype1, mtype2), infer_vals in multi_df.iterrows():
        plot_position(infer_vals, args, cdata, mtype1, mtype2)
Пример #15
0
def main():
    """Plot the inferred positions for each single-subkey mutation type.

    Reads the cohort, gene, classifier, and mutation levels from the command
    line, loads the saved inference output (averaging each cell with
    `np.mean`), loads the cohort, and calls `plot_mtype_positions` for every
    mutation type in the output whose `subkeys()` has exactly one entry.
    """
    parser = argparse.ArgumentParser(
        description='Plot experiment results for given mutation classifier.')

    parser.add_argument('cohort', help='a TCGA cohort')
    parser.add_argument('gene', help='a mutated gene')
    parser.add_argument('classif', help='a mutation classifier')

    # a positional argument only falls back on its default when it is
    # optional, which requires nargs='?'
    parser.add_argument('mut_levels', nargs='?', default='Form_base__Exon')

    # parse the cutoff as an integer so that malformed values are rejected
    # on the command line
    parser.add_argument('--samp_cutoff', type=int, default=20)

    # parse command-line arguments, create directory where plots will be saved
    args = parser.parse_args()
    os.makedirs(os.path.join(plot_dir, args.cohort, args.gene), exist_ok=True)

    prob_df = load_infer_output(
        os.path.join(base_dir, 'output', args.cohort, args.gene, args.classif,
                     'samps_{}'.format(args.samp_cutoff),
                     args.mut_levels)).applymap(np.mean)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # NOTE(review): the cohort is always built with samp_cutoff=20 even when
    # --samp_cutoff selects a different output directory -- confirm whether
    # args.samp_cutoff should be used here instead
    cdata = MutationCohort(cohort=args.cohort,
                           mut_genes=None,
                           samp_cutoff=20,
                           mut_levels=['Gene'] + args.mut_levels.split('__'),
                           expr_source='Firehose',
                           expr_dir=firehose_dir,
                           syn=syn,
                           cv_prop=1.0)

    singl_mtypes = [
        mtype for mtype in prob_df.index if len(mtype.subkeys()) == 1
    ]

    for singl_mtype in singl_mtypes:
        plot_mtype_positions(prob_df.loc[singl_mtype, :], args, cdata)
Пример #16
0
def main():
    """Plot inferred CNA scores against actual CNA scores.

    Reads the cohort, gene, and classifier from the command line, loads the
    cohort with `copy_discrete=False` and the saved inference output,
    computes loss and gain AUCs with `get_aucs`, and plots the scores for
    the rows with the best loss AUC, the best gain AUC, and the largest gap
    between the loss 'CNA' and 'Mut' columns.
    """
    # the help text has to be given as `description`: the first positional
    # parameter of ArgumentParser is `prog`, which would put this text in
    # place of the program name in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot the inferred CNA scores for a cohort's samples against "
            "their actual CNA scores for a given set of cutoffs."
            )
        )

    parser.add_argument('cohort', help='a TCGA cohort')
    parser.add_argument('gene', help='a mutated gene')
    parser.add_argument('classif', help='a mutation classifier')

    # parse command-line arguments, create directory where plots will be saved
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(cohort=args.cohort,
                           mut_genes=[args.gene],
                           mut_levels=['Gene'],
                           expr_source='Firehose',
                           var_source='mc3',
                           expr_dir=firehose_dir,
                           copy_source='Firehose',
                           copy_dir=copy_dir,
                           copy_discrete=False,
                           syn=syn,
                           cv_prop=1.0)

    iso_df = load_infer_output(
        os.path.join(base_dir, 'output', args.cohort, args.gene, args.classif))

    # plot the rows that maximise the 'CNA' column for losses and for gains
    loss_df, gain_df = get_aucs(iso_df, args, cdata)
    plot_cna_scores(iso_df.loc[loss_df['CNA'].idxmax(), :], args, cdata)
    plot_cna_scores(iso_df.loc[gain_df['CNA'].idxmax(), :], args, cdata)

    # plot the loss row where 'CNA' exceeds 'Mut' by the widest margin
    plot_cna_scores(iso_df.loc[(loss_df['CNA'] - loss_df['Mut']).idxmax(), :],
                    args, cdata)
Пример #17
0
def main():
    """Plot the gene weight coefficients inferred by a Stan classifier.

    Reads the Stan model, solve method, cohort, and gene from the command
    line, loads the saved model variables and the cohort, and plots the
    'gn_wghts' variable indexed by the cohort's genes other than the
    mutated gene.

    Raises:
        ValueError: If the saved variables do not include 'gn_wghts'.
    """
    # the help text has to be given as `description`: the first positional
    # parameter of ArgumentParser is `prog`, which would put this text in
    # place of the program name in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot the distributions of gene weight coefficients inferred by "
            "a given Stan classifier trained to predict the mutation status "
            "of a gene in a given TCGA cohort."
            )
        )

    parser.add_argument('model_name', type=str, help="label of a Stan model")
    parser.add_argument('solve_method', type=str,
                        help=("method used to obtain estimates for the "
                              "parameters of the model"))

    parser.add_argument('cohort', type=str, help="a TCGA cohort")
    parser.add_argument('gene', type=str, help="a mutated gene")

    # parse command-line arguments, create directory where plots will be
    # saved, load the saved model variables
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)
    vars_dict = load_vars(args.model_name, args.solve_method,
                          args.cohort, args.gene)

    if 'gn_wghts' not in vars_dict:
        raise ValueError("Can only plot inferred gene weights for a model "
                         "that includes them as variables!")

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ('/home/exacloud/lustre1/CompBio'
                                '/mgrzad/input-data/synapse')
    syn.login()

    cdata = MutationCohort(
        cohort=args.cohort, mut_genes=[args.gene], mut_levels=['Gene'],
        expr_source='Firehose', expr_dir=firehose_dir, var_source='mc3',
        syn=syn, cv_prop=1.0
        )

    # rows are labelled with the sorted cohort genes, leaving out the
    # mutated gene itself
    wghts_df = pd.DataFrame(vars_dict['gn_wghts'],
                            index=sorted(cdata.genes - {args.gene}))
    plot_weights_cov(wghts_df, args, cdata)
Пример #18
0
def main():
    """Plot an unsupervised clustering with gene subtypes highlighted.

    Reads the cohort, transform, mutation levels, and genes from the command
    line, loads the cohort, fits the named transform to it with
    `fit_transform_coh`, and calls `plot_subtype_clustering` once per gene.
    """
    # the help text has to be given as `description`: the first positional
    # parameter of ArgumentParser is `prog`, which would put this text in
    # place of the program name in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plots the clustering done by an unsupervised learning method "
            "on a TCGA cohort with subtypes of particular genes highlighted."
            )
        )

    parser.add_argument('cohort', type=str, help='a cohort in TCGA')
    parser.add_argument('transform',
                        type=str,
                        help='an unsupervised learning method')
    parser.add_argument('mut_levels',
                        type=str,
                        help='a set of mutation annotation levels')
    parser.add_argument('--genes',
                        type=str,
                        nargs='+',
                        default=['TP53'],
                        help='a list of mutated genes')

    # parse command-line arguments, create directory where plots will be saved
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # mutation levels arrive as one '__'-joined string on the command line
    cdata = MutationCohort(cohort=args.cohort,
                           mut_genes=args.genes,
                           mut_levels=['Gene'] + args.mut_levels.split('__'),
                           expr_source='Firehose',
                           expr_dir=firehose_dir,
                           cv_prop=1.0,
                           syn=syn)

    # NOTE(security): `eval` executes whatever expression is given as the
    # `transform` argument, so this script must only be run with trusted
    # input; an explicit mapping from names to transform classes would be a
    # safer way to resolve it
    mut_trans = eval(args.transform)()
    trans_expr = mut_trans.fit_transform_coh(cdata)

    for gene in args.genes:
        plot_subtype_clustering(trans_expr.copy(), args, cdata, gene)
Пример #19
0
def main():
    """Plot how well expression signatures separate isolated subtypes.

    Reads the cohort, gene, classifier, and mutation levels from the command
    line, loads the cohort and the saved inference output, computes the
    separation statistics with `get_separation`, and plots them.
    """
    # the help text has to be given as `description`: the first positional
    # parameter of ArgumentParser is `prog`, which would put this text in
    # place of the program name in the usage line
    parser = argparse.ArgumentParser(
        description=(
            "Plot how well expression signatures separate isolated mutation "
            "subtypes from non-mutated samples relative to how they "
            "separate mutated samples not belonging to the subtype."
            )
        )

    parser.add_argument('cohort', help='a TCGA cohort')
    parser.add_argument('gene', help='a mutated gene')
    parser.add_argument('classif', help='a mutation classifier')

    # a positional argument only falls back on its default when it is
    # optional, which requires nargs='?'
    parser.add_argument('mut_levels',
                        nargs='?',
                        default='Form_base__Exon',
                        help='a set of mutation annotation levels')
    parser.add_argument('--samp_cutoff', type=int, default=20)

    # parse command-line arguments, create directory where plots will be saved
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = syn_root
    syn.login()

    # mutation levels arrive as one '__'-joined string on the command line
    cdata = MutationCohort(cohort=args.cohort,
                           mut_genes=[args.gene],
                           mut_levels=args.mut_levels.split('__'),
                           expr_source='Firehose',
                           expr_dir=firehose_dir,
                           syn=syn,
                           cv_prop=1.0)

    infer_df = load_infer_output(
        os.path.join(base_dir, 'output', args.cohort, args.gene, args.classif,
                     'samps_{}'.format(args.samp_cutoff), args.mut_levels))
    auc_vals, sep_vals, prop_vals = get_separation(infer_df, args, cdata)

    plot_separation(auc_vals, sep_vals, prop_vals, args, cdata)
Пример #20
0
def _drop_duplicate_mtypes(mtype_sampsets, verbose=False):
    """Keep one sub-type for every distinct set of mutated samples.

    Args:
        mtype_sampsets: mapping from a sub-type to a hashable collection of
            the samples carrying it.
        verbose: whether to print progress and the removed duplicates.

    Returns:
        The set of retained sub-types. When several sub-types share the same
        sample set, the one that comes first in sort order is kept.
    """
    sub_mtypes = sorted(mtype_sampsets)
    if verbose:
        print("Found {} new sub-types!\n".format(len(sub_mtypes)))

    use_mtypes = set()
    use_sampsets = set()

    # This loop has to run regardless of verbosity; only the messages
    # inside it are conditional.
    for i, mtype in enumerate(sub_mtypes):
        if verbose and (i % 200) == 100:
            print("\nchecked {} sub-types\n".format(i))

        # ...we remove each one whose set of mutated samples is
        # identical to that of a sub-type that was already found
        if mtype_sampsets[mtype] in use_sampsets:
            if verbose:
                print("Removing functionally duplicate MuType {}"
                      .format(mtype))

        else:
            use_mtypes.add(mtype)
            use_sampsets.add(mtype_sampsets[mtype])

    return use_mtypes


def main():
    """Enumerate the mutation sub-types to test in a cohort and save them.

    Depending on `--mut_levels`, the sub-types are whole genes, combinations
    of up to two mutation types within each gene, or combinations of up to
    two mutation types across the whole mutation tree. The sorted list of
    sub-types, the cohort's samples, and the sub-type count are written to
    the setup directory of the cohort.
    """

    parser = argparse.ArgumentParser(
        description='Set up touring for sub-types to detect.'
        )
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")

    # optional command line arguments controlling the thresholds for which
    # individual mutations and how many genes' mutations are considered
    parser.add_argument('--freq_cutoff', type=float, default=0.02,
                        help='subtype sample frequency threshold')

    # optional command line arguments for what kinds of mutation sub-types to
    # look for in terms of properties and number of mutations to combine
    parser.add_argument('--mut_levels', type=str, default='Gene',
                        help='the mutation property levels to consider')

    # optional command line argument controlling verbosity
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse the command line arguments, get the directory where found sub-types
    # will be saved for future use
    args = parser.parse_args()
    out_path = os.path.join(base_dir, 'setup', args.cohort)
    os.makedirs(out_path, exist_ok=True)
    use_lvls = args.mut_levels.split('__')

    # log into Synapse using locally-stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(
        cohort=args.cohort, mut_genes=None, mut_levels=use_lvls,
        expr_source='Firehose', var_source='mc3', expr_dir=firehose_dir,
        cv_prop=1.0, samp_cutoff=args.freq_cutoff, syn=syn
        )

    if args.verbose:
        print("Found {} candidate genes with mutations in at least "
              "{:.1f}% of the samples in TCGA cohort {}.\nLooking for "
              "subtypes of these genes that are combinations of up to two "
              "mutations at annotation levels {} ...\n".format(
                  len(tuple(cdata.train_mut)), args.freq_cutoff * 100,
                  args.cohort, use_lvls
                )
             )

    # a sub-type needs at least `min_samps` mutated samples and has to leave
    # at least as many samples without the mutation
    min_samps = args.freq_cutoff * len(cdata.samples)
    max_samps = len(cdata.samples) - min_samps

    if use_lvls == ['Gene']:
        use_mtypes = {MuType({('Gene', gn): None})
                      for gn, mut in cdata.train_mut
                      if len(mut) >= min_samps}

    elif use_lvls[0] == 'Gene':
        use_lvls = use_lvls[1:]
        mtype_sampsets = dict()

        for gn, mut in cdata.train_mut:
            cur_mtypes = {
                MuType({('Gene', gn): mtype})
                for mtype in mut.combtypes(comb_sizes=(1, 2),
                                           sub_levels=use_lvls,
                                           min_type_size=min_samps)
                }

            # finds the samples belonging to each enumerated sub-type
            cur_sampsets = {
                mtype: frozenset(mtype.get_samples(cdata.train_mut))
                for mtype in cur_mtypes
                }

            # removes the sub-types with so many mutated samples that there
            # are not enough negatively-labelled samples for classification
            mtype_sampsets.update({
                mtype: sampset for mtype, sampset in cur_sampsets.items()
                if len(sampset) <= max_samps
                })

        # ensures that when two sub-types have the same samples the one
        # further down the sort order gets removed
        use_mtypes = _drop_duplicate_mtypes(mtype_sampsets, args.verbose)

    else:
        cur_mtypes = cdata.train_mut.combtypes(comb_sizes=(1, 2),
                                               sub_levels=use_lvls,
                                               min_type_size=min_samps)

        cur_sampsets = {mtype: frozenset(mtype.get_samples(cdata.train_mut))
                        for mtype in cur_mtypes}

        # removes the sub-types with so many mutated samples that there
        # are not enough negatively-labelled samples for classification
        mtype_sampsets = {
            mtype: sampset for mtype, sampset in cur_sampsets.items()
            if len(sampset) <= max_samps
            }

        # ensures that when two sub-types have the same samples the one
        # further down the sort order gets removed
        use_mtypes = _drop_duplicate_mtypes(mtype_sampsets, args.verbose)

    if args.verbose:
        print("\nFound {} total sub-types!".format(len(use_mtypes)))

    # save the list of found non-duplicate sub-types to file
    list_file = os.path.join(
        out_path, 'mtype_list__freq_{}__levels_{}.p'.format(
            args.freq_cutoff, args.mut_levels)
        )
    with open(list_file, 'wb') as fl:
        pickle.dump(sorted(use_mtypes), fl)

    with open(os.path.join(out_path, 'cohort_info.p'), 'wb') as fl:
        pickle.dump({'Samps': cdata.samples}, fl)

    with open(os.path.join(
            out_path,
            'mtype_count__freq_{}__levels_{}.txt'.format(
                args.freq_cutoff, args.mut_levels)), 'w') as fl:

        fl.write(str(len(use_mtypes)))
Пример #21
0
def main():
    """Enumerate the subtypes of one gene to isolate and save them.

    Loads the cohort's data for the given gene, asks the mutation tree for
    its unique subtypes at the requested annotation levels, drops those
    covering too many samples, and writes the sorted list and its length
    to the setup directory of the cohort and gene.
    """
    # The text must be given as `description`: the first positional
    # parameter of ArgumentParser is `prog`, so passing the sentence
    # positionally would print it in place of the program name.
    parser = argparse.ArgumentParser(
        description=(
            "Set up the gene subtype expression effect isolation experiment "
            "by enumerating the subtypes to be tested."
            )
        )

    # create positional command line arguments
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")
    parser.add_argument('gene', type=str, help="which gene to consider")
    parser.add_argument('mut_levels',
                        type=str,
                        help="the mutation property levels to consider")

    # create optional command line arguments
    parser.add_argument('--samp_cutoff',
                        type=int,
                        default=20,
                        help='subtype sample frequency threshold')
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help='turns on diagnostic messages')

    # parse command line arguments, create directory where found subtypes
    # will be stored
    args = parser.parse_args()
    use_lvls = args.mut_levels.split('__')
    out_path = os.path.join(base_dir, 'setup', args.cohort, args.gene)
    os.makedirs(out_path, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # load expression and variant call data for the given TCGA cohort
    cdata = MutationCohort(cohort=args.cohort,
                           mut_genes=[args.gene],
                           mut_levels=use_lvls,
                           expr_source='Firehose',
                           var_source='mc3',
                           expr_dir=firehose_dir,
                           cv_prop=1.0,
                           syn=syn)

    if args.verbose:
        print("Looking for combinations of subtypes of mutations in gene {} "
              "present in at least {} of the samples in TCGA cohort {} at "
              "annotation levels {}.\n".format(args.gene, args.samp_cutoff,
                                               args.cohort, use_lvls))

    # find mutation subtypes present in enough samples in the TCGA cohort
    iso_mtypes = cdata.train_mut.find_unique_subtypes(
        max_types=1000,
        max_combs=5,
        verbose=2,
        sub_levels=use_lvls,
        min_type_size=args.samp_cutoff)

    # filter out the subtypes that appear in too many samples for there to
    # be a wild-type class of sufficient size for classification
    max_samps = len(cdata.samples) - args.samp_cutoff
    use_mtypes = {
        mtype for mtype in iso_mtypes
        if len(mtype.get_samples(cdata.train_mut)) <= max_samps
    }

    if args.verbose:
        print("\nFound {} total sub-types to isolate!".format(len(use_mtypes)))

    # both output files share the same cutoff/levels suffix
    list_file = os.path.join(
        out_path, 'mtypes_list__samps_{}__levels_{}.p'.format(
            args.samp_cutoff, args.mut_levels))
    count_file = os.path.join(
        out_path, 'mtypes_count__samps_{}__levels_{}.txt'.format(
            args.samp_cutoff, args.mut_levels))

    # save the list of found non-duplicate subtypes to file
    with open(list_file, 'wb') as fl:
        pickle.dump(sorted(use_mtypes), fl)

    # save the number of found subtypes to file
    with open(count_file, 'w') as fl:
        fl.write(str(len(use_mtypes)))
Пример #22
0
def main():
    """Tune, fit and score a gene-level mutation classifier for one task.

    Reads the list of genes prepared for the expression source, cohort and
    sample cutoff, loads the cohort with a train/test split derived from
    `--cv_id`, and for every gene assigned to this task (by position modulo
    `--task_count`) tunes and fits the classifier, then records AUC, AUPR,
    the tuned parameters and the fitting time. The results are pickled into
    the output directory of the experiment.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('expr_source', type=str,
                        choices=['Firehose', 'toil', 'toil_tx'],
                        help='which TCGA expression data source to use')
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")

    parser.add_argument(
        'syn_root', type=str,
        help="the root cache directory for data downloaded from Synapse"
        )

    parser.add_argument(
        'samp_cutoff', type=int,
        help="minimum number of mutated samples needed to test a gene"
        )

    parser.add_argument('classif', type=str,
                        help='the name of a mutation classifier')

    parser.add_argument(
        '--cv_id', type=int, default=6732,
        help='the random seed to use for cross-validation draws'
        )

    parser.add_argument(
        '--task_count', type=int, default=10,
        help='how many parallel tasks the list of types to test is split into'
        )
    parser.add_argument('--task_id', type=int, default=0,
                        help='the subset of subtypes to assign to this task')

    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse command-line arguments, create directory where to save results
    args = parser.parse_args()
    out_path = os.path.join(
        base_dir, 'output', args.expr_source,
        '{}__samps-{}'.format(args.cohort, args.samp_cutoff), args.classif
        )

    # without this the final dump fails when the directory does not exist
    os.makedirs(out_path, exist_ok=True)

    gene_file = os.path.join(
        base_dir, "setup",
        "genes-list_{}__{}__samps-{}.p".format(
            args.expr_source, args.cohort, args.samp_cutoff)
        )
    with open(gene_file, 'rb') as fl:
        gene_list = pickle.load(fl)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = args.syn_root
    syn.login()

    # the table is indexed by expression source; its first remaining column
    # is used as the directory of that source
    expr_dir = pd.read_csv(
        os.path.join(base_dir, 'expr_sources.txt'),
        sep='\t', header=None, index_col=0
        ).loc[args.expr_source].iloc[0]

    cdata = MutationCohort(
        cohort=args.cohort, mut_genes=gene_list, mut_levels=['Gene'],
        expr_source=args.expr_source, expr_dir=expr_dir, var_source='mc3',
        syn=syn, cv_prop=0.75, cv_seed=2079 + 57 * args.cv_id
        )

    # the classifier is named as <module>__<class>, with the class name
    # capitalized before lookup
    clf_info = args.classif.split('__')
    clf_module = import_module(
        'HetMan.experiments.gene_baseline.models.{}'.format(clf_info[0]))
    mut_clf = getattr(clf_module, clf_info[1].capitalize())

    # only the genes assigned to this task end up in the output
    out_auc = dict()
    out_aupr = dict()
    out_params = dict()
    out_time = dict()

    for i, mut_gene in enumerate(gene_list):
        if (i % args.task_count) != args.task_id:
            continue

        if args.verbose:
            print("Testing {} ...".format(mut_gene))

        clf = mut_clf()
        mtype = MuType({('Gene', mut_gene): None})

        clf.tune_coh(cdata, mtype, exclude_genes={mut_gene},
                     tune_splits=4, test_count=24, parallel_jobs=16)
        tuned_params = clf.get_params()
        out_params[mut_gene] = {par: tuned_params[par]
                                for par, _ in mut_clf.tune_priors}

        # only the fitting step is timed
        t_start = time.time()
        clf.fit_coh(cdata, mtype, exclude_genes={mut_gene})
        out_time[mut_gene] = time.time() - t_start

        test_omics, test_pheno = cdata.test_data(
            mtype, exclude_genes={mut_gene})
        pred_scores = clf.predict_omic(test_omics)

        if len(set(test_pheno)) == 2:
            out_auc[mut_gene] = roc_auc_score(test_pheno, pred_scores)
            out_aupr[mut_gene] = average_precision_score(
                test_pheno, pred_scores)

        else:
            # the test labels do not contain exactly two classes, so fall
            # back on 0.5 for AUC and on the mutated fraction of the
            # training samples for AUPR
            out_auc[mut_gene] = 0.5
            out_aupr[mut_gene] = (len(mtype.get_samples(cdata.train_mut))
                                  / len(cdata.train_samps))

    out_file = os.path.join(
        out_path, 'out__cv-{}_task-{}.p'.format(args.cv_id, args.task_id))
    with open(out_file, 'wb') as fl:
        pickle.dump(
            {'AUC': out_auc, 'AUPR': out_aupr,
             'Clf': mut_clf, 'Params': out_params, 'Time': out_time},
            fl
            )
Пример #23
0
def main():
    """Enumerate the pairs of subtypes of one gene to cross-isolate.

    Loads the cohort's data for the given gene, finds its unique subtypes
    at the requested annotation levels, keeps those leaving enough samples
    without the mutation, and then keeps every pair of them that passes the
    size and overlap checks below. The sorted list of pairs and its length
    are written to the setup directory of the cohort and gene.
    """
    # The text must be given as `description`: the first positional
    # parameter of ArgumentParser is `prog`, so passing the sentence
    # positionally would print it in place of the program name.
    parser = argparse.ArgumentParser(
        description=(
            "Set up the gene subtype expression effect cross-isolation "
            "experiment by enumerating the pairs of subtypes to be tested."
            )
        )

    # create positional command line arguments
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")
    parser.add_argument('gene', type=str, help="which gene to consider")
    parser.add_argument('mut_levels',
                        type=str,
                        help='the mutation property levels to consider')

    # create optional command line arguments
    parser.add_argument('--samp_cutoff',
                        type=int,
                        default=25,
                        help='subtype sample frequency threshold')
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help='turns on diagnostic messages')

    # parse command line arguments, create directory where found subtypes
    # will be stored
    args = parser.parse_args()
    use_lvls = args.mut_levels.split('__')
    out_path = os.path.join(base_dir, 'setup', args.cohort, args.gene)
    os.makedirs(out_path, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(cohort=args.cohort,
                           mut_genes=[args.gene],
                           mut_levels=use_lvls,
                           expr_source='Firehose',
                           var_source='mc3',
                           expr_dir=firehose_dir,
                           cv_prop=1.0,
                           syn=syn)

    if args.verbose:
        print("Looking for combinations of subtypes of mutations in gene {} "
              "present in at least {} of the samples in TCGA cohort {} at "
              "annotation levels {}.\n".format(args.gene, args.samp_cutoff,
                                               args.cohort, use_lvls))

    cross_mtypes = cdata.train_mut.find_unique_subtypes(
        max_types=100,
        max_combs=10,
        verbose=2,
        sub_levels=use_lvls,
        min_type_size=args.samp_cutoff)

    # the largest mutated group that still leaves `samp_cutoff` samples
    # outside of it
    max_samps = len(cdata.samples) - args.samp_cutoff

    mtype_samps = {
        mtype: mtype.get_samples(cdata.train_mut)
        for mtype in cross_mtypes
    }
    cross_mtypes = {
        mtype for mtype in cross_mtypes
        if len(mtype_samps[mtype]) <= max_samps
    }

    if args.verbose:
        print("\nFound {} total sub-types to cross!".format(len(cross_mtypes)))

    # a pair is kept when each member has enough samples not shared with the
    # other, their union still leaves enough samples outside of it, and the
    # two subtypes have an empty intersection
    use_pairs = set()
    for mtype1, mtype2 in combn(cross_mtypes, 2):
        samps1 = mtype_samps[mtype1]
        samps2 = mtype_samps[mtype2]

        if (len(samps1 - samps2) >= args.samp_cutoff
                and len(samps2 - samps1) >= args.samp_cutoff
                and len(samps1 | samps2) <= max_samps
                and (mtype1 & mtype2).is_empty()):
            use_pairs.add((mtype1, mtype2))

    if args.verbose:
        print("\nFound {} non-overlapping sub-type pairs!".format(
            len(use_pairs)))

    list_file = os.path.join(
        out_path, 'pairs_list__samps_{}__levels_{}.p'.format(
            args.samp_cutoff, args.mut_levels))
    count_file = os.path.join(
        out_path, 'pairs_count__samps_{}__levels_{}.txt'.format(
            args.samp_cutoff, args.mut_levels))

    # save the list of found non-duplicate sub-types to file
    with open(list_file, 'wb') as fl:
        pickle.dump(sorted(use_pairs), fl)

    with open(count_file, 'w') as fl:
        fl.write(str(len(use_pairs)))
Пример #24
0
def main():
    """Enumerate the subtypes to isolate across a list of genes.

    For every gene, point-mutation and copy-number subtypes are collected
    and turned into one-element tuples (a subtype on its own) and
    two-element tuples (a pair of subtypes), keeping those with enough
    samples after the exclusions computed below. Whole-gene combinations
    are added afterwards. The sorted list of tuples and its length are
    written to the setup directory of the cohort and gene list.
    """
    # The text must be given as `description`: the first positional
    # parameter of ArgumentParser is `prog`, so passing the sentence
    # positionally would print it in place of the program name.
    parser = argparse.ArgumentParser(
        description=(
            "Set up the paired-gene subtype expression effect isolation "
            "experiment by enumerating the subtypes to be tested."
            )
        )

    # create positional command line arguments
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")
    parser.add_argument('mut_levels',
                        type=str,
                        help="the mutation property levels to consider")
    parser.add_argument('genes',
                        type=str,
                        nargs='+',
                        help="a list of mutated genes")

    # create optional command line arguments
    parser.add_argument('--samp_cutoff',
                        type=int,
                        default=20,
                        help='subtype sample frequency threshold')
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help='turns on diagnostic messages')

    # parse command line arguments, create directory where found subtypes
    # will be stored
    args = parser.parse_args()
    use_lvls = args.mut_levels.split('__')
    out_path = os.path.join(base_dir, 'setup', args.cohort,
                            '_'.join(args.genes))
    os.makedirs(out_path, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = syn_root
    syn.login()

    cdata = MutationCohort(cohort=args.cohort,
                           mut_genes=args.genes,
                           mut_levels=['Gene'] + use_lvls,
                           expr_source='Firehose',
                           var_source='mc3',
                           copy_source='Firehose',
                           annot_file=annot_file,
                           expr_dir=expr_dir,
                           domain_dir=domain_dir,
                           cv_prop=1.0,
                           syn=syn)

    # the largest mutated group that still leaves `samp_cutoff` samples
    # outside of it
    max_samps = len(cdata.samples) - args.samp_cutoff
    iso_mtypes = set()

    for gene in args.genes:
        gene_mut = cdata.train_mut[gene]

        # samples mutated in any of the other listed genes; the empty-set
        # initial value keeps this working when only one gene was given
        # NOTE(review): assumes get_samples() returns a set -- confirm
        other_samps = reduce(or_, [
            cdata.train_mut[other_gn].get_samples()
            for other_gn in set(args.genes) - {gene}
        ], set())

        if args.verbose:
            print("Looking for combinations of subtypes of mutations in gene "
                  "{} present in at least {} of the samples in TCGA cohort "
                  "{} at annotation levels {}.\n".format(
                      gene, args.samp_cutoff, args.cohort, use_lvls))

        pnt_mtypes = gene_mut['Point'].find_unique_subtypes(
            max_types=500,
            max_combs=2,
            verbose=2,
            sub_levels=use_lvls,
            min_type_size=args.samp_cutoff)

        # filter out the subtypes that appear in too many samples for there to
        # be a wild-type class of sufficient size for classification
        pnt_mtypes = {
            MuType({('Scale', 'Point'): mtype})
            for mtype in pnt_mtypes
            if len(mtype.get_samples(gene_mut['Point'])) <= max_samps
        }
        pnt_mtypes |= {MuType({('Scale', 'Point'): None})}

        # copy number subtypes, plus the pooled gain and pooled loss types
        cna_mtypes = gene_mut['Copy'].branchtypes(min_size=args.samp_cutoff)
        cna_mtypes |= {MuType({('Copy', ('HetGain', 'HomGain')): None})}
        cna_mtypes |= {MuType({('Copy', ('HetDel', 'HomDel')): None})}

        cna_mtypes = {
            MuType({('Scale', 'Copy'): mtype})
            for mtype in cna_mtypes
            if len(mtype.get_samples(gene_mut['Copy'])) <= max_samps
        }

        all_mtype = MuType(gene_mut.allkey())
        use_mtypes = pnt_mtypes | cna_mtypes

        # a subtype is kept on its own when enough of its samples carry no
        # other mutation of this gene and no mutation of the other genes
        only_mtypes = set()
        for mtype in use_mtypes:
            excl_samps = (mtype.get_samples(gene_mut)
                          - (all_mtype - mtype).get_samples(gene_mut)
                          - other_samps)

            if len(excl_samps) >= args.samp_cutoff:
                only_mtypes.add((MuType({('Gene', gene): mtype}), ))

        # a pair is kept when the two subtypes have an empty intersection
        # and enough samples remain after the set operations below
        comb_mtypes = set()
        for mtype1, mtype2 in combn(use_mtypes, 2):
            if not (mtype1 & mtype2).is_empty():
                continue

            samps1 = mtype1.get_samples(gene_mut)
            samps2 = mtype2.get_samples(gene_mut)
            both_samps = ((samps1 & samps2)
                          - (samps1 ^ samps2)
                          - (all_mtype - mtype1 - mtype2).get_samples(gene_mut)
                          - other_samps)

            if len(both_samps) >= args.samp_cutoff:
                comb_mtypes.add((MuType({('Gene', gene): mtype1}),
                                 MuType({('Gene', gene): mtype2})))

        iso_mtypes |= only_mtypes | comb_mtypes
        if args.verbose:
            print(
                "\nFound {} exclusive sub-types and {} combination sub-types "
                "to isolate!".format(len(only_mtypes), len(comb_mtypes)))

    # every proper, non-empty subset of the listed genes, tested as a whole
    # against the remaining genes
    for cur_genes in chain.from_iterable(
            combn(args.genes, r) for r in range(1, len(args.genes))):
        gene_mtype = MuType({('Gene', cur_genes): None})
        rest_mtype = MuType({
            ('Gene', tuple(set(args.genes) - set(cur_genes))):
            None
        })

        excl_count = len(gene_mtype.get_samples(cdata.train_mut)
                         - rest_mtype.get_samples(cdata.train_mut))
        if args.samp_cutoff <= excl_count <= max_samps:
            iso_mtypes |= {(gene_mtype, )}

    if args.verbose:
        print("\nFound {} total sub-types to isolate!".format(len(iso_mtypes)))

    list_file = os.path.join(
        out_path, 'mtypes_list__samps_{}__levels_{}.p'.format(
            args.samp_cutoff, args.mut_levels))
    count_file = os.path.join(
        out_path, 'mtypes_count__samps_{}__levels_{}.txt'.format(
            args.samp_cutoff, args.mut_levels))

    # save the list of found non-duplicate sub-types to file
    with open(list_file, 'wb') as fl:
        pickle.dump(sorted(iso_mtypes), fl)

    with open(count_file, 'w') as fl:
        fl.write(str(len(iso_mtypes)))
Пример #25
0
def main():
    """Test a classifier on the sub-types assigned to one parallel task.

    Reads the pickled list of sub-types, loads the cohort with a train/test
    split derived from `cv_id`, and for every sub-type whose position in the
    sorted list matches `task_id` modulo `--task_count`, tunes and fits the
    classifier and evaluates it with `eval_coh`. The performances, the tuned
    hyper-parameter values and the tuning settings are pickled to `out_dir`.
    """

    parser = argparse.ArgumentParser(
        description=("Test a classifier's ability to predict the presence "
                     "of a list of sub-types."))

    # positional command line arguments for where input data and output
    # data is to be stored
    parser.add_argument('mtype_file',
                        type=str,
                        help='the pickle file where sub-types are stored')
    parser.add_argument('out_dir',
                        type=str,
                        help='where to save the output of testing sub-types')

    # positional arguments for which cohort of samples and which mutation
    # classifier to use for testing
    parser.add_argument('cohort', type=str, help='a TCGA cohort')
    parser.add_argument('classif',
                        type=str,
                        help='a classifier in HetMan.predict.classifiers')

    # positional arguments controlling CV and task selection
    parser.add_argument('cv_id',
                        type=int,
                        help='a random seed used for cross-validation')
    parser.add_argument('task_id',
                        type=int,
                        help='the subset of sub-types to assign to this task')

    parser.add_argument(
        '--task_count',
        type=int,
        default=10,
        help='how many parallel tasks the list of types to test is split into')

    # optional arguments controlling how classifier tuning is to be performed
    parser.add_argument(
        '--tune_splits',
        type=int,
        default=4,
        help='how many training cohort splits to use for tuning')
    parser.add_argument(
        '--test_count',
        type=int,
        default=16,
        help='how many hyper-parameter values to test in each tuning split')
    parser.add_argument(
        '--parallel_jobs',
        type=int,
        default=8,
        help='how many parallel CPUs to allocate the tuning tests across')

    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help='turns on diagnostic messages')

    args = parser.parse_args()
    if args.verbose:
        print("Starting testing for sub-types in\n{}\nwith "
              "cross-validation ID {} and task ID {} ...".format(
                  args.mtype_file, args.cv_id, args.task_id))

    # NOTE(review): unpickling executes code embedded in the file, so
    # `mtype_file` must come from a trusted source
    with open(args.mtype_file, 'rb') as fl:
        mtype_list = sorted(pickle.load(fl))

    out_file = os.path.join(
        args.out_dir, 'out__cv-{}_task-{}.p'.format(args.cv_id, args.task_id))

    # loads the pipeline used for classifying variants, gets the mutated
    # genes for each variant under consideration
    # NOTE(review): eval() runs whatever expression is passed on the command
    # line; resolving the name through an explicit lookup would be safer
    mut_clf = eval(args.classif)

    # a plain comprehension also copes with an empty list of sub-types,
    # where reducing without an initial value would raise
    use_genes = {gn for mtype in mtype_list
                 for gn, _ in mtype.subtype_list()}

    # logs into Synapse using locally-stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # loads the expression data and gene mutation data for the given TCGA
    # cohort, with the training/testing cohort split defined by the
    # cross-validation id for this task
    cdata = MutationCohort(cohort=args.cohort,
                           mut_genes=list(use_genes),
                           mut_levels=['Gene', 'Form_base', 'Exon', 'Protein'],
                           expr_source='Firehose',
                           expr_dir=firehose_dir,
                           syn=syn,
                           cv_seed=(args.cv_id + 3) * 19,
                           cv_prop=2.0 / 3)

    if args.verbose:
        print("Loaded {} sub-types over {} genes which will be tested using "
              "classifier {} in cohort {} with {} samples.".format(
                  len(mtype_list), len(use_genes), args.classif, args.cohort,
                  len(cdata.samples)))

    # the dictionaries that will store classification performances and
    # hyper-parameter values; only sub-types assigned to this task get an
    # entry
    out_acc = dict()
    out_par = dict()

    # for each sub-variant, check if it has been assigned to this task
    for i, mtype in enumerate(mtype_list):
        if (i % args.task_count) != args.task_id:
            continue

        if args.verbose:
            print("Testing {} ...".format(mtype))

        # gets the genes that this variant mutates, initializes the
        # classification pipeline
        ex_genes = {gn for gn, _ in mtype.subtype_list()}
        clf = mut_clf()

        # tunes the classifier using the training cohort
        clf.tune_coh(cdata,
                     mtype,
                     exclude_genes=ex_genes,
                     tune_splits=args.tune_splits,
                     test_count=args.test_count,
                     parallel_jobs=args.parallel_jobs)
        tuned_params = clf.get_params()
        out_par[mtype] = {par: tuned_params[par]
                          for par, _ in clf.tune_priors}

        # fits the tuned classifier on the training cohort, evaluates its
        # performance on the testing cohort and saves the results
        clf.fit_coh(cdata, mtype, exclude_genes=ex_genes)
        out_acc[mtype] = clf.eval_coh(cdata, mtype, exclude_genes=ex_genes)

    # saves the performance measurements and tuned hyper-parameter values
    # for each sub-type to file
    with open(out_file, 'wb') as fl:
        pickle.dump(
            {
                'Acc': out_acc,
                'Par': out_par,
                'Info': {
                    'TuneSplits': args.tune_splits,
                    'TestCount': args.test_count,
                    'ParallelJobs': args.parallel_jobs
                }
            }, fl)
Пример #26
0
def main():
    """Runs the paired subtype isolation experiment.

    Parses the command line arguments, loads the pickled list of subtype
    pairs, builds a cohort covering the genes and annotation levels these
    pairs span, and then, for every pair assigned to this task, tunes the
    classifier and infers scores for each subtype of the pair while passing
    the samples of the other subtype as `exclude_samps` (tuning) and
    `force_test_samps` (inference). The inferred scores are pickled to
    `out__task-<task_id>.p` inside the given output directory.
    """

    # the text is passed as `description`; the first positional argument of
    # ArgumentParser is `prog`, which would put it in the usage line instead
    parser = argparse.ArgumentParser(
        description=(
            "Isolate the expression signatures of pairs of mutation subtypes "
            "against one another from their parent gene(s)' signature or "
            "that of a list of genes in a given TCGA cohort."
            )
        )

    # positional command line arguments for where input data and output
    # data is to be stored
    parser.add_argument('mtype_file', type=str,
                        help='the pickle file where sub-types are stored')
    parser.add_argument('out_dir', type=str,
                        help='where to save the output of testing sub-types')

    # positional arguments for which cohort of samples and which mutation
    # classifier to use for testing
    parser.add_argument('cohort', type=str, help='a TCGA cohort')
    parser.add_argument('classif', type=str,
                        help='a classifier in HetMan.predict.classifiers')

    # NOTE(review): `--cv_id` is parsed but never read below; the cohort is
    # always built with the fixed seed `cv_seed=9999`
    parser.add_argument(
        '--cv_id', type=int, default=4309,
        help='the random seed to use for cross-validation draws'
        )

    # optional arguments controlling how the pairs are spread across tasks
    parser.add_argument(
        '--task_count', type=int, default=10,
        help='how many parallel tasks the list of types to test is split into'
        )
    parser.add_argument('--task_id', type=int, default=0,
                        help='the subset of subtypes to assign to this task')

    # optional arguments controlling how classifier tuning is to be performed
    parser.add_argument(
        '--tune_splits', type=int, default=4,
        help='how many training cohort splits to use for tuning'
        )
    parser.add_argument(
        '--test_count', type=int, default=16,
        help='how many hyper-parameter values to test in each tuning split'
        )

    # optional arguments controlling how inference is to be performed
    parser.add_argument(
        '--infer_splits', type=int, default=20,
        help='how many cohort splits to use for inference bootstrapping'
        )
    parser.add_argument(
        '--infer_folds', type=int, default=4,
        help=('how many parts to split the cohort into in each inference '
              'cross-validation run')
        )

    parser.add_argument(
        '--parallel_jobs', type=int, default=4,
        help='how many parallel CPUs to allocate the tuning tests across'
        )

    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    args = parser.parse_args()
    out_file = os.path.join(args.out_dir,
                            'out__task-{}.p'.format(args.task_id))

    # NOTE: unpickling can execute arbitrary code, so `mtype_file` must come
    # from a trusted source; the handle is closed as soon as loading is done
    with open(args.mtype_file, 'rb') as fl:
        pair_list = pickle.load(fl)

    # collect every annotation level used by any pair, keeping each level
    # only once and in the order in which it is first encountered
    use_lvls = []
    for lvls in reduce(or_, [{(mtype1 | mtype2).get_sorted_levels()}
                             for mtype1, mtype2 in pair_list]):
        for lvl in lvls:
            if lvl not in use_lvls:
                use_lvls.append(lvl)

    if args.verbose:
        print("Starting paired isolation for sub-types in\n{}\n at "
              "annotation levels {}, the results of which will be stored "
              "in\n{}\nin cohort {} with classifier <{}>.".format(
                  args.mtype_file, use_lvls, args.out_dir,
                  args.cohort, args.classif
                ))

    # the union of all genes mutated by either subtype of any pair
    use_genes = reduce(or_, [(set(gn for gn, _ in mtype1.subtype_list())
                              | set(gn for gn, _ in mtype2.subtype_list()))
                             for mtype1, mtype2 in pair_list])

    # classifiers named `Stan__<model>` are loaded from the Stan model
    # modules, anything else is resolved as a name available in this module
    if args.classif[:6] == 'Stan__':
        use_module = import_module('HetMan.experiments.utilities'
                                   '.stan_models.{}'.format(
                                       args.classif.split('Stan__')[1]))
        mut_clf = getattr(use_module, 'UsePipe')

    else:
        # NOTE(review): `eval` on a command line argument executes arbitrary
        # code; it is kept because the set of valid classifier names is not
        # visible from here -- consider an explicit lookup table instead
        mut_clf = eval(args.classif)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # loads the expression data and gene mutation data for the given TCGA
    # cohort using a fixed seed and `cv_prop=1.0`
    cdata = MutationCohort(
        cohort=args.cohort, mut_genes=list(use_genes), mut_levels=use_lvls,
        expr_source='Firehose', expr_dir=firehose_dir,
        syn=syn, cv_seed=9999, cv_prop=1.0
        )

    if args.verbose:
        print("Loaded {} pairs of subtypes of which roughly {} will be "
              "isolated in cohort {} with {} samples.".format(
                  len(pair_list), len(pair_list) // args.task_count,
                  args.cohort, len(cdata.samples)
                ))

    # one output slot for each direction of each pair
    out_cross = {(mtype1, mtype2): None for mtype1, mtype2 in pair_list}
    out_cross.update({(mtype2, mtype1): None for mtype1, mtype2 in pair_list})

    # smallest number of samples that must carry only one subtype of a pair,
    # and that must carry neither subtype, for the pair to be tested
    min_samps = 10

    # for each pair, check if it has been assigned to this task
    for i, (mtype1, mtype2) in enumerate(pair_list):

        # pairs belonging to other tasks are dropped from this task's output
        if (i % args.task_count) != args.task_id:
            del out_cross[(mtype1, mtype2)]
            del out_cross[(mtype2, mtype1)]
            continue

        clf = mut_clf()
        if args.verbose:
            print("Pairing {} and {} ...".format(mtype1, mtype2))

        samps1 = mtype1.get_samples(cdata.train_mut)
        samps2 = mtype2.get_samples(cdata.train_mut)

        ex_genes = set(gn for gn, _ in mtype1.subtype_list())
        ex_genes |= set(gn for gn, _ in mtype2.subtype_list())

        # skip the pair if too few samples carry neither of the subtypes;
        # its output slots then keep their initial value of None
        if len(samps1 | samps2) > (len(cdata.samples) - min_samps):
            continue

        # isolate each subtype against the other in turn, reusing the same
        # classifier object for both directions
        for cur_mtype, cur_samps, other_mtype, other_samps in (
                (mtype1, samps1, mtype2, samps2),
                (mtype2, samps2, mtype1, samps1)
                ):

            if min_samps <= len(cur_samps - other_samps):
                clf.tune_coh(cdata, cur_mtype, exclude_genes=ex_genes,
                             exclude_samps=other_samps,
                             tune_splits=args.tune_splits,
                             test_count=args.test_count,
                             parallel_jobs=args.parallel_jobs)

                out_cross[(cur_mtype, other_mtype)] = clf.infer_coh(
                    cdata, cur_mtype, exclude_genes=ex_genes,
                    force_test_samps=other_samps,
                    infer_splits=args.infer_splits,
                    infer_folds=args.infer_folds,
                    parallel_jobs=args.parallel_jobs
                    )

    # the output is assembled before the file is opened so that a failure
    # here cannot leave behind an empty output file
    out_data = {'Infer': out_cross,
                'Info': {'TunePriors': mut_clf.tune_priors,
                         'TuneSplits': args.tune_splits,
                         'TestCount': args.test_count}}

    # saves the inferred scores and the tuning setup to file
    with open(out_file, 'wb') as fl:
        pickle.dump(out_data, fl)
Пример #27
0
def main():
    """Runs the setup step that enumerates pairs of subtypes to test.

    Finds subtypes of each of the two given genes in the given cohort,
    keeps the cross-gene pairs in which each subtype has at least
    `samp_cutoff` samples not carrying the other, and saves to the setup
    directory of this cohort and gene pair: the sorted list of pairs, the
    result of `cdata.mutex_test` for each pair, the cohort's samples, and a
    text file with the number of pairs.
    """

    parser = argparse.ArgumentParser(
        description='Set up touring for sub-types to detect.')

    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")
    parser.add_argument('gene1', type=str, help="which gene to consider")
    parser.add_argument('gene2', type=str, help="which gene to consider")

    parser.add_argument(
        'mut_levels',
        type=str,
        help='the mutation property levels to consider, in addition to `Gene`')

    parser.add_argument('--samp_cutoff',
                        type=int,
                        default=20,
                        help='subtype sample frequency threshold')
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help='turns on diagnostic messages')

    # parse the command line arguments, get the directory where found sub-types
    # will be saved for future use
    args = parser.parse_args()
    out_path = os.path.join(base_dir, 'setup', args.cohort,
                            '{}_{}'.format(args.gene1, args.gene2))
    os.makedirs(out_path, exist_ok=True)

    # mutation levels are given on the command line joined by `__`
    use_lvls = args.mut_levels.split('__')

    def out_fl(template):
        """Gets the output file path for a name template that has slots
        for the sample cutoff and the mutation levels."""
        return os.path.join(
            out_path, template.format(args.samp_cutoff, args.mut_levels))

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(cohort=args.cohort,
                           mut_genes=[args.gene1, args.gene2],
                           mut_levels=['Gene'] + use_lvls,
                           expr_source='Firehose',
                           var_source='mc3',
                           expr_dir=firehose_dir,
                           cv_prop=1.0,
                           syn=syn)

    # the same search settings are used for both genes
    search_args = dict(max_types=40,
                       max_combs=50,
                       verbose=2,
                       sub_levels=use_lvls,
                       min_type_size=args.samp_cutoff)

    cross_mtypes1 = cdata.train_mut[args.gene1].find_unique_subtypes(
        **search_args)
    cross_mtypes2 = cdata.train_mut[args.gene2].find_unique_subtypes(
        **search_args)

    if args.verbose:
        print("Found {} sub-types of {} and {} sub-types of {} "
              "to cross!".format(len(cross_mtypes1), args.gene1,
                                 len(cross_mtypes2), args.gene2))

    # the subtypes were found within each gene's branch of the mutation
    # data, so they are wrapped with their gene before being used on the
    # whole cohort
    cross_mtypes1 = {
        MuType({('Gene', args.gene1): mtype})
        for mtype in cross_mtypes1
    }
    cross_mtypes2 = {
        MuType({('Gene', args.gene2): mtype})
        for mtype in cross_mtypes2
    }

    # look up the samples of each subtype once instead of once per pair
    samps1 = {
        mtype: mtype.get_samples(cdata.train_mut)
        for mtype in cross_mtypes1
    }
    samps2 = {
        mtype: mtype.get_samples(cdata.train_mut)
        for mtype in cross_mtypes2
    }

    # keep the pairs where each subtype has enough samples of its own
    use_pairs = sorted(
        (mtype1, mtype2)
        for mtype1, mtype2 in product(cross_mtypes1, cross_mtypes2)
        if (len(samps1[mtype1] - samps2[mtype2]) >= args.samp_cutoff
            and len(samps2[mtype2] - samps1[mtype1]) >= args.samp_cutoff))

    if args.verbose:
        print("\nSaving {} pairs with sufficient "
              "exclusivity...".format(len(use_pairs)))

    # every output file is written inside a `with` block so that its handle
    # is flushed and closed before the next step starts
    with open(out_fl('pairs_list__samps_{}__levels_{}.p'), 'wb') as fl:
        pickle.dump(use_pairs, fl)

    # the tests are run before the file is opened so that a failure in them
    # cannot leave behind an empty output file
    mutex_dict = {(mtype1, mtype2): cdata.mutex_test(mtype1, mtype2)
                  for mtype1, mtype2 in use_pairs}
    with open(out_fl('pairs_mutex__samps_{}__levels_{}.p'), 'wb') as fl:
        pickle.dump(mutex_dict, fl)

    with open(os.path.join(out_path, 'cohort_info.p'), 'wb') as fl:
        pickle.dump({'Samps': cdata.samples}, fl)

    with open(out_fl('pairs_count__samps_{}__levels_{}.txt'), 'w') as fl:
        fl.write(str(len(use_pairs)))
Пример #28
0
def main():
    """Runs the distribution test of a Stan model on one mutated gene.

    Parses the command line arguments, picks the classifier for the given
    model, loads the cohort for the given gene with a seed derived from
    `cv_id`, tunes and fits the classifier, infers scores across the cohort,
    and pickles the tuned parameters, the inferred scores and the fitted
    variable means to `out__cv-<cv_id>.p` in the output directory of this
    model, solve method, cohort and gene.

    Raises:
        ValueError: If `solve_method` is not one of `optim`, `variat`
            or `sampl`.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('model_name', type=str,
                        help='the name of a Stan model')
    parser.add_argument(
        'solve_method', type=str,
        help='the method used for optimizing the parameters of the Stan model'
        )

    parser.add_argument('cohort', type=str, help='a TCGA cohort')
    parser.add_argument('gene', type=str, help='a gene with mutated samples')

    parser.add_argument('cv_id', type=int,
                        help='a random seed used for cross-validation')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    args = parser.parse_args()
    out_path = os.path.join(base_dir, 'output', args.model_name,
                            args.solve_method, args.cohort, args.gene)

    # the gene and the cohort are passed in the order in which the message
    # names them
    if args.verbose:
        print("Starting distribution testing for Stan model {} using "
              "optimization method {} on mutated gene {} in TCGA cohort {} "
              "for cross-validation ID {} ...".format(
                  args.model_name, args.solve_method,
                  args.gene, args.cohort, args.cv_id
                ))

    use_module = import_module('HetMan.experiments.stan_test'
                               '.distr.models.{}'.format(args.model_name))
    UsePipe = getattr(use_module, 'UsePipe')

    # maps each supported solve method to the name of the class in the
    # model's module that implements it
    solver_names = {'optim': 'UseOptimizing',
                    'variat': 'UseVariational',
                    'sampl': 'UseSampling'}

    if args.solve_method not in solver_names:
        raise ValueError("Unrecognized <solve_method> argument!")

    clf_stan = UsePipe(
        getattr(use_module, solver_names[args.solve_method])(
            model_code=getattr(use_module, 'use_model'))
        )

    # a gene given as `<gene>_<label>` selects the entry of `mtype_list` for
    # that label within the gene, otherwise any mutation of the gene is used
    if '_' in args.gene:
        mut_info = args.gene.split('_')
        use_mtype = MuType({('Gene', mut_info[0]): mtype_list[mut_info[1]]})

    else:
        use_mtype = MuType({('Gene', args.gene): None})

    # NOTE(review): this replaces the pipeline built from `solve_method`
    # above, which is therefore never used -- confirm which one is intended.
    # The lookup is done directly rather than through `eval` so that the
    # command line argument is never executed as code.
    clf_stan = model_dict[args.model_name]

    # created here, like the other experiment scripts do, so that the
    # results of the computation below are not lost to a missing directory
    os.makedirs(out_path, exist_ok=True)

    # NOTE(review): `syn` is not defined in this function and so has to be a
    # module-level Synapse client -- confirm. When the gene is given in the
    # `<gene>_<label>` form the full string is still what is passed as
    # `mut_genes` and `exclude_genes` -- confirm this is intended.
    cdata = MutationCohort(
        cohort=args.cohort, mut_genes=[args.gene], mut_levels=['Gene'],
        expr_source='Firehose', expr_dir=firehose_dir, var_source='mc3',
        syn=syn, cv_prop=1.0, cv_seed=1298 + 93 * args.cv_id
        )

    clf_stan.tune_coh(cdata, use_mtype, exclude_genes={args.gene},
                      tune_splits=4, test_count=24, parallel_jobs=12)
    clf_stan.fit_coh(cdata, use_mtype, exclude_genes={args.gene})

    # parameter values are only saved for classifiers with tuning priors
    if clf_stan.tune_priors:
        clf_params = clf_stan.get_params()
    else:
        clf_params = None

    infer_mat = clf_stan.infer_coh(
        cdata, use_mtype, exclude_genes={args.gene},
        infer_splits=12, infer_folds=4, parallel_jobs=12
        )

    # the output is assembled before the file is opened so that a failure
    # here cannot leave behind an empty output file
    out_data = {'Params': clf_params, 'Infer': infer_mat,
                'Vars': clf_stan.named_steps['fit'].get_var_means()}

    with open(os.path.join(out_path,
                           'out__cv-{}.p'.format(args.cv_id)), 'wb') as fl:
        pickle.dump(out_data, fl)
Пример #29
0
def _quantile_cutoffs(vals, group_size=20):
    """Gets the unique quantiles of values taken every `group_size` samples.

    Args:
        vals: A pandas Series of alteration scores.
        group_size: How many samples should separate consecutive quantiles.

    Returns:
        A sorted array of unique cutoff values, which is empty when `vals`
        is empty instead of failing on a division by zero.
    """
    if len(vals) == 0:
        return np.array([])

    step = group_size / len(vals)
    return np.unique(vals.quantile(np.arange(step, 1, step)))


def main():
    """Runs the setup step that enumerates copy number cutoffs to test.

    Loads the cohort with continuous copy number scores for the given gene,
    takes quantiles of the negative and of the positive scores of the
    samples selected by `~mut_stat` as candidate cutoffs, and keeps the
    cutoff pairs for which at least 20 samples fall in each of the altered
    and the wild-type groups. The sorted list of pairs and their count are
    saved to the `ctf_lists` and `ctf_counts` setup directories.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Set up the copy number alteration expression effect isolation "
            "experiment by enumerating alteration score thresholds to be "
            "tested."
            )
        )

    # create command line arguments
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")
    parser.add_argument('gene', type=str, help="which gene to consider")
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help='turns on diagnostic messages')

    # parse command line arguments, create directory where found thresholds
    # and threshold counts will be stored
    args = parser.parse_args()
    os.makedirs(os.path.join(base_dir, 'setup', 'ctf_lists'), exist_ok=True)
    os.makedirs(os.path.join(base_dir, 'setup', 'ctf_counts'), exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # load expression, variant call, and copy number alteration data for
    # the given TCGA cohort and mutated gene
    cdata = MutationCohort(cohort=args.cohort,
                           mut_genes=[args.gene],
                           mut_levels=['Gene'],
                           expr_source='Firehose',
                           var_source='mc3',
                           expr_dir=firehose_dir,
                           copy_source='Firehose',
                           copy_dir=copy_dir,
                           copy_discrete=False,
                           cv_prop=1.0,
                           syn=syn)

    # smallest number of samples needed in each of the altered and the
    # wild-type groups for a pair of cutoffs to be kept
    min_samps = 20

    ctf_list = []
    mut_stat = np.array(cdata.train_mut.status(cdata.copy_data.index))
    mut_pheno = np.array(cdata.train_pheno(MuType({('Gene', args.gene):
                                                   None})))

    # candidate cutoffs come from the scores of the samples for which
    # `mut_stat` is false, split into losses and gains
    copy_vals = cdata.copy_data.loc[~mut_stat, args.gene]
    loss_vals = copy_vals[copy_vals < 0]
    gain_vals = copy_vals[copy_vals > 0]

    # loss cutoffs are in ascending order, gain cutoffs in descending order,
    # so that both lists start with the most extreme value
    loss_ctfs = _quantile_cutoffs(loss_vals, group_size=20)
    gain_ctfs = _quantile_cutoffs(gain_vals, group_size=20)[::-1]

    for low_ctf, high_ctf in combn(loss_ctfs, 2):
        cna_stat = (~mut_pheno
                    & cdata.train_pheno({
                        'Gene': args.gene,
                        'CNA': 'Loss',
                        'Cutoff': low_ctf
                    }))

        wt_stat = (~mut_pheno
                   & ~cdata.train_pheno({
                       'Gene': args.gene,
                       'CNA': 'Range',
                       'Cutoff': (low_ctf, high_ctf)
                   })
                   & ~cdata.train_pheno({
                       'Gene': args.gene,
                       'CNA': 'Gain',
                       'Cutoff': -high_ctf
                   }))

        if np.sum(cna_stat) >= min_samps and np.sum(wt_stat) >= min_samps:
            ctf_list += [(low_ctf, high_ctf)]

    for high_ctf, low_ctf in combn(gain_ctfs, 2):
        cna_stat = (~mut_pheno
                    & cdata.train_pheno({
                        'Gene': args.gene,
                        'CNA': 'Gain',
                        'Cutoff': high_ctf
                    }))

        wt_stat = (~mut_pheno
                   & ~cdata.train_pheno({
                       'Gene': args.gene,
                       'CNA': 'Range',
                       'Cutoff': (low_ctf, high_ctf)
                   })
                   & ~cdata.train_pheno({
                       'Gene': args.gene,
                       'CNA': 'Loss',
                       'Cutoff': -low_ctf
                   }))

        if np.sum(cna_stat) >= min_samps and np.sum(wt_stat) >= min_samps:
            ctf_list += [(low_ctf, high_ctf)]

    # save the sorted list of found cutoff pairs to file; the list is sorted
    # before the file is opened so that a failure cannot leave behind an
    # empty output file
    sorted_ctfs = sorted(ctf_list)
    with open(
            os.path.join(base_dir, 'setup', 'ctf_lists',
                         '{}_{}.p'.format(args.cohort, args.gene)),
            'wb') as fl:
        pickle.dump(sorted_ctfs, fl)

    # save the number of found cutoff pairs to file
    with open(
            os.path.join(base_dir, 'setup', 'ctf_counts',
                         '{}_{}.txt'.format(args.cohort, args.gene)),
            'w') as fl:

        fl.write(str(len(ctf_list)))
Пример #30
0
def main():
    """Runs the setup step that enumerates pairs of genes to test.

    Loads the given cohort with gene-level mutation data, keeps the pairs
    of genes in which each gene has at least `samp_cutoff` samples not
    mutated in the other and at least `samp_cutoff` samples are mutated in
    neither, and saves the sorted list of pairs and their count to the
    setup directory of this cohort.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Set up the paired gene expression effect isolation experiment "
            "by enumerating the dyads of genes to be tested."
            )
        )

    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")
    parser.add_argument('--samp_cutoff',
                        type=int,
                        default=40,
                        help='subtype sample frequency threshold')
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help='turns on diagnostic messages')

    # parse command line arguments, create directory where found pairs
    # will be stored
    args = parser.parse_args()
    out_path = os.path.join(base_dir, 'setup', args.cohort)
    os.makedirs(out_path, exist_ok=True)

    # log into Synapse using locally stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(cohort=args.cohort,
                           mut_genes=None,
                           mut_levels=['Gene'],
                           expr_source='Firehose',
                           var_source='mc3',
                           expr_dir=firehose_dir,
                           samp_cutoff=args.samp_cutoff,
                           cv_prop=1.0,
                           syn=syn)

    if args.verbose:
        print("Looking for pairs of mutated genes present in at least {} of "
              "the samples in TCGA cohort {} with {} total samples.".format(
                  args.samp_cutoff, args.cohort, len(cdata.samples)))

    # the most samples that may be mutated in either gene of a pair
    max_mutated = len(cdata.samples) - args.samp_cutoff

    # keep the pairs where each gene has enough samples of its own and
    # enough samples are left without a mutation in either gene
    gene_pairs = {
        (MuType({('Gene', gn1): None}), MuType({('Gene', gn2): None}))
        for (gn1, muts1), (gn2, muts2) in combn(cdata.train_mut, r=2)
        if (len(muts1 - muts2) >= args.samp_cutoff
            and len(muts2 - muts1) >= args.samp_cutoff
            and len(muts1 | muts2) <= max_mutated)
    }

    if args.verbose:
        print("Found {} pairs of genes to isolate!".format(len(gene_pairs)))

    # the pairs are sorted before the file is opened so that a failure
    # cannot leave behind an empty output file
    sorted_pairs = sorted(gene_pairs)
    with open(
            os.path.join(out_path,
                         'pairs_list__samps_{}.p'.format(args.samp_cutoff)),
            'wb') as fl:
        pickle.dump(sorted_pairs, fl)

    # save the number of found pairs to file
    with open(
            os.path.join(out_path,
                         'pairs_count__samps_{}.txt'.format(args.samp_cutoff)),
            'w') as fl:

        fl.write(str(len(gene_pairs)))