Python merge_cohort_data示例，HetMan.experiments.variant_baseline.merge_tests.merge_cohort_data Python示例

示例#1

0

显示文件

def main():
    """Plot an unsupervised clustering of a cohort alongside TCGA subtypes.

    Command-line arguments:
        out_dir: directory handed to ``merge_cohort_data`` to load the cohort.
        transform: key of ``clust_algs`` selecting the unsupervised method.
        --use_seed: value given to ``np.random.seed`` (default 1301).

    Raises:
        ValueError: if the cohort's source label does not appear in the
            ``DISEASE`` column of the subtype table read from ``type_file``.
    """
    # ArgumentParser's first positional parameter is `prog`; the summary must
    # be passed by keyword to be shown as the description in `--help`.
    parser = argparse.ArgumentParser(
        description=(
            "Plots the clustering done by an unsupervised learning method "
            "on a TCGA cohort with molecular subtypes highlighted."
        )
    )

    parser.add_argument('out_dir', type=str)
    parser.add_argument('transform',
                        type=str,
                        choices=list(clust_algs.keys()),
                        help='an unsupervised learning method')
    parser.add_argument('--use_seed', type=int, default=1301)

    args = parser.parse_args()
    np.random.seed(args.use_seed)
    cdata = merge_cohort_data(args.out_dir)
    type_data = pd.read_csv(type_file, sep='\t', index_col=0, comment='#')

    # Only the part of the cohort label before the first underscore is matched
    # against the subtype table; a label without an underscore is used whole.
    use_cohort = cdata.cohort.split('_')[0]

    if use_cohort not in type_data.DISEASE.values:
        raise ValueError("The source of this cohort ({}) does not "
                         "match those present in the TCGA subtypes "
                         "file!".format(use_cohort))

    type_data = type_data[type_data.DISEASE == use_cohort]
    trans_expr = clust_algs[args.transform].fit_transform_coh(cdata)

    # A copy is passed so the plotting routine cannot alter `trans_expr`.
    plot_clustering(trans_expr.copy(), args, cdata, type_data)

示例#2

0

显示文件

def main():
    """Plot performance and tuning characteristics of a Stan classifier.

    Command-line arguments:
        cohort: which TCGA cohort was used.
        gene: a mutated gene.
        model_name: which mutation classifier was tested.

    Reads the cohort through ``merge_cohort_data`` and the pickled output
    ``out-data__<model_name>.p`` from ``base_dir/<cohort>__<gene>``, then
    calls the AUC distribution, generalization error and tuning profile
    plotting routines.
    """
    # ArgumentParser's first positional parameter is `prog`; the summary must
    # be passed by keyword to be shown as the description in `--help`.
    parser = argparse.ArgumentParser(
        description=(
            "Plots the performance and tuning characteristics of a Stan "
            "model in classifying the mutation status of the genes in a "
            "given cohort."
        )
    )

    parser.add_argument('cohort', type=str, help="which TCGA cohort was used")
    parser.add_argument('gene', type=str, help="a mutated gene")
    parser.add_argument('model_name',
                        type=str,
                        help="which mutation classifier was tested")

    args = parser.parse_args()
    out_tag = "{}__{}".format(args.cohort, args.gene)

    # Plots are grouped by the part of the model name before the first '__'.
    os.makedirs(os.path.join(plot_dir, out_tag,
                             args.model_name.split('__')[0]),
                exist_ok=True)

    out_dir = os.path.join(base_dir, out_tag)
    cdata = merge_cohort_data(out_dir)

    out_file = os.path.join(out_dir,
                            "out-data__{}.p".format(args.model_name))
    with open(out_file, 'rb') as fl:
        out_dict = pickle.load(fl)

    plot_auc_distribution(out_dict['Fit']['Acc'], args)
    plot_generalization_error(out_dict['Fit']['Acc'], args)
    plot_tuning_profile(out_dict['Tune']['Acc'], args, cdata)

示例#3

0

显示文件

def main():
    """Plot general information about one run of the experiment.

    Command-line arguments:
        expr_source: which TCGA expression data source was used.
        cohort: which TCGA cohort was used.
        samp_cutoff: minimum number of mutated samples needed to test a gene.
        model_name: which mutation classifier was tested.

    Reads the cohort and the compressed pickled output from
    ``base_dir/<expr_source>__<cohort>__samps-<samp_cutoff>`` and calls the
    label stability, label correlation, AUC distribution, accuracy quartile
    and tuning plotting routines. The tuning grid plot is only requested when
    the classifier has more than one entry in ``tune_priors``.
    """
    # ArgumentParser's first positional parameter is `prog`; the summary must
    # be passed by keyword to be shown as the description in `--help`.
    parser = argparse.ArgumentParser(
        description=("Plots general information about a "
                     "particular run of the experiment.")
    )

    parser.add_argument('expr_source',
                        type=str,
                        help="which TCGA expression data source was used")
    parser.add_argument('cohort', type=str, help="which TCGA cohort was used")

    parser.add_argument(
        'samp_cutoff',
        type=int,
        help="minimum number of mutated samples needed to test a gene")

    parser.add_argument('model_name',
                        type=str,
                        help="which mutation classifier was tested")

    args = parser.parse_args()
    out_tag = "{}__{}__samps-{}".format(args.expr_source, args.cohort,
                                        args.samp_cutoff)

    # Plots are grouped by the part of the model name before the first '__'.
    os.makedirs(os.path.join(
        plot_dir, args.expr_source,
        "{}__samps-{}".format(args.cohort, args.samp_cutoff),
        args.model_name.split('__')[0]),
                exist_ok=True)

    out_dir = os.path.join(base_dir, out_tag)
    cdata = merge_cohort_data(out_dir)

    # NOTE(review): the file is named `.p.gz` but is read with bz2 —
    # presumably it was written with bz2 as well; confirm with the writer.
    out_file = os.path.join(out_dir,
                            "out-data__{}.p.gz".format(args.model_name))
    with bz2.BZ2File(out_file, 'r') as fl:
        out_dict = pickle.load(fl)

    test_fit = out_dict['Fit']['test']

    plot_label_stability(out_dict['Scores'], test_fit.AUC, args, cdata)
    plot_label_correlation(out_dict['Scores'], test_fit.AUC, args, cdata)

    plot_auc_distribution(test_fit.AUC, args)
    plot_acc_quartiles(test_fit.AUC, test_fit.AUPR, args, cdata)

    plot_tuning_mtype(out_dict['Params'], test_fit.AUC, out_dict['Clf'],
                      args, cdata)
    if len(out_dict['Clf'].tune_priors) > 1:
        plot_tuning_mtype_grid(out_dict['Params'], test_fit.AUC,
                               out_dict['Clf'], args, cdata)

示例#4

0

显示文件

def main():
    """Plot how well every tested model predicted mutations in a cohort.

    Command-line arguments:
        cohort: which TCGA cohort was used.

    Scans ``base_dir`` for ``*__<cohort>__samps-*/out-data__*.p.gz`` outputs,
    keeps the lowest sample cutoff for each (model, expression source) pair,
    loads the matching cohort data and experiment output, and calls the AUC
    highlight and AUPR/time plotting routines.
    """
    # ArgumentParser's first positional parameter is `prog`; the summary must
    # be passed by keyword to be shown as the description in `--help`.
    parser = argparse.ArgumentParser(
        description=(
            "Plots the success of all models tested in predicting the "
            "presence of the mutations in a given cohort."
        )
    )

    # parse command-line arguments, create directory to store the plots
    parser.add_argument('cohort', type=str, help="which TCGA cohort was used")
    args = parser.parse_args()
    os.makedirs(plot_dir, exist_ok=True)

    # search for experiment output directories corresponding to this cohort;
    # each entry is a (directory name, file name) pair
    out_datas = [
        out_file.parts[-2:] for out_file in Path(base_dir).glob(
            "*__{}__samps-*/out-data__*.p.gz".format(args.cohort))
    ]

    # get the experiment output directory for each combination of input
    # expression source and algorithm with the lowest sample incidence cutoff;
    # the result is indexed by (Source, Samps) with a single `Model` column
    out_use = pd.DataFrame([{
        'Source':
        '__'.join(out_data[0].split('__')[:-2]),
        'Samps':
        int(out_data[0].split('__samps-')[1]),
        'Model':
        out_data[1].split('out-data__')[1].split('.p')[0]
    } for out_data in out_datas]).groupby(
        ['Model',
         'Source'])['Samps'].min().reset_index('Model').set_index('Samps',
                                                                  append=True)

    # load the cohort expression and mutation data for each combination of
    # expression source and sample cutoff
    cdata_dict = {(src, ctf): merge_cohort_data(
        os.path.join(base_dir,
                     "{}__{}__samps-{}".format(src, args.cohort, ctf)))
                  for src, ctf in set(out_use.index)}

    # load the experiment output for each combination of source and cutoff;
    # a context manager is used so every compressed file gets closed
    out_dict = dict()
    for (src, ctf), mdl in out_use.iterrows():
        mdl_name = mdl.values[0]
        out_path = os.path.join(
            base_dir, "{}__{}__samps-{}".format(src, args.cohort, ctf),
            "out-data__{}.p.gz".format(mdl_name))

        with bz2.BZ2File(out_path, 'r') as fl:
            out_dict[src, mdl_name] = pickle.load(fl)

    # create the plots
    plot_auc_highlights(out_dict.copy(), args, cdata_dict)
    plot_aupr_time(out_dict.copy(), args)
    plot_aupr_time(out_dict.copy(), args)

示例#5

0

显示文件

def main():
    """Plot performance and tuning of a copy number score model.

    Command-line arguments:
        expr_source: which TCGA expression data source was used.
        cohort: which TCGA cohort was used.
        samp_cutoff: minimum number of mutated samples needed to test a gene.
        model_name: which mutation classifier was tested.

    Reads the cohort and the compressed pickled output from
    ``base_dir/<expr_source>__<cohort>__samps-<samp_cutoff>`` and calls the
    label stability, label correlation, correlation distribution and tuning
    plotting routines. The tuning grid plot is only requested when the
    classifier has more than one entry in ``tune_priors``.
    """
    # ArgumentParser's first positional parameter is `prog`; the summary must
    # be passed by keyword to be shown as the description in `--help`.
    parser = argparse.ArgumentParser(
        description=(
            "Plots the performance and tuning characteristics of a model in "
            "classifying the copy number scores of the genes in a given "
            "cohort."
        )
    )

    parser.add_argument('expr_source',
                        type=str,
                        help="which TCGA expression data source was used")
    parser.add_argument('cohort', type=str, help="which TCGA cohort was used")

    parser.add_argument(
        'samp_cutoff',
        type=int,
        help="minimum number of mutated samples needed to test a gene")

    parser.add_argument('model_name',
                        type=str,
                        help="which mutation classifier was tested")

    args = parser.parse_args()
    out_tag = "{}__{}__samps-{}".format(args.expr_source, args.cohort,
                                        args.samp_cutoff)

    # Plots are grouped by the part of the model name before the first '__'.
    os.makedirs(os.path.join(
        plot_dir, args.expr_source,
        "{}__samps-{}".format(args.cohort, args.samp_cutoff),
        args.model_name.split('__')[0]),
                exist_ok=True)

    out_dir = os.path.join(base_dir, out_tag)
    cdata = merge_cohort_data(out_dir)

    out_file = os.path.join(out_dir,
                            "out-data__{}.p.gz".format(args.model_name))
    with bz2.BZ2File(out_file, 'r') as fl:
        out_dict = pickle.load(fl)

    test_cor = out_dict['Fit']['test'].Cor

    plot_label_stability(out_dict['Scores'], test_cor, args, cdata)
    plot_label_correlation(out_dict['Scores'], test_cor, args, cdata,
                           plot_dir)

    plot_cor_distribution(test_cor, args)

    plot_tuning_gene(out_dict['Params'], test_cor, out_dict['Clf'], args,
                     cdata)
    if len(out_dict['Clf'].tune_priors) > 1:
        plot_tuning_gene_grid(out_dict['Params'], test_cor, out_dict['Clf'],
                              args, cdata)

示例#6

0

显示文件

def main():
    """Plot relationships between the outputs of models tested on a cohort.

    Command-line arguments:
        expr_source: which TCGA expression data source was used.
        cohort: which TCGA cohort was used.

    Scans ``base_dir`` for ``<expr_source>__<cohort>__samps-*`` output
    directories, keeps the lowest sample cutoff found for each model, loads
    the matching cohort data and experiment output, and calls the model
    correlation plotting routine.
    """
    # ArgumentParser's first positional parameter is `prog`; the summary must
    # be passed by keyword to be shown as the description in `--help`.
    parser = argparse.ArgumentParser(
        description=("Plots the relationships between the "
                     "outputs of mutation prediction models "
                     "tested in a given cohort's dataset.")
    )

    parser.add_argument('expr_source',
                        type=str,
                        help="which TCGA expression data source was used")
    parser.add_argument('cohort', type=str, help="which TCGA cohort was used")

    args = parser.parse_args()
    os.makedirs(os.path.join(plot_dir, args.expr_source), exist_ok=True)

    # each entry is a (directory name, file name) pair
    out_datas = [
        out_file.parts[-2:] for out_file in Path(
            base_dir).glob("{}__{}__samps-*/out-data__*.p.gz".format(
                args.expr_source, args.cohort))
    ]

    # Series mapping each model name to its lowest sample cutoff
    out_use = pd.DataFrame([{
        'Samps':
        int(out_data[0].split('__samps-')[1]),
        'Model':
        out_data[1].split('out-data__')[1].split('.p')[0]
    } for out_data in out_datas]).groupby(['Model'])['Samps'].min()

    # one cohort object per distinct cutoff value
    cdata_dict = {
        ctf: merge_cohort_data(
            os.path.join(
                base_dir, "{}__{}__samps-{}".format(args.expr_source,
                                                    args.cohort, ctf)))
        for ctf in set(out_use)
    }

    # `Series.iteritems` was removed in pandas 2.0; `items` is the equivalent.
    # A context manager is used so every compressed file gets closed.
    out_dict = dict()
    for mdl, ctf in out_use.items():
        out_path = os.path.join(
            base_dir,
            "{}__{}__samps-{}".format(args.expr_source, args.cohort, ctf),
            "out-data__{}.p.gz".format(mdl))

        with bz2.BZ2File(out_path, 'r') as fl:
            out_dict[mdl] = pickle.load(fl)

    # create the plots
    plot_model_correlation(out_dict.copy(), args, cdata_dict)

示例#7

0

显示文件

def main():
    """Plot performance and tuning of a copy number score model.

    Command-line arguments:
        expr_source: which TCGA expression data source was used.
        cohort: which TCGA cohort was used.
        model_name: which mutation classifier was tested.

    Uses the output directory with the lowest sample cutoff available for the
    given source, cohort and model, then calls the generalization error,
    tuning distribution and tuning profile plotting routines. The tuning
    profile grid is only requested when the classifier has exactly two
    entries in ``tune_priors``.

    Raises:
        ValueError: if no matching experiment output is found in ``base_dir``.
    """
    # ArgumentParser's first positional parameter is `prog`; the summary must
    # be passed by keyword to be shown as the description in `--help`.
    parser = argparse.ArgumentParser(
        description=(
            "Plots the performance and tuning characteristics of a model in "
            "classifying the copy number scores of the genes in a given "
            "cohort."
        )
    )

    parser.add_argument('expr_source', type=str,
                        help="which TCGA expression data source was used")
    parser.add_argument('cohort', type=str, help="which TCGA cohort was used")
    parser.add_argument('model_name', type=str,
                        help="which mutation classifier was tested")

    args = parser.parse_args()

    # Plots are grouped by the part of the model name before the first '__'.
    os.makedirs(os.path.join(
        plot_dir, '__'.join([args.expr_source, args.cohort]),
        args.model_name.split('__')[0]
        ), exist_ok=True)

    # lowest sample cutoff for which this model has an output file
    use_ctf = min(
        (int(out_file.parts[-2].split('__samps-')[1])
         for out_file in Path(base_dir).glob(
             "{}__{}__samps-*/out-data__{}.p.gz".format(
                 args.expr_source, args.cohort, args.model_name)
             )),
        default=None
        )

    # A bare min() over no matches would raise the same exception type with
    # an opaque "empty sequence" message; say what is actually missing.
    if use_ctf is None:
        raise ValueError(
            "No experiment output found for source `{}`, cohort `{}` and "
            "model `{}`!".format(args.expr_source, args.cohort,
                                 args.model_name)
            )

    out_tag = "{}__{}__samps-{}".format(
        args.expr_source, args.cohort, use_ctf)
    cdata = merge_cohort_data(os.path.join(base_dir, out_tag))

    with bz2.BZ2File(os.path.join(base_dir, out_tag,
                                  "out-data__{}.p.gz".format(
                                      args.model_name)),
                     'r') as fl:
        out_dict = pickle.load(fl)

    plot_generalization_error(out_dict['Fit']['train'].Cor,
                              out_dict['Fit']['test'].Cor, args)
    plot_tuning_distribution(out_dict['Params'], out_dict['Fit']['test'].Cor,
                             out_dict['Clf'], args, cdata)

    plot_tuning_profile(out_dict['Tune']['Acc'], out_dict['Clf'], args, cdata)
    if len(out_dict['Clf'].tune_priors) == 2:
        plot_tuning_profile_grid(out_dict['Tune']['Acc'], out_dict['Clf'],
                                 args, cdata)

示例#8

0

显示文件

def main():
    """Plot label distributions for the best-scoring genes of a regressor.

    Command-line arguments:
        expr_source: which TCGA expression data source was used.
        cohort: which TCGA cohort was used.
        samp_cutoff: minimum number of mutated samples needed to test a gene.
        model_name: which mutation classifier was tested.

    For every row of the test ``Cor`` table whose first-quartile value lies
    above the 80th percentile of those first-quartile values, calls
    ``plot_label_distribution``.
    """
    # ArgumentParser's first positional parameter is `prog`; the summary must
    # be passed by keyword to be shown as the description in `--help`.
    parser = argparse.ArgumentParser(
        description=(
            "Plots the distributions of the labels assigned by a copy "
            "number alteration score regressor for a set of genetic "
            "features."
        )
    )

    parser.add_argument('expr_source',
                        type=str,
                        help="which TCGA expression data source was used")
    parser.add_argument('cohort', type=str, help="which TCGA cohort was used")

    parser.add_argument(
        'samp_cutoff',
        type=int,
        help="minimum number of mutated samples needed to test a gene")

    parser.add_argument('model_name',
                        type=str,
                        help="which mutation classifier was tested")

    args = parser.parse_args()
    out_tag = "{}__{}__samps-{}".format(args.expr_source, args.cohort,
                                        args.samp_cutoff)

    # Unlike the directory used for other plots, the full model name is used
    # here rather than only the part before the first '__'.
    os.makedirs(os.path.join(
        plot_dir, args.expr_source,
        "{}__samps-{}".format(args.cohort, args.samp_cutoff), args.model_name),
                exist_ok=True)

    out_dir = os.path.join(base_dir, out_tag)
    cdata = merge_cohort_data(out_dir)

    out_file = os.path.join(out_dir,
                            "out-data__{}.p.gz".format(args.model_name))
    with bz2.BZ2File(out_file, 'r') as fl:
        out_dict = pickle.load(fl)

    # first quartile of each row's correlation values across the columns
    cor_vals = out_dict['Fit']['test']['Cor'].quantile(q=0.25, axis=1)

    # keep only rows scoring above the 80th percentile of those quartiles
    top_genes = cor_vals.index[cor_vals > cor_vals.quantile(q=0.8)]
    for gene in top_genes:
        plot_label_distribution(gene, out_dict['Scores'], args, cdata)

示例#9

0

显示文件

def main():
    """Plot an example of how mutation overlap affects classification.

    Command-line arguments:
        cohort: a TCGA cohort.
        classif: a mutation classifier.

    Picks the output directory with the lowest sample cutoff for the given
    cohort and classifier, loads the cohort and the pickled similarity and
    inference outputs, selects mutation pairs on distinct listed genes whose
    AUCs are all at least 0.8, and calls ``plot_base_classification``.

    Raises:
        ValueError: if no matching experiment output is found in ``base_dir``.
    """
    # ArgumentParser's first positional parameter is `prog`; the summary must
    # be passed by keyword to be shown as the description in `--help`.
    parser = argparse.ArgumentParser(
        description=(
            "Plot an example diagram showing how overlap with other types "
            "of mutations can affect a mutation classification task."
        )
    )

    # parse command line arguments, create directory where plots will be saved
    parser.add_argument('cohort', help='a TCGA cohort')
    parser.add_argument('classif', help='a mutation classifier')
    args = parser.parse_args()
    os.makedirs(os.path.join(plot_dir, args.cohort), exist_ok=True)

    # search for experiment output directories corresponding to this cohort
    out_datas = [
        out_file.parts[-2:] for out_file in Path(base_dir).glob(
            "{}__samps-*/out-data__{}.p".format(args.cohort, args.classif))
    ]

    # np.argmin raises the same exception type on an empty list, but with a
    # message that does not say which output is missing.
    if not out_datas:
        raise ValueError(
            "No experiment output found for cohort `{}` and classifier "
            "`{}`!".format(args.cohort, args.classif))

    # use the directory with the lowest sample cutoff
    use_dir = out_datas[np.argmin(
        [int(out_data[0].split('__samps-')[1]) for out_data in out_datas])][0]
    cdata = merge_cohort_data(os.path.join(base_dir, use_dir), use_seed=671)

    # load inferred mutation relationship metrics generated by the experiment
    with open(
            os.path.join(base_dir, use_dir,
                         "out-simil__{}.p".format(args.classif)), 'rb') as f:
        stat_dict, auc_dict, mutex_dict, siml_dict = pickle.load(f)

    # keep genes marked 'Yes' in at least three of the four listed columns
    gene_df = pd.read_csv(gene_list, sep='\t', skiprows=1, index_col=0)
    use_genes = gene_df.index[(gene_df.loc[:, [
        'Vogelstein', 'SANGER CGC(05/30/2017)', 'FOUNDATION ONE', 'MSK-IMPACT'
    ]] == 'Yes').sum(axis=1) >= 3]

    # find mutation pairs for which the classifier was able to successfully
    # predict the presence of each mutation in isolation from the other
    auc_df = (pd.DataFrame(auc_dict) >= 0.8).all(axis=0)
    use_mtypes = [
        (mtype1, mtype2) for (mtype1, mtype2) in auc_df.index[auc_df]
        if (mtype1.subtype_list()[0][0] in use_genes
            and mtype2.subtype_list()[0][0] in use_genes and (
                mtype1.subtype_list()[0][0] != mtype2.subtype_list()[0][0]))
    ]

    # NOTE(review): `.diff()[1]` relies on how the row's index resolves the
    # key 1 (label vs. position) — confirm against the layout of `siml_dict`.
    siml_df = pd.DataFrame({
        'Occur':
        pd.Series(mutex_dict)[use_mtypes],
        'SimilMean':
        pd.Series({
            mtypes: siml_dict[mtypes].loc['Other'].mean()
            for mtypes in use_mtypes
        }),
        'SimilDiff':
        pd.Series({
            mtypes: np.abs(siml_dict[mtypes].loc['Other'].diff()[1])
            for mtypes in use_mtypes
        }),
        'SynerMean':
        pd.Series({
            mtypes: siml_dict[mtypes].loc['Both'].mean()
            for mtypes in use_mtypes
        }),
        'SynerDiff':
        pd.Series({
            mtypes: np.abs(siml_dict[mtypes].loc['Both'].diff()[1])
            for mtypes in use_mtypes
        }),
    })

    # two rankings of the candidate pairs, each sorted in ascending order
    good_exs = {
        'Conv':
        (siml_df.Occur * siml_df.SimilMean + siml_df.SimilDiff).sort_values(),
        'Divr':
        (siml_df.Occur + siml_df.SimilMean - siml_df.SimilDiff).sort_values()
    }

    with open(
            os.path.join(base_dir, use_dir,
                         "out-data__{}.p".format(args.classif)), 'rb') as f:
        out_infer = pickle.load(f)['Infer'].loc[use_mtypes]

    plot_base_classification(good_exs, stat_dict, out_infer, auc_dict, cdata,
                             args)

示例#10

0

显示文件

def main():
    """Plot how a model transfers to cohorts other than its training cohort.

    Command-line arguments:
        expr_source: which TCGA expression data source was used.
        cohort: which TCGA cohort was used.
        model_name: which mutation classifier was tested.

    Uses the output directory with the lowest sample cutoff for the given
    source, cohort and model. Mutation types whose first-quartile test AUC is
    at least 0.7 are kept; for each transfer cohort their mutation status is
    collected, Spearman correlations and AUCs of the transferred scores are
    computed, and the transfer AUC, label stability and AUC comparison
    plotting routines are called.

    Raises:
        ValueError: if no matching experiment output is found in ``base_dir``.
    """
    # ArgumentParser's first positional parameter is `prog`; the summary must
    # be passed by keyword to be shown as the description in `--help`.
    parser = argparse.ArgumentParser(
        description=(
            "Plots the performance of a model in predicting the presence of "
            "mutations in cohorts other than the one it was trained on."
        )
    )

    parser.add_argument('expr_source',
                        type=str,
                        help="which TCGA expression data source was used")
    parser.add_argument('cohort', type=str, help="which TCGA cohort was used")
    parser.add_argument('model_name',
                        type=str,
                        help="which mutation classifier was tested")

    args = parser.parse_args()
    os.makedirs(os.path.join(plot_dir,
                             '__'.join([args.expr_source, args.cohort]),
                             args.model_name.split('__')[0]),
                exist_ok=True)

    # lowest sample cutoff for which this model has an output file
    use_ctf = min(
        (int(out_file.parts[-2].split('__samps-')[1]) for out_file in Path(
            base_dir).glob("{}__{}__samps-*/out-data__{}.p.gz".format(
                args.expr_source, args.cohort, args.model_name))),
        default=None)

    # A bare min() over no matches would raise the same exception type with
    # an opaque "empty sequence" message; say what is actually missing.
    if use_ctf is None:
        raise ValueError(
            "No experiment output found for source `{}`, cohort `{}` and "
            "model `{}`!".format(args.expr_source, args.cohort,
                                 args.model_name))

    out_tag = "{}__{}__samps-{}".format(args.expr_source, args.cohort, use_ctf)
    cdata = merge_cohort_data(os.path.join(base_dir, out_tag))

    with bz2.BZ2File(
            os.path.join(base_dir, out_tag,
                         "out-data__{}.p.gz".format(args.model_name)),
            'r') as fl:
        out_dict = pickle.load(fl)

    auc_vals = out_dict['Fit']['test'].AUC.quantile(q=0.25, axis=1)
    use_mtypes = auc_vals[auc_vals >= 0.7].index

    # Iterate over a snapshot of the keys: entries of `out_dict['Trnsf']` are
    # deleted inside the loop, which is an error while iterating a live dict.
    stat_dict = dict()
    for coh in list(out_dict['Trnsf']):
        stat_dict[coh] = dict()

        with open(
                os.path.join(os.environ['TEMPDIR'], 'HetMan',
                             'variant_baseline', args.expr_source, 'setup',
                             "{}__cohort-data.p".format(coh)), 'rb') as f:
            trnsf_cdata = pickle.load(f)

        # substring test: the transfer cohort's name occurs in the training
        # cohort's name, so the two may share samples
        if coh in args.cohort:
            # True for transfer samples that were also training samples
            sub_stat = np.array([
                smp in cdata.get_train_samples()
                for smp in trnsf_cdata.get_train_samples()
            ])

            if (~sub_stat).any():
                # keep only the samples not used for primary learning
                out_dict['Trnsf'][coh] = out_dict['Trnsf'][coh].iloc[
                    ~sub_stat, :]

                for mtype in use_mtypes:
                    trnsf_stat = np.array(trnsf_cdata.train_pheno(mtype))
                    stat_dict[coh][mtype] = trnsf_stat[~sub_stat]

                    assert (np.bincount(trnsf_stat[sub_stat]) == np.bincount(
                        cdata.train_pheno(mtype))).all(), (
                            "{} cohort used for transfer learning does "
                            "not match the one used for primary "
                            "learning!".format(coh))

            else:
                # every sample overlaps with training: nothing to evaluate
                del (out_dict['Trnsf'][coh])

        else:
            for mtype in use_mtypes:
                stat_dict[coh][mtype] = np.array(
                    trnsf_cdata.train_pheno(mtype))

    # Spearman correlations between score columns, only for mutation types
    # with at least 20 mutated samples in the transfer cohort
    corr_df = pd.DataFrame.from_records({
        coh: {
            mtype: {
                'random':
                trnsf_vals[mtype].iloc[:, :25].corr(method='spearman'),
                'fivefold':
                trnsf_vals[mtype].iloc[:, 25:50].corr(method='spearman'),
                'all':
                trnsf_vals[mtype].corr(method='spearman'),
            }
            for mtype, mut_stat in stat_dict[coh].items()
            if mut_stat.sum() >= 20
        }
        for coh, trnsf_vals in out_dict['Trnsf'].items()
    })

    # AUC as the fraction of (mutated, wild-type) score pairs in which the
    # mutated score is greater, with ties counted as one half; the last
    # column of the scores is left out
    auc_df = pd.DataFrame.from_records({
        coh: {
            mtype:
            (np.greater.outer(trnsf_vals[mtype].iloc[mut_stat, :-1],
                              trnsf_vals[mtype].iloc[~mut_stat, :-1]).mean() +
             np.equal.outer(trnsf_vals[mtype].iloc[mut_stat, :-1],
                            trnsf_vals[mtype].iloc[~mut_stat, :-1]).mean() / 2)
            for mtype, mut_stat in stat_dict[coh].items()
            if mut_stat.sum() >= 20
        }
        for coh, trnsf_vals in out_dict['Trnsf'].items()
    })

    # drop columns with no AUC at all, then add a pooled `All` column
    auc_df = auc_df.iloc[:, ~auc_df.isna().all().values]
    auc_df['All'] = -1.

    for mtype in auc_df.index:
        mut_arr = [
            trnsf_vals[mtype].iloc[stat_dict[coh][mtype], :-1]
            for coh, trnsf_vals in out_dict['Trnsf'].items()
        ]
        mut_vals = np.concatenate([vals.values.flatten() for vals in mut_arr])

        wt_arr = [
            trnsf_vals[mtype].iloc[~stat_dict[coh][mtype], :-1]
            for coh, trnsf_vals in out_dict['Trnsf'].items()
        ]
        wt_vals = np.concatenate([vals.values.flatten() for vals in wt_arr])

        auc_df.loc[mtype, 'All'] = np.greater.outer(mut_vals, wt_vals).mean()
        auc_df.loc[mtype,
                   'All'] += np.equal.outer(mut_vals, wt_vals).mean() / 2

    plot_transfer_aucs(auc_df, auc_vals, stat_dict, args)
    plot_label_stability(corr_df, auc_df, auc_vals, stat_dict, args)
    plot_auc_comparison(out_dict, stat_dict, auc_vals, args)