def main(argv):
    """Runs the experiment."""

    syn = synapseclient.Synapse()
    syn.login("grzadkow")

    cdata = VariantCohort(syn, 'TCGA-OV',
                          mut_genes=['TTN'],
                          mut_levels=('Gene', 'Form', 'Exon'),
                          cv_info={'Prop': 0.8, 'Seed': argv[-1]})
    cdata.train_expr_ = cdata.train_expr_.sort_index()

    prot_data = pd.read_csv(in_path + 'PNNL-causality-formatted.txt.zip',
                            sep='\t')
    prot_vec = prot_data.loc[prot_data['ID'] == 'TTN', :]
    prot_vec = prot_vec.loc[:, prot_vec.columns.isin(cdata.train_expr_.index)]
    prot_vec = prot_vec.dropna(axis=1)

    use_indx = cdata.train_expr_.index.isin(prot_vec.columns)
    base_cor = spearmanr(
        np.array(prot_vec)[0],
        np.array(cdata.train_expr_.loc[prot_vec.columns, 'TTN']))

    mtypes = [
        MuType({('Gene', 'TTN'): {('Form', 'Missense_Mutation'): None}}),
        MuType({('Gene', 'TTN'): {('Form', 'Nonsense_Mutation'): None}}),
        ]
    mut_list = [cdata.train_mut_.status(cdata.train_expr_.index, mtype)
                for mtype in mtypes]

    clf = MKBMTL(path_keys={(((), ('controls-state-change-of', )), )})
    clf.named_steps['fit'].R = 5

    clf.fit_coh(cohort=cdata, mtypes=mtypes)
    H_cor = [spearmanr(clf.named_steps['fit'].H_mat['mu'][i, use_indx],
                       np.array(prot_vec)[0])
             for i in range(clf.named_steps['fit'].R)]

    print(clf.named_steps['fit'].bw_mat['mu'].round(2))
    print(clf.eval_coh(cohort=cdata, mtypes=mtypes))

    # saves classifier results to file
    out_file = out_path + argv[0] + '_' + argv[1] + '__run' + argv[-1] + '.p'
    print(out_file)
    out_data = {'H_cor': H_cor, 'base': base_cor}
    pickle.dump(out_data, open(out_file, 'wb'))
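
# Illustrative sketch (not part of the original script): the baseline
# correlation above aligns a proteomic abundance vector with the expression
# matrix on their shared samples before calling scipy's spearmanr. A
# minimal, self-contained version of that alignment step using toy data,
# with hypothetical sample and gene labels:

import numpy as np
import pandas as pd
from scipy.stats import spearmanr


def baseline_cor_sketch():
    # expression: samples x genes; protein: one row of per-sample abundances
    expr = pd.DataFrame(np.random.rand(5, 2),
                        index=['s1', 's2', 's3', 's4', 's5'],
                        columns=['TTN', 'TP53'])
    prot = pd.Series(np.random.rand(4), index=['s1', 's2', 's4', 's6'])

    # keep only the samples present in both datasets, in matching order
    shared = expr.index.intersection(prot.index)
    return spearmanr(prot[shared].values, expr.loc[shared, 'TTN'].values)
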
def main(argv):
    """Runs the experiment."""

    # gets the directory where output will be saved and the name of the TCGA
    # cohort under consideration, loads the list of gene sub-variants
    print(argv)
    out_dir = os.path.join(base_dir, 'output', argv[0], argv[1], argv[2])
    coh_lbl = 'TCGA-{}'.format(argv[0])

    # loads the expression data and gene mutation data for the given TCGA
    # cohort, with the training/testing cohort split defined by the
    # cross-validation id for this task
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()
    cdata = VariantCohort(cohort=coh_lbl, mut_genes=[argv[1]],
                          mut_levels=('Gene', 'Form', 'Exon',
                                      'Location', 'Protein'),
                          syn=syn, cv_seed=(int(argv[3]) + 3) * 17)

    base_mtype = MuType({('Gene', argv[1]): None})
    optim = PartitionOptim(cdata, base_mtype, eval(argv[2]),
                           ('Form', 'Exon', 'Location', 'Protein'))

    while optim.traverse_branch():
        optim_mtypes = optim.best_optim()

    # saves classifier results to file
    out_file = os.path.join(out_dir, 'results',
                            'out__cv-{}.p'.format(argv[3]))
    pickle.dump({'best': optim.best_mtypes, 'hist': optim.mtype_scores,
                 'pred': optim.pred_scores, 'optim': optim.best_optim()},
                open(out_file, 'wb'))
def main():
    """Runs the experiment."""

    parser = argparse.ArgumentParser(
        description=("Test a classifier's ability to predict the presence "
                     "of a list of sub-types.")
        )

    # positional command line arguments
    parser.add_argument('mtype_dir', type=str,
                        help='the folder where sub-types are stored')
    parser.add_argument('cohort', type=str, help='a TCGA cohort')
    parser.add_argument('classif', type=str,
                        help='a classifier in HetMan.predict.classifiers')
    parser.add_argument('base_gene', type=str,
                        help='the gene to cross sub-types against')
    parser.add_argument('cv_id', type=int,
                        help='a random seed used for cross-validation')
    parser.add_argument('task_id', type=int,
                        help='the subset of sub-types to assign to this task')

    parser.add_argument(
        '--tune_splits', type=int, default=8,
        help='how many training cohort splits to use for tuning')
    parser.add_argument(
        '--test_count', type=int, default=24,
        help='how many hyper-parameter values to test in each tuning split')
    parser.add_argument(
        '--parallel_jobs', type=int, default=12,
        help='how many parallel CPUs to allocate the tuning tests across')

    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')
    args = parser.parse_args()

    if args.verbose:
        print("Starting testing for directory\n{}\nwith "
              "cross-validation ID {} and task ID {} ...".format(
                  args.mtype_dir, args.cv_id, args.task_id))

    mtype_list = sorted(pickle.load(
        open(os.path.join(args.mtype_dir, 'tmp', 'mtype_list.p'), 'rb')))

    # loads the pipeline used for classifying variants, gets the mutated
    # genes for each variant under consideration
    mut_clf = eval(args.classif)
    use_genes = reduce(
        or_, [set(gn for gn, _ in mtype.subtype_list())
              for mtype in mtype_list]
        ) | {args.base_gene}

    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = VariantCohort(cohort=args.cohort, mut_genes=list(use_genes),
                          mut_levels=['Gene', 'Form_base', 'Exon', 'Protein'],
                          expr_source='Firehose', data_dir=firehose_dir,
                          syn=syn, cv_seed=(args.cv_id + 53) * 7,
                          cv_prop=2 / 3)

    base_mtype = MuType({('Gene', args.base_gene): None})
    base_train_samps = base_mtype.get_samples(cdata.train_mut)
    base_test_samps = base_mtype.get_samples(cdata.test_mut)

    if args.verbose:
        print("Loaded {} sub-types over {} genes which will be tested using "
              "classifier {} in cohort {} with {} samples.".format(
                  len(mtype_list), len(use_genes), args.classif,
                  args.cohort, len(cdata.samples)))

    out_acc = {mtype: {} for mtype in mtype_list}

    for i, mtype in enumerate(mtype_list):
        if (i % 10) == args.task_id:
            if args.verbose:
                print("Testing {} ...".format(mtype))

            ex_genes = set(gn for gn, _ in mtype.subtype_list())
            clf = mut_clf()
            cur_train_samps = mtype.get_samples(cdata.train_mut)
            cur_test_samps = mtype.get_samples(cdata.test_mut)

            # the baseline test: tune, fit, and evaluate the classifier on
            # the whole cohort
            clf.tune_coh(cdata, mtype, exclude_genes=ex_genes,
                         tune_splits=args.tune_splits,
                         test_count=args.test_count,
                         parallel_jobs=args.parallel_jobs)
            clf.fit_coh(cdata, mtype, exclude_genes=ex_genes)
            out_acc[mtype]['Base'] = clf.eval_coh(cdata, mtype,
                                                  exclude_genes=ex_genes)

            if (len(cur_train_samps - base_train_samps) > 3
                    and len(cur_test_samps - base_test_samps) > 3):
                print("Null test {}".format(mtype))

                clf.tune_coh(cdata, mtype, exclude_genes=ex_genes,
                             tune_splits=args.tune_splits,
                             exclude_samps=base_train_samps,
                             test_count=args.test_count,
                             parallel_jobs=args.parallel_jobs)
                clf.fit_coh(cdata, mtype, exclude_genes=ex_genes,
                            exclude_samps=base_train_samps)
                out_acc[mtype]['Null'] = clf.eval_coh(
                    cdata, mtype,
                    exclude_genes=ex_genes, exclude_samps=base_test_samps)

            if (len(cur_train_samps & base_train_samps) > 3
                    and len(cur_test_samps & base_test_samps) > 3):
                print("Mut test {}".format(mtype))

                clf.tune_coh(cdata, mtype, exclude_genes=ex_genes,
                             tune_splits=args.tune_splits,
                             include_samps=base_train_samps,
                             test_count=args.test_count,
                             parallel_jobs=args.parallel_jobs)
                clf.fit_coh(cdata, mtype, exclude_genes=ex_genes,
                            include_samps=base_train_samps)
                out_acc[mtype]['Mut'] = clf.eval_coh(
                    cdata, mtype,
                    exclude_genes=ex_genes, include_samps=base_test_samps)

            if (len(cur_train_samps - base_train_samps) > 3
                    and len(cur_test_samps & base_test_samps) > 3):
                print("Null cross {}".format(mtype))

                clf.tune_coh(cdata, mtype, exclude_genes=ex_genes,
                             tune_splits=args.tune_splits,
                             exclude_samps=base_train_samps,
                             test_count=args.test_count,
                             parallel_jobs=args.parallel_jobs)
                clf.fit_coh(cdata, mtype, exclude_genes=ex_genes,
                            exclude_samps=base_train_samps)
                out_acc[mtype]['NullX'] = clf.eval_coh(
                    cdata, mtype,
                    exclude_genes=ex_genes, include_samps=base_test_samps)

            if (len(cur_train_samps & base_train_samps) > 3
                    and len(cur_test_samps - base_test_samps) > 3):
                print("Mut cross {}".format(mtype))

                clf.tune_coh(cdata, mtype, exclude_genes=ex_genes,
                             tune_splits=args.tune_splits,
                             include_samps=base_train_samps,
                             test_count=args.test_count,
                             parallel_jobs=args.parallel_jobs)
                clf.fit_coh(cdata, mtype, exclude_genes=ex_genes,
                            include_samps=base_train_samps)
                out_acc[mtype]['MutX'] = clf.eval_coh(
                    cdata, mtype,
                    exclude_genes=ex_genes, exclude_samps=base_test_samps)

        else:
            del out_acc[mtype]

    # saves the performance measurements for each variant to file
    out_file = os.path.join(
        args.mtype_dir, 'results',
        'out__cv-{}_task-{}.p'.format(args.cv_id, args.task_id))
    pickle.dump({'Acc': out_acc,
                 'Info': {'TuneSplits': args.tune_splits,
                          'TestCount': args.test_count,
                          'ParallelJobs': args.parallel_jobs}},
                open(out_file, 'wb'))
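
# Illustrative sketch (not part of the original script): the four
# conditional tests above partition cohort samples against the set mutated
# for the base gene using plain set algebra. 'Null' trains and evaluates
# outside that set, 'Mut' inside it, and the 'X' variants train on one side
# and evaluate on the other. A toy version of the gating logic, with a
# hypothetical min_size threshold standing in for the hard-coded 3:


def cross_conditions_sketch(cur_train, cur_test, base_train, base_test,
                            min_size=3):
    conditions = {}

    # enough subtype samples outside the base gene's mutated set?
    if (len(cur_train - base_train) > min_size
            and len(cur_test - base_test) > min_size):
        conditions['Null'] = ('exclude', 'exclude')

    # enough subtype samples inside it?
    if (len(cur_train & base_train) > min_size
            and len(cur_test & base_test) > min_size):
        conditions['Mut'] = ('include', 'include')

    # train outside the base set, evaluate inside it (and vice versa)
    if (len(cur_train - base_train) > min_size
            and len(cur_test & base_test) > min_size):
        conditions['NullX'] = ('exclude', 'include')
    if (len(cur_train & base_train) > min_size
            and len(cur_test - base_test) > min_size):
        conditions['MutX'] = ('include', 'exclude')

    return conditions
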
def main():
    """Runs the experiment."""

    parser = argparse.ArgumentParser(
        description='Set up touring for sub-types to detect.'
        )
    parser.add_argument('cohort', type=str, help="which TCGA cohort to use")

    # optional command line arguments controlling the thresholds for which
    # individual mutations and how many genes' mutations are considered
    parser.add_argument('--freq_cutoff', type=float, default=0.02,
                        help='subtype sample frequency threshold')

    # optional command line arguments for what kinds of mutation sub-types to
    # look for in terms of properties and number of mutations to combine
    parser.add_argument('--mut_levels', type=str, default='Gene',
                        help='the mutation property levels to consider')

    # optional command line argument controlling verbosity
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse the command line arguments, get the directory where found
    # sub-types will be saved for future use
    args = parser.parse_args()
    out_path = os.path.join(base_dir, 'setup', args.cohort)
    os.makedirs(out_path, exist_ok=True)
    use_lvls = args.mut_levels.split('__')

    # log into Synapse using locally-stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    cdata = MutationCohort(cohort=args.cohort, mut_genes=None,
                           mut_levels=use_lvls, expr_source='Firehose',
                           var_source='mc3', expr_dir=firehose_dir,
                           cv_prop=1.0, samp_cutoff=args.freq_cutoff,
                           syn=syn)

    if args.verbose:
        print("Found {} candidate genes with mutations in at least "
              "{:.1f}% of the samples in TCGA cohort {}.\nLooking for "
              "subtypes of these genes that are combinations of up to two "
              "mutations at annotation levels {} ...\n".format(
                  len(tuple(cdata.train_mut)), args.freq_cutoff * 100,
                  args.cohort, use_lvls))

    min_samps = args.freq_cutoff * len(cdata.samples)

    if use_lvls == ['Gene']:
        use_mtypes = {MuType({('Gene', gn): None})
                      for gn, mut in cdata.train_mut
                      if len(mut) >= min_samps}

    elif use_lvls[0] == 'Gene':
        use_lvls = use_lvls[1:]
        use_mtypes = set()
        use_sampsets = set()
        mtype_sampsets = dict()

        for gn, mut in cdata.train_mut:
            cur_mtypes = {MuType({('Gene', gn): mtype})
                          for mtype in mut.combtypes(comb_sizes=(1, 2),
                                                     sub_levels=use_lvls,
                                                     min_type_size=min_samps)}

            # finds the samples belonging to each enumerated sub-type that
            # hasn't already been found
            cur_sampsets = {
                mtype: frozenset(mtype.get_samples(cdata.train_mut))
                for mtype in cur_mtypes - use_mtypes}

            # removes the sub-types with so many mutated samples that there
            # are not enough negatively-labelled samples for classification
            mtype_sampsets.update({
                mtype: sampset for mtype, sampset in cur_sampsets.items()
                if len(sampset) <= (len(cdata.samples) - min_samps)})

        # ensures that when two sub-types have the same samples the one
        # further down the sort order gets removed
        sub_mtypes = sorted(list(mtype_sampsets))
        if args.verbose:
            print("Found {} new sub-types!\n".format(len(sub_mtypes)))

        for i, mtype in enumerate(sub_mtypes):
            if args.verbose and (i % 200) == 100:
                print("\nchecked {} sub-types\n".format(i))

            # removes each sub-type whose set of mutated samples is
            # identical to that of a sub-type that was already found
            if mtype_sampsets[mtype] in use_sampsets:
                if args.verbose:
                    print("Removing functionally duplicate MuType {}"
                          .format(mtype))
            else:
                use_mtypes.update({mtype})
                use_sampsets.update({mtype_sampsets[mtype]})

    else:
        cur_mtypes = cdata.train_mut.combtypes(comb_sizes=(1, 2),
                                               sub_levels=use_lvls,
                                               min_type_size=min_samps)
        use_mtypes = set()
        use_sampsets = set()
        mtype_sampsets = dict()

        cur_sampsets = {mtype: frozenset(mtype.get_samples(cdata.train_mut))
                        for mtype in cur_mtypes - use_mtypes}

        # removes the sub-types with so many mutated samples that there
        # are not enough negatively-labelled samples for classification
        mtype_sampsets.update({
            mtype: sampset for mtype, sampset in cur_sampsets.items()
            if len(sampset) <= (len(cdata.samples) - min_samps)})

        # ensures that when two sub-types have the same samples the one
        # further down the sort order gets removed
        sub_mtypes = sorted(list(mtype_sampsets))
        if args.verbose:
            print("Found {} new sub-types!\n".format(len(sub_mtypes)))

        for i, mtype in enumerate(sub_mtypes):
            if args.verbose and (i % 200) == 100:
                print("\nchecked {} sub-types\n".format(i))

            # removes each sub-type whose set of mutated samples is
            # identical to that of a sub-type that was already found
            if mtype_sampsets[mtype] in use_sampsets:
                if args.verbose:
                    print("Removing functionally duplicate MuType {}"
                          .format(mtype))
            else:
                use_mtypes.update({mtype})
                use_sampsets.update({mtype_sampsets[mtype]})

    if args.verbose:
        print("\nFound {} total sub-types!".format(len(use_mtypes)))

    # save the list of found non-duplicate sub-types to file
    pickle.dump(
        sorted(list(use_mtypes)),
        open(os.path.join(out_path,
                          'mtype_list__freq_{}__levels_{}.p'.format(
                              args.freq_cutoff, args.mut_levels)),
             'wb'))

    pickle.dump({'Samps': cdata.samples},
                open(os.path.join(out_path, 'cohort_info.p'), 'wb'))

    with open(os.path.join(out_path,
                           'mtype_count__freq_{}__levels_{}.txt'.format(
                               args.freq_cutoff, args.mut_levels)),
              'w') as fl:
        fl.write(str(len(use_mtypes)))
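
# Illustrative sketch (not part of the original script): the deduplication
# above keys each sub-type by the frozenset of samples it covers, so two
# sub-types that label exactly the same samples are treated as one. A toy
# version with strings standing in for MuTypes:


def dedup_by_sampset_sketch(mtype_sampsets):
    use_mtypes, use_sampsets = set(), set()

    # iterate in sorted order so the first-sorted of any duplicate pair wins
    for mtype in sorted(mtype_sampsets):
        if mtype_sampsets[mtype] not in use_sampsets:
            use_mtypes.add(mtype)
            use_sampsets.add(mtype_sampsets[mtype])

    return use_mtypes

# e.g. dedup_by_sampset_sketch({'a': frozenset({'s1'}),
#                               'b': frozenset({'s1'})}) == {'a'}
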
def main(argv):
    """Runs the experiment."""

    syn = synapseclient.Synapse()
    syn.login()

    # load drug-mutation association data,
    # filter for pan-cancer associations
    drug_mut_assoc = pd.read_csv(
        base_dir + '/../../data/drugs/ioria/drug_anova.txt.gz',
        sep='\t', comment='#')
    if patient_cohs[argv[0]] in drug_mut_assoc.columns:
        drug_mut_assoc = drug_mut_assoc.loc[
            drug_mut_assoc[patient_cohs[argv[0]]] != 0, :]
    else:
        drug_mut_assoc = drug_mut_assoc.loc[drug_mut_assoc['PANCAN'] != 0, :]

    # categorize associations by mutation type
    pnt_indx = drug_mut_assoc['FEAT'].str.contains('_mut$')
    # TODO: determine how iorio handled CNVs (they're currently ignored)
    cnv_indx = drug_mut_assoc['FEAT'].str.contains('^(?:loss|gain):')
    fus_indx = drug_mut_assoc['FEAT'].str.contains('_fusion$')

    # get list of genes affected by point mutations, load TCGA cohort
    # with corresponding set of mutations
    pnt_genes = list(set(
        x[0] for x in drug_mut_assoc['FEAT'][pnt_indx].str.split('_')))
    print(len(pnt_genes))

    # create a VariantCohort with expression only for genes which have
    # point mutations in the drug_mut_assoc dataframe
    # (cv_prop = cross-validation proportion; train on all samples here)
    # the cross-validation seed is provided as the last argument in an
    # HTCondor submit script, and the cohort name is the first (it should
    # match the cohort names as they appear in BMEG)
    tcga_var_coh = VariantCohort(
        syn, cohort="TCGA-{}".format(patient_cohs[argv[0]]),
        mut_genes=pnt_genes, mut_levels=['Gene', 'Type'],
        cv_seed=int(argv[-1]) + 1, cv_prop=1)

    tcga_back_cohs = {
        coh: VariantCohort(syn, cohort=coh, mut_genes=pnt_genes,
                           mut_levels=['Gene', 'Type'],
                           cv_seed=int(argv[-1]) + 1, cv_prop=1)
        for coh in tcga_backcohs}

    # TODO: recall why frameshifts aren't considered below
    # get the list of point mutation types and the drugs associated with
    # at least one of them
    pnt_mtypes = [MuType({('Gene', gn): {('Type', ('Frame', 'Point')): None}})
                  for gn in pnt_genes]
    pnt_muts = {(gn + '_mut'): mtype
                for gn, mtype in zip(pnt_genes, pnt_mtypes)
                # TODO: the get_samples argument should be a MuTree...right?
                if len(mtype.get_samples(tcga_var_coh.train_mut)) >= 5}
    pnt_drugs = list(set(
        drug_mut_assoc['DRUG'][pnt_indx][
            drug_mut_assoc['FEAT'][pnt_indx].isin(pnt_muts.keys())]))
    pnt_drugs.sort()
    print(len(pnt_drugs))

    # stores predicted drug responses for cell lines and TCGA samples
    ccle_response = {}
    tcga_response = {}
    back_tcga_resp = {coh: {} for coh in tcga_backcohs}

    # stores predicted drug response for the organoid sample
    patient_response = pd.Series(float('nan'), index=pnt_drugs)

    # array that stores classifier performance on held-out cell lines
    clf_perf = pd.Series(float('nan'), index=pnt_drugs)

    # stores t-test p-values for mutation state vs predicted
    # drug responses in the TCGA cohort
    tcga_ttest = pd.DataFrame(float('nan'), index=pnt_drugs,
                              columns=pnt_muts.keys())

    # stores AUC scores for mutation vs drug response in TCGA
    tcga_auc = pd.DataFrame(float('nan'), index=pnt_drugs,
                            columns=pnt_muts.keys())

    # loads patient (or patient-derived model (PDM)) RNAseq data
    patient_expr = pd.read_csv(patient_files[argv[0]], header=0, sep='\t')

    # get rid of the unnecessary info in gene_id, get Hugo symbols
    patient_expr['gene_id'] = [i.split('^')[1]
                               for i in patient_expr['gene_id']]
    annot_data = get_gencode()
    patient_expr['Symbol'] = [annot_data[gn]['gene_name']
                              if gn in annot_data else 'no_gene'
                              for gn in patient_expr['gene_id']]

    # ensure that there are no zeros in preparation for log normalization
    patient_expr.loc[:, 'FPKM'] = (
        patient_expr.loc[:, 'FPKM']
        + min(patient_expr.loc[:, 'FPKM'][patient_expr.loc[:, 'FPKM'] > 0])
        / 2)

    # log normalize the FPKM values
    patient_expr.loc[:, 'FPKM'] = np.log2(patient_expr.loc[:, 'FPKM'])

    # combine multiple entries of the same gene symbol (use their mean)
    patient_expr = patient_expr.groupby(['Symbol'])['FPKM'].mean()
    patient_expr = pd.DataFrame(patient_expr)

    for drug in pnt_drugs:
        drug_clf = eval(argv[1])()
        cell_line_drug_coh = DrugCohort(cohort='ioria', drug_names=[drug],
                                        cv_seed=int(argv[-1]))
        drug_lbl = cell_line_drug_coh.train_resp.columns[0]
        print("Testing drug {} with alias {} ...".format(drug, drug_lbl))

        # TODO: 'Symbol' --> gene_id
        # get the union of genes in all three datasets (TCGA, CCLE, and the
        # patient/PDM RNAseq)
        use_genes = (set(tcga_var_coh.genes)
                     & set(cell_line_drug_coh.genes)
                     & set(patient_expr.index)
                     & reduce(lambda x, y: x & y,
                              [coh.genes for coh in tcga_back_cohs.values()]))

        # filter patient (or PDM) RNAseq data to include only use_genes
        patient_expr_filt = patient_expr.loc[use_genes, :]
        # TODO: does patient_expr_filt need to be transposed?

        # tunes and fits the classifier on the CCLE data, and evaluates its
        # performance on the held-out samples
        pr = cProfile.Profile()
        pr.enable()
        drug_clf.tune_coh(cell_line_drug_coh, pheno=drug_lbl,
                          tune_splits=4, test_count=16,
                          include_genes=use_genes)
        drug_clf.fit_coh(cell_line_drug_coh, pheno=drug_lbl,
                         include_genes=use_genes)
        pr.disable()

        s = io.StringIO()
        sortby = 'cumulative'
        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        ps.print_stats()
        print(s.getvalue())

        print(drug_clf)
        clf_perf[drug] = drug_clf.eval_coh(cell_line_drug_coh,
                                           pheno=drug_lbl,
                                           include_genes=use_genes)

        # predicts drug response for the patient or PDM, stores the
        # classifier's predictions for later use
        ccle_response[drug] = pd.Series(
            drug_clf.predict_train(cell_line_drug_coh,
                                   include_genes=use_genes))
        tcga_response[drug] = pd.Series(
            drug_clf.predict_train(tcga_var_coh, include_genes=use_genes))
        for coh in tcga_backcohs:
            back_tcga_resp[coh][drug] = pd.Series(
                drug_clf.predict_train(tcga_back_cohs[coh],
                                       include_genes=use_genes))
        patient_response[drug] = drug_clf.predict(
            patient_expr_filt.transpose())[0]

        for gn, mtype in pnt_muts.items():
            print("Gene: {}, Drug: {}".format(gn, drug))

            # for each mutated gene, get the vector of mutation status
            # for the TCGA samples
            mut_stat = np.array(tcga_var_coh.train_pheno(mtype=mtype))

            # gets the classifier's predictions of drug response for the
            # TCGA cohort, and evaluates their concordance with mutation
            # status
            tcga_ttest.loc[drug, gn] = -log10(
                ttest_ind(tcga_response[drug][mut_stat],
                          tcga_response[drug][~mut_stat],
                          equal_var=False)[1])
            tcga_auc.loc[drug, gn] = roc_auc_score(mut_stat,
                                                   tcga_response[drug])

    # save everything to file
    out_data = {'Performance': clf_perf,
                'CCLE_Response': ccle_response,
                'TCGA_Response': tcga_response,
                'back_TCGA_Response': back_tcga_resp,
                'Patient_Response': patient_response,
                'TCGA_ttest': tcga_ttest,
                'TCGA_AUC': tcga_auc}

    out_file = ('/home/users/grzadkow/compbio/bergamot/HetMan/experiments/'
                'drug_predictions/output/mat_' + argv[0] + '_' + argv[1]
                + '__run' + argv[-1] + '.p')
    pickle.dump(out_data, open(out_file, 'wb'))
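
# Illustrative sketch (not part of the original script): the preprocessing
# above offsets FPKM values by half the smallest positive value (so zeros
# survive the log), log2-transforms them, and averages rows that share a
# gene symbol. A self-contained pandas version of those three steps, with
# toy gene symbols:

import numpy as np
import pandas as pd


def log_normalize_fpkm_sketch():
    expr = pd.DataFrame({'Symbol': ['TP53', 'TP53', 'TTN'],
                         'FPKM': [0.0, 4.0, 2.0]})

    # pseudo-count: half the minimum positive FPKM keeps log2 finite at zero
    pseudo = expr.loc[expr['FPKM'] > 0, 'FPKM'].min() / 2
    expr['FPKM'] = np.log2(expr['FPKM'] + pseudo)

    # collapse duplicate gene symbols by averaging their log-values
    return expr.groupby('Symbol')['FPKM'].mean()
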
def main():
    """Runs the experiment."""

    parser = argparse.ArgumentParser(
        description=("Test a classifier's ability to create a mutation "
                     "signature for a gene that can be transferred from a "
                     "TCGA cohort to ICGC PACA-AU.")
        )

    parser.add_argument('classif', type=str,
                        help='a classifier in HetMan.predict.classifiers')
    parser.add_argument('mtypes', type=str,
                        help='a list of mutation types to test')
    parser.add_argument('cv_id', type=int,
                        help='a random seed used for cross-validation')
    parser.add_argument('task_id', type=int,
                        help=('the subset of TCGA cohorts and mutated genes '
                              'to assign to this task'))

    parser.add_argument(
        '--tune_splits', type=int, default=4,
        help='how many training cohort splits to use for tuning'
        )
    parser.add_argument(
        '--test_count', type=int, default=24,
        help='how many hyper-parameter values to test in each tuning split'
        )
    parser.add_argument(
        '--parallel_jobs', type=int, default=8,
        help='how many parallel CPUs to allocate the tuning tests across'
        )

    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')
    args = parser.parse_args()

    if args.verbose:
        print("Starting ICGC transfer test with classifier {} on mutation "
              "type list `{}` for cross-validation ID {} and "
              "task ID {} ...".format(args.classif, args.mtypes,
                                      args.cv_id, args.task_id))

    cohort_mtypes = sorted(pickle.load(
        open(os.path.join(base_dir, 'setup',
                          'cohort_{}.p'.format(args.mtypes)),
             'rb')))

    test_count = ceil(len(cohort_mtypes) / 6)
    cohort_mtypes = [x for i, x in enumerate(cohort_mtypes)
                     if i // test_count == args.task_id]
    use_cohorts = set(coh for coh, _ in cohort_mtypes)

    mut_clf = eval(args.classif)
    out_acc = {cohort: dict() for cohort in use_cohorts}
    out_par = {cohort: dict() for cohort in use_cohorts}

    cdata_icgc = ICGCcohort('PACA-AU', icgc_data_dir, mut_genes=None,
                            samp_cutoff=[1/12, 11/12], cv_prop=1.0)

    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/mgrzad"
                                "/input-data/synapse")
    syn.login()

    for cohort in use_cohorts:
        cur_mtypes = [mtype for coh, mtype in cohort_mtypes if coh == cohort]

        if args.mtypes == 'genes':
            cur_genes = cur_mtypes.copy()
            cur_mtypes = [MuType({('Gene', gn): None}) for gn in cur_genes]
        else:
            cur_genes = reduce(
                or_, [set(gn for gn, _ in mtype.subtype_list())
                      for mtype in cur_mtypes]
                )

        tcga_cdata = TCGAcohort(
            cohort=cohort, mut_genes=cur_genes,
            mut_levels=['Gene', 'Form_base'],
            expr_source='toil', expr_dir=toil_dir, var_source='mc3',
            syn=syn, collapse_txs=True,
            cv_prop=0.75, cv_seed=(args.cv_id - 37) * 101
            )

        if args.verbose:
            print("Loaded mutations for {} genes in cohort {} with "
                  "{} samples.".format(len(cur_genes), cohort,
                                       len(tcga_cdata.samples)))

        for mtype in cur_mtypes:
            if args.verbose:
                print("Testing {} in {} ...".format(mtype, cohort))

            clf = mut_clf()
            use_genes = ((cdata_icgc.genes & tcga_cdata.genes)
                         - set(gn for gn, _ in mtype.subtype_list()))

            clf.tune_coh(tcga_cdata, mtype, include_genes=use_genes,
                         tune_splits=args.tune_splits,
                         test_count=args.test_count,
                         parallel_jobs=args.parallel_jobs)
            out_par[cohort][mtype] = {par: clf.get_params()[par]
                                      for par, _ in clf.tune_priors}

            clf.fit_coh(tcga_cdata, mtype, include_genes=use_genes)
            out_acc[cohort][mtype] = clf.eval_coh(
                cdata_icgc, mtype, include_genes=use_genes, use_train=True
                )

    out_file = os.path.join(base_dir, 'output', args.classif, args.mtypes,
                            'out__cv-{}_task-{}.p'.format(
                                args.cv_id, args.task_id)
                            )
    pickle.dump({'Acc': out_acc, 'Par': out_par,
                 'Info': {'TuneSplits': args.tune_splits,
                          'TestCount': args.test_count,
                          'ParallelJobs': args.parallel_jobs}},
                open(out_file, 'wb'))
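
# Illustrative sketch (not part of the original script): the task filtering
# above splits a sorted work list into six contiguous chunks, with each
# cluster task claiming the chunk matching its task ID. A toy version of
# that sharding scheme:

from math import ceil


def shard_work_sketch(work_items, task_id, n_tasks=6):
    chunk = ceil(len(work_items) / n_tasks)
    return [x for i, x in enumerate(sorted(work_items))
            if i // chunk == task_id]

# e.g. shard_work_sketch(list(range(10)), 0) == [0, 1]  (chunk size 2)
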
def main():
    """Runs the experiment."""

    parser = argparse.ArgumentParser(
        description='Set up searching for sub-types to detect.')

    # positional command line arguments
    parser.add_argument('cohort', type=str, help='a TCGA cohort')
    parser.add_argument('classif', type=str,
                        help='a classifier in HetMan.predict.classifiers')
    parser.add_argument('base_gene', type=str,
                        help='the gene to cross sub-types against')

    # optional command line arguments controlling the thresholds for which
    # individual mutations and how many genes' mutations are considered
    parser.add_argument('--freq_cutoff', type=int, default=20,
                        help='sub-type sample frequency threshold')
    parser.add_argument('--max_genes', type=int, default=200,
                        help='maximum number of mutated genes to consider')

    # optional command line arguments for what kinds of mutation sub-types to
    # look for in terms of properties and number of mutations to combine
    parser.add_argument(
        '--mut_levels', type=str, nargs='+',
        default=['Form_base', 'Exon', 'Protein'],
        help='the mutation property levels to consider in addition to `Gene`')
    parser.add_argument(
        '--comb_size', type=int, default=2,
        help='maximum number of individual mutations to combine '
             'when searching for mutation sub-types')

    # optional command line argument controlling verbosity
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='turns on diagnostic messages')

    # parse the command line arguments, get the directory where found
    # sub-types will be saved for future use
    args = parser.parse_args()
    out_path = os.path.join(base_dir, 'output', args.cohort, args.classif,
                            'cross', args.base_gene)

    if args.verbose:
        print("Looking for mutation sub-types in cohort {} composed of at "
              "most {} individual mutations with at least {} "
              "samples in total.\n".format(args.cohort, args.comb_size,
                                           args.freq_cutoff))

    # log into Synapse using locally-stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # load the expression matrix for the given cohort from Broad Firehose,
    # load the MC3 variant call set from Synapse, find the mutations for the
    # samples that are in both datasets
    expr_data = get_expr_firehose(args.cohort, firehose_dir)
    mc3_data = get_variants_mc3(syn)
    expr_mc3 = mc3_data.loc[mc3_data['Sample'].isin(expr_data.index), :]

    # get the genes whose mutations appear in enough samples to pass the
    # frequency threshold
    gene_counts = expr_mc3.groupby(by='Gene').Sample.nunique()
    count_cutoff = int(args.freq_cutoff / args.comb_size)
    common_genes = set(gene_counts.index[gene_counts >= count_cutoff])

    if args.verbose:
        print("Found {} candidate genes with at least {} potentially "
              "mutated samples.".format(len(common_genes), count_cutoff))

    if len(common_genes) >= args.max_genes:
        gene_counts = gene_counts[common_genes].sort_values(ascending=False)
        common_genes = set(gene_counts[:args.max_genes].index)

        if args.verbose:
            print("Too many genes found, culling list to {} genes which "
                  "each have at least {} mutated samples.".format(
                      args.max_genes, min(gene_counts[common_genes])))

    cdata = VariantCohort(cohort=args.cohort, mut_genes=common_genes,
                          mut_levels=['Gene'] + args.mut_levels,
                          expr_source='Firehose', data_dir=firehose_dir,
                          cv_prop=1.0, syn=syn)

    base_mtype = MuType({('Gene', args.base_gene): None})
    base_samps = base_mtype.get_samples(cdata.train_mut)
    with_muts = deepcopy(cdata.train_mut).subtree(base_samps)
    without_muts = deepcopy(cdata.train_mut).subtree(
        cdata.samples - base_samps)

    # initializes the list of found sub-types and the list of samples each
    # sub-type appears in
    use_mtypes = set()
    use_sampsets = set()

    search_level = 1
    break_status = False

    # while we have neither reached the limit of sub-type enumeration nor
    # run out of property level combinations to test...
    while (len(use_mtypes) < 10000 and not break_status
           and search_level <= 2 ** len(args.mut_levels)):

        # try a list of property level combinations and numbers of
        # individual variants to combine, where the complexity of the level
        # combination plus the variant count is held constant
        for lvl_combn, comb_size in zip(
                rev_powerset_slice(args.mut_levels, search_level),
                range(1, min(search_level + 1, args.comb_size + 1))):
            use_lvls = ['Gene'] + list(lvl_combn)

            if args.verbose:
                print("\nLooking for sub-types that are combinations "
                      "of {} mutation(s) at levels {}...\n".format(
                          comb_size, use_lvls))

            # enumerates the sub-types consisting of a combination of the
            # given number of individual mutations at the given property
            # levels
            sub_mtypes = with_muts.combtypes(
                comb_sizes=(comb_size, ), sub_levels=use_lvls,
                min_type_size=int(args.freq_cutoff / 2))
            sub_mtypes |= without_muts.combtypes(
                comb_sizes=(comb_size, ), sub_levels=use_lvls,
                min_type_size=int(args.freq_cutoff / 2))

            # finds the samples belonging to each enumerated sub-type that
            # hasn't already been found
            mtype_sampsets = {
                mtype: frozenset(mtype.get_samples(cdata.train_mut))
                for mtype in sub_mtypes - use_mtypes
                if (mtype & base_mtype).is_empty()}

            # removes the sub-types with so many mutated samples that there
            # are not enough negatively-labelled samples for classification
            mtype_sampsets = {
                mtype: sampset for mtype, sampset in mtype_sampsets.items()
                if len(sampset) <= (len(cdata.samples) - args.freq_cutoff)}

            sub_mtypes = sorted(list(mtype_sampsets))
            if args.verbose:
                print("Found {} new sub-types!\n".format(len(sub_mtypes)))

            # if the list of remaining sub-types isn't too long...
            if len(sub_mtypes) < 8000:
                add_mtypes = set()

                for i, mtype in enumerate(sub_mtypes):
                    if args.verbose and (i % 200) == 100:
                        print("\nchecked {} sub-types\n".format(i))

                    # ...we remove each one whose set of mutated samples is
                    # identical to that of a sub-type that was already found
                    if mtype_sampsets[mtype] in use_sampsets:
                        if args.verbose:
                            print("Removing functionally duplicate "
                                  "MuType {}".format(mtype))
                    else:
                        add_mtypes.update({mtype})
                        use_sampsets.update({mtype_sampsets[mtype]})

                use_mtypes |= add_mtypes

            elif len(sub_mtypes) > 100000:
                break_status = True

        search_level += 1

    if args.verbose:
        print("\nFound {} total sub-types!".format(len(use_mtypes)))

    # save the list of found non-duplicate sub-types to file
    pickle.dump(sorted(list(use_mtypes)),
                open(os.path.join(out_path, 'tmp/mtype_list.p'), 'wb'))
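
# Illustrative sketch (not part of the original script): rev_powerset_slice
# is a repository helper whose implementation isn't shown here; the loop
# above pairs level combinations with variant counts so that their combined
# complexity stays constant as search_level grows. One plausible reading of
# that pairing, written with itertools — an assumption about the helper's
# semantics, not the repository's actual implementation:

from itertools import combinations


def powerset_slice_sketch(levels, search_level):
    # yield (level combination, comb_size) pairs where the combination
    # shrinks as comb_size grows, keeping their total complexity constant
    for comb_size in range(1, search_level + 1):
        lvl_size = search_level - comb_size + 1
        if lvl_size <= len(levels):
            for lvl_combn in combinations(levels, lvl_size):
                yield list(lvl_combn), comb_size
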