Example #1
def data_model(data_type):
    """Load data model and sample info data"""
    # passing arguments to fixtures (like data_type here), then using them
    # in tests isn't widely documented in pytest, but seems to work
    # see, e.g. https://stackoverflow.com/a/60148972
    tcga_data = TCGADataModel(training_data=data_type, debug=True, test=True)
    sample_info_df = du.load_sample_info(train_data_type=data_type)
    return tcga_data, sample_info_df
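
# aside: a minimal, self-contained sketch (not from this repository) of the
# parametrized-fixture pattern the comment above refers to; data_type is
# itself a fixture, so data_model-style fixtures can consume it like any
# other fixture dependency
import pytest

@pytest.fixture(params=['expression', 'me_27k'])
def data_type(request):
    return request.param

@pytest.fixture
def toy_data_model(data_type):
    # runs once per data_type parameter; the dict stands in for TCGADataModel
    return {'training_data': data_type}

def test_toy_data_model(toy_data_model):
    assert toy_data_model['training_data'] in ('expression', 'me_27k')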
Example #2
def generate_data_model(data_type, verbose=False):
    """Load data model and sample info data"""
    tcga_data = TCGADataModel(training_data=data_type,
                              test=True,
                              verbose=verbose)
    sample_info_df = du.load_sample_info(train_data_type=data_type,
                                         verbose=verbose)
    return tcga_data, sample_info_df
Example #3
    model_options.alphas = cfg.alphas
    model_options.l1_ratios = cfg.l1_ratios
    model_options.standardize_data_types = cfg.standardize_data_types
    model_options.shuffle_by_cancer_type = cfg.shuffle_by_cancer_type
    model_options.training_data = 'expression'
    model_options.overlap_data_types = ['expression']
    model_options.bc_titration = True

    return io_args, model_options


if __name__ == '__main__':

    # process command line arguments
    io_args, model_options = process_args()
    sample_info_df = du.load_sample_info(model_options.training_data,
                                         verbose=io_args.verbose)

    # create results dir and subdir for experiment if they don't exist
    experiment_dir = Path(io_args.results_dir, 'gene').resolve()
    experiment_dir.mkdir(parents=True, exist_ok=True)

    # save model options for this experiment
    # (hyperparameters, preprocessing info, etc)
    fu.save_model_options(experiment_dir, model_options)

    # create empty log file if it doesn't exist
    log_columns = ['gene', 'titration_ratio', 'shuffle_labels', 'skip_reason']
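    # the call that actually creates the empty log file is not shown in this
    # excerpt; one possible sketch, assuming io_args.log_file is a file path
    # and pandas is imported as pd:
    if not Path(io_args.log_file).exists():
        pd.DataFrame(columns=log_columns).to_csv(io_args.log_file,
                                                 sep='\t', index=False)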

    tcga_data = TCGADataModel(
        seed=model_options.seed,
        subset_mad_genes=model_options.subset_mad_genes,
Example #4
sns.boxplot(data=plot_df,
            x='signal',
            y='auroc',
            hue='training_data',
            ax=axarr[1])
axarr[1].set_title(
    'Binarized tumor purity prediction performance, by data type')
axarr[1].set_xlabel('Signal or shuffled')
axarr[1].set_ylabel('AUROC')
axarr[1].legend(title='Data type')

# ### Plot results faceted by cancer type

# In[5]:

sample_info_df = du.load_sample_info('expression')
results_df = au.load_purity_by_cancer_type(results_dir, sample_info_df)
print(results_df.training_data.unique())
results_df.head()

# In[6]:

top_cancer_types = (sample_info_df.groupby('cancer_type').count().drop(
    columns=['id_for_stratification']).rename(columns={
        'sample_type': 'count'
    }).sort_values(by='count', ascending=False))
top_cancer_types.head()

# In[7]:

sns.set({'figure.figsize': (15, 12)})
Example #5
def process_args():
    """Parse and format command line arguments."""

    parser = argparse.ArgumentParser()

    # argument group for parameters related to input/output
    # (e.g. filenames, logging/verbosity options, target genes)
    #
    # these don't affect the model output, and thus don't need to be saved
    # with the results of the experiment
    io = parser.add_argument_group(
        'io', 'arguments related to script input/output, '
        'note these will *not* be saved in metadata ')
    io.add_argument('--cancer_types',
                    nargs='*',
                    help='cancer types to predict, if not included predict '
                    'all cancer types in TCGA')
    io.add_argument('--log_file',
                    default=None,
                    help='name of file to log skipped cancer types to')
    io.add_argument('--output_preds', action='store_true')
    io.add_argument('--results_dir',
                    default=cfg.results_dirs['cancer_type'],
                    help='where to write results to')
    io.add_argument('--verbose', action='store_true')

    # argument group for parameters related to model training/evaluation
    # (e.g. model hyperparameters, preprocessing options)
    #
    # these affect the output of the model, so we want to save them in the
    # same directory as the experiment results
    opts = parser.add_argument_group(
        'model_options', 'parameters for training/evaluating model, '
        'these will affect output and are saved as '
        'experiment metadata ')
    opts.add_argument('--debug',
                      action='store_true',
                      help='use subset of data for fast debugging')
    opts.add_argument('--num_folds',
                      type=int,
                      default=4,
                      help='number of folds of cross-validation to run')
    opts.add_argument('--seed', type=int, default=cfg.default_seed)
    opts.add_argument(
        '--subset_mad_genes',
        type=int,
        default=cfg.num_features_raw,
        help='if included, subset gene features to this number of '
        'features having highest mean absolute deviation')
    opts.add_argument('--training_data',
                      type=str,
                      default='expression',
                      choices=list(cfg.data_types.keys()),
                      help='what data type to train model on')

    args = parser.parse_args()

    args.results_dir = Path(args.results_dir).resolve()

    if args.log_file is None:
        args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()

    # check that all provided cancer types are valid TCGA acronyms
    sample_info_df = du.load_sample_info(args.training_data, args.verbose)
    tcga_cancer_types = list(np.unique(sample_info_df.cancer_type))

    if args.cancer_types is None:
        args.cancer_types = tcga_cancer_types
    else:
        not_in_tcga = set(args.cancer_types) - set(tcga_cancer_types)
        if len(not_in_tcga) > 0:
            parser.error('some cancer types not present in TCGA: {}'.format(
                ' '.join(not_in_tcga)))

    # split args into defined argument groups, since we'll use them differently
    arg_groups = du.split_argument_groups(args, parser)
    io_args, model_options = arg_groups['io'], arg_groups['model_options']

    # add some additional hyperparameters/ranges from config file to model options
    # these shouldn't be changed by the user, so they aren't added as arguments
    model_options.n_dim = None
    model_options.alphas = cfg.alphas
    model_options.l1_ratios = cfg.l1_ratios
    model_options.standardize_data_types = cfg.standardize_data_types

    # add information about valid samples to model options
    model_options.sample_overlap_data_types = list(
        get_overlap_data_types(use_subsampled=model_options.debug).keys())

    return io_args, model_options, sample_info_df
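
# a rough sketch of what du.split_argument_groups (used above) might do; this
# is an assumption, not the repository's actual implementation. it relies on
# argparse's private _action_groups / _group_actions attributes, a commonly
# used but technically unsupported pattern.
import argparse

def split_argument_groups_sketch(args, parser):
    """Split a parsed Namespace into one Namespace per argument group."""
    arg_groups = {}
    for group in parser._action_groups:
        group_dict = {
            action.dest: getattr(args, action.dest)
            for action in group._group_actions
            if hasattr(args, action.dest)
        }
        arg_groups[group.title] = argparse.Namespace(**group_dict)
    return arg_groups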
Example #6
    return subsample_df


if __name__ == '__main__':
    p = argparse.ArgumentParser()
    p.add_argument('--dataset',
                   type=str,
                   choices=['expression', 'me_27k', 'all'],
                   default='all')
    p.add_argument('--verbose', action='store_true')
    args = p.parse_args()

    cfg.subsampled_data_dir.mkdir(parents=True, exist_ok=True)

    if args.dataset in ['expression', 'all']:
        sample_info_df = du.load_sample_info(train_data_type='expression',
                                             verbose=args.verbose)
        rnaseq_df = du.load_raw_data(train_data_type='expression',
                                     verbose=args.verbose)
        if args.verbose:
            print('Generating subsampled expression data...', end='')
        subsample_df = subsample_stratified(rnaseq_df, sample_info_df)
        subsample_df.to_csv(cfg.subsampled_expression,
                            sep='\t',
                            compression='gzip',
                            float_format='%.3g')
        if args.verbose:
            print('done')

    if args.dataset in ['me_27k', 'all']:
        sample_info_df = du.load_sample_info(train_data_type='me_27k',
                                             verbose=args.verbose)
Example #7
def calculate_gene_count(overlap_data_types, seeds, num_folds):
    """For a set of data types, calculate the number of valid genes."""
    gene_seed_list = []
    sample_info_df = du.load_sample_info('expression')
    for seed in seeds:
        tcga_data = TCGADataModel(seed=seed,
                                  overlap_data_types=overlap_data_types)
        genes_df = tcga_data.load_gene_set('vogelstein')
        for gene_ix, gene_series in genes_df.iterrows():
            
            print(gene_series.gene, file=sys.stderr)
            try:
                tcga_data.process_data_for_gene(gene_series.gene,
                                                gene_series.classification,
                                                None)
            except KeyError:
                continue
            y_ones = np.count_nonzero(tcga_data.y_df.status)
            y_zeroes = len(tcga_data.y_df.status) - y_ones
            print(y_ones, y_zeroes, file=sys.stderr)
            
            # check whether there are any valid cancer types; if not, record
            # the reason and continue to the next gene
            if tcga_data.X_df.shape[0] == 0:
                gene_seed_list.append((gene_series.gene, seed, False, 'no_valid_cancer_types'))
                continue
                
            # subset features to speed up CV
            tcga_data.X_df = tcga_data.X_df.iloc[:, :50]
                
            # if valid cancer types, look at CV folds and make sure each
            # has 0 and 1 labels
            gene_seed_valid = True
            reason = 'N/A'
            for fold_no in range(num_folds):
                with warnings.catch_warnings():
                    warnings.filterwarnings('ignore',
                                            message='The least populated class in y')
                    X_train, X_test, _ = cv.split_stratified(
                        tcga_data.X_df,
                        sample_info_df,
                        num_folds=num_folds,
                        fold_no=fold_no,
                        seed=seed
                    )
                y_train = tcga_data.y_df.reindex(X_train.index)
                y_test = tcga_data.y_df.reindex(X_test.index)
                
                # count 0/1 labels in y_train and y_test
                y_train_ones = np.count_nonzero(y_train.status)
                y_train_zeroes = len(y_train.status) - y_train_ones
                y_test_ones = np.count_nonzero(y_test.status)
                y_test_zeroes = len(y_test.status) - y_test_ones
                print(fold_no, y_train_ones, y_train_zeroes, y_test_ones, y_test_zeroes,
                      file=sys.stderr)
                
                if ((y_train_ones == 0) or (y_train_zeroes == 0)):
                    gene_seed_valid = False
                    reason = 'one_train_class'
                    break
                elif ((y_test_ones == 0) or (y_test_zeroes == 0)):
                    gene_seed_valid = False
                    reason = 'one_test_class'
                    break
                    
            gene_seed_list.append((gene_series.gene, seed, gene_seed_valid, reason))
                
    return gene_seed_list
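
# example usage (a sketch): summarize valid (gene, seed) combinations for a
# single overlap data type; assumes pandas is imported as pd elsewhere
gene_seed_list = calculate_gene_count(['expression'], seeds=[42, 1], num_folds=4)
valid_df = pd.DataFrame(gene_seed_list,
                        columns=['gene', 'seed', 'valid', 'reason'])
print(valid_df.valid.sum(), 'valid gene/seed combinations', file=sys.stderr)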
Example #8
import sys

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import mpmp.config as cfg
import mpmp.utilities.data_utilities as du

# In[2]:

DATA_TYPE = 'mut_sigs'

# load gene/classification info and sample/cancer type info
print('Loading gene label data...', file=sys.stderr)
genes_df = du.load_vogelstein()
sample_info_df = du.load_sample_info(DATA_TYPE, verbose=True)

# load mutation info
# this returns a tuple of dataframes, unpack it below
pancancer_data = du.load_pancancer_data(verbose=True)
(sample_freeze_df, mutation_df, copy_loss_df, copy_gain_df,
 mut_burden_df) = pancancer_data

# In[3]:

# load relevant data
data_df = du.load_raw_data(DATA_TYPE, verbose=True)

# standardize columns of expression dataframe
if DATA_TYPE in cfg.standardize_data_types:
    print('Standardizing columns of {} data...'.format(DATA_TYPE),
Example #9
# this is the number of valid genes in the Vogelstein gene set
NUM_GENES = 85

# sample random genes from the set of genes that have at least NUM_CANCERS
# valid cancer types
#
# if we sampled them randomly from all genes, it's likely that many of them
# would end up with no valid cancer types (i.e. not enough mutations to train
# a classifier), so we add this criterion to make sure each sampled gene has
# at least one
NUM_CANCERS = 1
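
# a minimal, self-contained sketch of the filter-then-sample idea described
# above (toy counts only; the real per-gene counts come from the mutation
# data loaded below, not from this dictionary)
import numpy as np
import pandas as pd

toy_gene_cancer_counts = pd.Series({'TP53': 12, 'KRAS': 7, 'FAKE1': 0})
eligible_genes = toy_gene_cancer_counts[
    toy_gene_cancer_counts >= NUM_CANCERS].index
sampled_genes = np.random.choice(eligible_genes,
                                 size=min(NUM_GENES, len(eligible_genes)),
                                 replace=False)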

# ### Load mutation and sample/cancer type info

# In[3]:

sample_info_df = du.load_sample_info('expression', verbose=True)
pancancer_data = du.load_pancancer_data(verbose=True)
mutation_df = pancancer_data[1]
mut_burden_df = pancancer_data[4]
print(sample_info_df.shape)
print(mutation_df.shape)
print(mut_burden_df.shape)

# In[4]:

# merge sample info and mutation burden info
hyper_filter = 5

print(mutation_df.shape)

mutations_df = (mutation_df.merge(sample_info_df,
Example #10
                           compare_results_df['nlog10_p'],
                           compare_results_df.identifier,
                           compare_results_df.reject_null, plt.gca())
adjust_text(text_labels, ax=plt.gca())

# ## Confusion matrix

# In[10]:

import os

import mpmp.utilities.data_utilities as du

preds_dir = os.path.join(cfg.results_dirs['cancer_type'], 'results_preds',
                         'cancer_type')
sample_info_df = du.load_sample_info('expression')

preds_expression_df = au.load_preds_to_matrix(preds_dir,
                                              sample_info_df,
                                              training_data='expression')
print(preds_expression_df.shape)
preds_expression_df.iloc[:5, :5]

# In[11]:

sns.set({'figure.figsize': (15, 10)})
ax = sns.heatmap(
    preds_expression_df,
    cbar_kws={
        'label':
        'Predicted probability of positive label, averaged over samples'
Example #11
def process_args():
    """Parse and format command line arguments."""

    parser = argparse.ArgumentParser()

    # argument group for parameters related to input/output
    # (e.g. filenames, logging/verbosity options, target genes)
    #
    # these don't affect the model output, and thus don't need to be saved
    # with the results of the experiment
    io = parser.add_argument_group(
        'io', 'arguments related to script input/output, '
        'note these will *not* be saved in metadata ')
    io.add_argument(
        '--cancer_types',
        nargs='*',
        default=['all_cancer_types'],
        help='cancer types to run, \'pancancer\' for a pan-cancer model '
        'combining cancer types, default is all individual TCGA '
        'cancer types + pan-cancer model')
    io.add_argument('--log_file',
                    default=None,
                    help='name of file to log skipped cancer types to')
    io.add_argument('--output_survival_fn', action='store_true')
    io.add_argument('--results_dir',
                    default=cfg.results_dirs['survival'],
                    help='where to write results to')
    io.add_argument('--verbose', action='store_true')

    # argument group for parameters related to model training/evaluation
    # (e.g. model hyperparameters, preprocessing options)
    #
    # these affect the output of the model, so we want to save them in the
    # same directory as the experiment results
    opts = parser.add_argument_group(
        'model_options', 'parameters for training/evaluating model, '
        'these will affect output and are saved as '
        'experiment metadata ')
    opts.add_argument('--debug',
                      action='store_true',
                      help='use subset of data for fast debugging')
    opts.add_argument(
        '--fit_ridge',
        action='store_true',
        help='if included, fit ridge-regularized survival model instead '
        'of elastic net model. this tends to converge slightly faster '
        'and more robustly on smaller feature sets, but may fit slowly '
        'or not at all on large sets of features')
    opts.add_argument(
        '--n_dim',
        default=None,
        help='number of compressed components/dimensions to use, '
        'None to use raw features')
    opts.add_argument('--num_folds',
                      type=int,
                      default=4,
                      help='number of folds of cross-validation to run')
    opts.add_argument('--overlap_data_types',
                      nargs='*',
                      default=['expression'],
                      help='data types to define set of samples to use; e.g. '
                      'set of data types for a model comparison, use only '
                      'overlapping samples from these data types')
    opts.add_argument('--seed', type=int, default=cfg.default_seed)
    opts.add_argument(
        '--subset_mad_genes',
        type=int,
        default=cfg.num_features_raw,
        help='if included, subset gene features to this number of '
        'features having highest mean absolute deviation')
    opts.add_argument('--training_data',
                      type=str,
                      default='expression',
                      choices=list(cfg.data_types.keys()) + ([
                          'baseline', 'vogelstein_mutations',
                          'significant_mutations', 'mutation_preds_expression',
                          'mutation_preds_me_27k', 'mutation_preds_me_450k'
                      ]),
                      help='what data type to train model on')

    args = parser.parse_args()

    args.results_dir = Path(args.results_dir).resolve()

    if args.log_file is None:
        args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()

    if args.n_dim is not None:
        args.n_dim = int(args.n_dim)

    if args.training_data == 'baseline':
        sample_info_df = (du.load_sample_info('expression',
                                              verbose=args.verbose))
    else:
        sample_info_df = (du.load_sample_info(args.training_data,
                                              verbose=args.verbose))

    tcga_cancer_types = list(np.unique(sample_info_df.cancer_type))
    tcga_cancer_types.append('pancancer')
    if 'all_cancer_types' in args.cancer_types:
        args.cancer_types = tcga_cancer_types
    else:
        not_in_tcga = set(args.cancer_types) - set(tcga_cancer_types)
        if len(not_in_tcga) > 0:
            parser.error('some cancer types not present in TCGA: {}'.format(
                ' '.join(not_in_tcga)))

    # check that all data types in overlap_data_types are valid
    check_all_data_types(parser, args.overlap_data_types, args.debug)

    # split args into defined argument groups, since we'll use them differently
    arg_groups = du.split_argument_groups(args, parser)
    io_args, model_options = arg_groups['io'], arg_groups['model_options']

    # add some additional hyperparameters/ranges from config file to model options
    # these shouldn't be changed by the user, so they aren't added as arguments
    model_options.max_iter = cfg.max_iter_map['survival']
    model_options.alphas = cfg.alphas_map['survival']
    model_options.l1_ratios = cfg.l1_ratios_map['survival']
    model_options.standardize_data_types = cfg.standardize_data_types
    model_options.shuffle_by_cancer_type = cfg.shuffle_by_cancer_type

    return io_args, model_options, sample_info_df
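
# example invocation of the script this process_args() belongs to (a sketch;
# the script filename is not shown in this excerpt, so the name below is a
# placeholder):
#
#   python run_survival.py --training_data expression \
#       --overlap_data_types expression me_27k \
#       --fit_ridge --num_folds 4 --verbose
#
# all of these flags are defined in the argument groups constructed above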
Example #12
    def _load_data(self,
                   train_data_type,
                   compressed_data=False,
                   standardize_input=False,
                   n_dim=None,
                   sample_info_df=None,
                   debug=False,
                   test=False):
        """Load and store relevant data.

        This data does not vary based on the gene/cancer type being considered
        (i.e. it can be loaded only once when the class is instantiated).

        Arguments:
        ----------
        debug (bool): whether or not to subset data for faster debugging
        test (bool): whether or not to subset columns in mutation data, for testing
        """
        # first load and unpack pancancer mutation/CNV/TMB data
        # this data is described in more detail in the load_pancancer_data docstring
        if test:
            # for testing, just load a subset of pancancer data,
            # this is much faster than loading mutation data for all genes
            import mpmp.test_config as tcfg
            pancan_data = du.load_pancancer_data(
                verbose=self.verbose,
                test=True,
                subset_columns=tcfg.test_genes)
        else:
            pancan_data = du.load_pancancer_data(verbose=self.verbose)

        (self.sample_freeze_df, self.mutation_df, self.copy_loss_df,
         self.copy_gain_df, self.mut_burden_df) = pancan_data

        # now load training data
        if not isinstance(train_data_type, str):
            # if a list of train data types is provided, we have to load each
            # of them and concatenate columns
            # n_dim should be a list here
            self.data_df, self.data_types = du.load_multiple_data_types(
                train_data_type,
                n_dims=n_dim,
                standardize_input=standardize_input,
                verbose=self.verbose)
        elif compressed_data:
            self.data_df = du.load_compressed_data(
                train_data_type,
                n_dim=n_dim,
                verbose=self.verbose,
                standardize_input=standardize_input,
                load_subset=(debug or test))
        elif train_data_type == 'baseline':
            # we just want to use non-omics covariates as a baseline
            # so here, get sample list for expression data, then create an
            # empty data frame using it as an index
            if sample_info_df is None:
                sample_info_df = du.load_sample_info('expression',
                                                     verbose=self.verbose)
            self.data_df = pd.DataFrame(index=sample_info_df.index)
        else:
            if train_data_type == 'vogelstein_mutations':
                self.data_df = self._load_vogelstein_mutation_matrix()
            elif train_data_type == 'significant_mutations':
                data_df = self._load_vogelstein_mutation_matrix()
                sig_genes = du.load_significant_genes('methylation')
                # startswith() with a tuple argument returns True if
                # the string matches any of the prefixes in the tuple
                # https://stackoverflow.com/a/20461857
                self.data_df = data_df.loc[:,
                                           data_df.columns.str.
                                           startswith(tuple(sig_genes))]
            elif 'mutation_preds' in train_data_type:
                self.data_df = du.load_mutation_predictions(train_data_type)
            else:
                self.data_df = du.load_raw_data(train_data_type,
                                                verbose=self.verbose,
                                                load_subset=(debug or test))

        if sample_info_df is None:
            self.sample_info_df = du.load_sample_info(train_data_type,
                                                      verbose=self.verbose)
        else:
            # sometimes we load sample info in the calling script as part of
            # argument processing, etc
            # in that case, we don't need to load it again
            self.sample_info_df = sample_info_df
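

# aside: a quick standalone check of the startswith-with-a-tuple behavior
# used in the 'significant_mutations' branch above (not part of the class)
assert 'TP53_mut'.startswith(('TP53', 'KRAS'))
assert not 'BRCA1_mut'.startswith(('TP53', 'KRAS'))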