def generate_data_model(data_type, verbose=False):
    """Build a data model and load the sample info for one data type.

    Arguments
    ---------
    data_type: passed to TCGADataModel as ``training_data`` and to
               ``du.load_sample_info`` as ``train_data_type``
    verbose: forwarded unchanged to both calls

    Returns
    -------
    (model, sample_info): the TCGADataModel instance (constructed with
                          ``test=True``) and the loaded sample info
    """
    model_kwargs = dict(training_data=data_type, test=True, verbose=verbose)
    model = TCGADataModel(**model_kwargs)
    sample_info = du.load_sample_info(train_data_type=data_type,
                                      verbose=verbose)
    return model, sample_info
def data_model(data_type):
    """Build a debug/test data model and load sample info for data_type.

    Returns a ``(model, sample_info)`` tuple: the TCGADataModel instance
    (constructed with ``debug=True`` and ``test=True``) and the result of
    ``du.load_sample_info`` for the same data type.
    """
    # NOTE: handing an argument (data_type) to a fixture and then using
    # the fixture in tests is not widely documented in pytest, but it
    # seems to work; see, e.g. https://stackoverflow.com/a/60148972
    model = TCGADataModel(
        training_data=data_type,
        debug=True,
        test=True,
    )
    sample_info = du.load_sample_info(train_data_type=data_type)
    return model, sample_info
# create results dir and subdir for experiment if they don't exist experiment_dir = Path(io_args.results_dir, 'gene').resolve() experiment_dir.mkdir(parents=True, exist_ok=True) # save model options for this experiment # (hyperparameters, preprocessing info, etc) fu.save_model_options(experiment_dir, model_options) # create empty log file if it doesn't exist log_columns = ['gene', 'titration_ratio', 'shuffle_labels', 'skip_reason'] tcga_data = TCGADataModel( seed=model_options.seed, subset_mad_genes=model_options.subset_mad_genes, training_data=model_options.training_data, overlap_data_types=model_options.overlap_data_types, sample_info_df=sample_info_df, verbose=io_args.verbose, debug=model_options.debug) genes_df = tcga_data.load_gene_set(io_args.gene_set) # we want to run mutation prediction experiments: # - for true labels and shuffled labels # (shuffled labels acts as our lower baseline) # - for all genes in the given gene set for shuffle_labels in (False, True): print('shuffle_labels: {}'.format(shuffle_labels)) outer_progress = tqdm(genes_df.iterrows(), total=genes_df.shape[0],
experiment_dir = Path(io_args.results_dir, 'cancer_type').resolve() experiment_dir.mkdir(parents=True, exist_ok=True) # save model options for this experiment # (hyperparameters, preprocessing info, etc) fu.save_model_options(experiment_dir, model_options) # create empty error log file if it doesn't exist log_columns = [ 'cancer_type', 'training_data', 'shuffle_labels', 'skip_reason' ] # load data matrix for the specified data type tcga_data = TCGADataModel(seed=model_options.seed, subset_mad_genes=model_options.subset_mad_genes, training_data=model_options.training_data, sample_info_df=sample_info_df, verbose=io_args.verbose, debug=model_options.debug) # we want to run cancer type classification experiments: # - for true labels and shuffled labels # (shuffled labels acts as our lower baseline) # - for all cancer types in the given list of TCGA cancers for shuffle_labels in (False, True): print('shuffle_labels: {}'.format(shuffle_labels)) progress = tqdm(io_args.cancer_types, total=len(io_args.cancer_types), ncols=100, file=sys.stdout)
# create results dir and subdir for experiment if they don't exist experiment_dir = Path(io_args.results_dir, 'gene').resolve() experiment_dir.mkdir(parents=True, exist_ok=True) # save model options for this experiment # (hyperparameters, preprocessing info, etc) fu.save_model_options(experiment_dir, model_options) # create empty log file if it doesn't exist log_columns = ['gene', 'training_data', 'shuffle_labels', 'skip_reason'] tcga_data = TCGADataModel( seed=model_options.seed, training_data=model_options.training_data, overlap_data_types=model_options.overlap_data_types, sample_info_df=sample_info_df, verbose=io_args.verbose, debug=model_options.debug) genes_df = tcga_data.load_gene_set(io_args.gene_set) # we want to run mutation prediction experiments: # - for true labels and shuffled labels # (shuffled labels acts as our lower baseline) # - for all genes in the given gene set for shuffle_labels in (False, True): print('shuffle_labels: {}'.format(shuffle_labels)) progress = tqdm(genes_df.iterrows(), total=genes_df.shape[0],
experiment_dir = Path(io_args.results_dir, 'gene').resolve() experiment_dir.mkdir(parents=True, exist_ok=True) # save model options for this experiment # (hyperparameters, preprocessing info, etc) fu.save_model_options(experiment_dir, model_options) # create empty log file if it doesn't exist log_columns = ['gene', 'training_data', 'shuffle_labels', 'skip_reason'] tcga_data = TCGADataModel( seed=model_options.seed, subset_mad_genes=model_options.subset_mad_genes, training_data=model_options.training_data, overlap_data_types=model_options.overlap_data_types, # standardize all data types standardize_input=[True] * len(model_options.training_data), n_dim=model_options.n_dim, sample_info_df=sample_info_df, verbose=io_args.verbose, debug=model_options.debug) genes_df = tcga_data.load_gene_set(io_args.gene_set) # we want to run mutation prediction experiments: # - for true labels and shuffled labels # (shuffled labels acts as our lower baseline) # - for all genes in the given gene set for shuffle_labels in (False, True): print('shuffle_labels: {}'.format(shuffle_labels))
# (hyperparameters, preprocessing info, etc) fu.save_model_options(experiment_dir, model_options) # create empty log file if it doesn't exist log_columns = ['gene', 'training_data', 'skip_reason'] if io_args.log_file.exists() and io_args.log_file.is_file(): log_df = pd.read_csv(io_args.log_file, sep='\t') else: log_df = pd.DataFrame(columns=log_columns) log_df.to_csv(io_args.log_file, sep='\t') tcga_data = TCGADataModel( seed=model_options.seed, subset_mad_genes=model_options.subset_mad_genes, training_data=model_options.training_data, overlap_data_types=model_options.overlap_data_types, load_compressed_data=True, standardize_input=True, n_dim=N_DIM, sample_info_df=sample_info_df, verbose=io_args.verbose) genes_df = tcga_data.load_gene_set('vogelstein') progress = tqdm(genes_df.iterrows(), total=genes_df.shape[0], ncols=100, file=sys.stdout) all_genes = [] all_preds = [] sample_list = None
# (hyperparameters, preprocessing info, etc) fu.save_model_options(experiment_dir, model_options, classify=model_options.classify) # create empty log file if it doesn't exist log_columns = [ 'training_data', 'shuffle_labels', 'skip_reason' ] log_df = None tcga_data = TCGADataModel(seed=model_options.seed, subset_mad_genes=model_options.subset_mad_genes, training_data=model_options.training_data, load_compressed_data=model_options.use_compressed, n_dim=model_options.n_dim, sample_info_df=sample_info_df, verbose=io_args.verbose, debug=model_options.debug) # we want to run purity prediction experiments for true labels and # shuffled labels (the latter as a lower baseline) progress = tqdm([False, True], ncols=100, file=sys.stdout) for shuffle_labels in progress: progress.set_description('shuffle labels: {}'.format(shuffle_labels)) try: output_dir = fu.make_output_dir(experiment_dir, '') check_file = fu.check_output_file(output_dir,
def calculate_gene_count(overlap_data_types, seeds, num_folds):
    """Check, per gene and seed, whether every CV fold has both labels.

    For each seed a TCGADataModel is built with the given
    ``overlap_data_types`` and the 'vogelstein' gene set is loaded. Each
    gene is processed, and every one of the ``num_folds`` stratified folds
    is inspected to make sure both its train and test labels contain at
    least one 0 and at least one 1.

    Arguments
    ---------
    overlap_data_types: forwarded to TCGADataModel
    seeds: iterable of seeds; each is used for the data model and the split
    num_folds: number of cross-validation folds to inspect

    Returns
    -------
    list of (gene, seed, is_valid, reason) tuples, where reason is one of
    'N/A', 'no_valid_cancer_types', 'one_train_class', 'one_test_class'.
    Genes for which ``process_data_for_gene`` raises KeyError are skipped
    and get no entry in the list.

    Progress/diagnostic counts are printed to stderr as a side effect.
    """
    results = []
    sample_info_df = du.load_sample_info('expression')

    for seed in seeds:
        tcga_data = TCGADataModel(seed=seed,
                                  overlap_data_types=overlap_data_types)
        genes_df = tcga_data.load_gene_set('vogelstein')

        for _, gene_row in genes_df.iterrows():
            gene = gene_row.gene
            print(gene, file=sys.stderr)

            try:
                tcga_data.process_data_for_gene(gene,
                                                gene_row.classification,
                                                None)
            except KeyError:
                # gene could not be processed; leave it out of the results
                continue

            n_ones, n_zeroes = _count_binary_labels(tcga_data.y_df.status)
            print(n_ones, n_zeroes, file=sys.stderr)

            # no samples left means no valid cancer types: record the gene
            # as invalid and move on to the next one
            if tcga_data.X_df.shape[0] == 0:
                results.append(
                    (gene, seed, False, 'no_valid_cancer_types'))
                continue

            # keep only the first 50 features to speed up CV
            tcga_data.X_df = tcga_data.X_df.iloc[:, :50]

            # walk the CV folds; stop at the first fold whose train or
            # test labels are missing one of the two classes
            verdict = (True, 'N/A')
            for fold_no in range(num_folds):
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        'ignore',
                        message='The least populated class in y')
                    X_train, X_test, _ = cv.split_stratified(
                        tcga_data.X_df,
                        sample_info_df,
                        num_folds=num_folds,
                        fold_no=fold_no,
                        seed=seed)

                train_labels = tcga_data.y_df.reindex(X_train.index)
                test_labels = tcga_data.y_df.reindex(X_test.index)

                train_ones, train_zeroes = _count_binary_labels(
                    train_labels.status)
                test_ones, test_zeroes = _count_binary_labels(
                    test_labels.status)
                print(fold_no, train_ones, train_zeroes,
                      test_ones, test_zeroes, file=sys.stderr)

                if train_ones == 0 or train_zeroes == 0:
                    verdict = (False, 'one_train_class')
                    break
                if test_ones == 0 or test_zeroes == 0:
                    verdict = (False, 'one_test_class')
                    break

            results.append((gene, seed) + verdict)

    return results


def _count_binary_labels(status):
    """Return (count of nonzero labels, count of remaining labels)."""
    nonzero = np.count_nonzero(status)
    return nonzero, len(status) - nonzero
fu.save_model_options(experiment_dir, model_options) # create empty log file if it doesn't exist log_columns = [ 'cancer_type', 'training_data', 'shuffle_labels', 'skip_reason' ] log_df = None tcga_data = TCGADataModel(seed=model_options.seed, subset_mad_genes=model_options.subset_mad_genes, training_data=model_options.training_data, overlap_data_types=model_options.overlap_data_types, load_compressed_data=model_options.use_compressed, n_dim=model_options.n_dim, sample_info_df=sample_info_df, verbose=io_args.verbose, debug=model_options.debug) # we want to run MSI prediction experiments for true labels and # shuffled labels (the latter as a lower baseline) for shuffle_labels in (False, True): print('shuffle labels: {}'.format(shuffle_labels)) progress = tqdm(io_args.cancer_types, total=len(io_args.cancer_types), ncols=100, file=sys.stdout)
# save model options for this experiment # (hyperparameters, preprocessing info, etc) fu.save_model_options(experiment_dir, model_options, 'survival') # create empty log file if it doesn't exist log_columns = [ 'cancer_type', 'training_data', 'shuffle_labels', 'skip_reason' ] tcga_data = TCGADataModel( seed=model_options.seed, subset_mad_genes=model_options.subset_mad_genes, training_data=model_options.training_data, overlap_data_types=model_options.overlap_data_types, load_compressed_data=(model_options.n_dim is not None), standardize_input=(model_options.n_dim is not None and model_options.training_data in cfg.standardize_data_types), n_dim=model_options.n_dim, sample_info_df=sample_info_df, verbose=io_args.verbose, debug=model_options.debug) # we want to run survival prediction experiments: # - for true labels and shuffled labels # (shuffled labels acts as our lower baseline) # - for all cancer types provided for shuffle_labels in (False, True): print('shuffle_labels: {}'.format(shuffle_labels))