# NOTE(review): this excerpt begins mid-structure -- the `except` below closes a
# try/loop that starts before the visible chunk; its true indentation is unknown.
except KeyError: continue

# Deduplicate legend entries: several curves can share a label, so map
# label -> handle (a dict keeps one handle per label) before drawing the legend.
handles, labels = plt.gca().get_legend_handles_labels()
by_label = dict(zip(labels, handles))
plt.legend(by_label.values(), by_label.keys(), title='Stage at diagnosis')
plt.xlabel('Time (days)')
plt.ylabel('Predicted survival probability')

# ### Color survival curves by cancer subtype
#
# Here, we color the survival curves by cancer subtype. Generally, there are
# pronounced differences in survival between subtypes of any given cancer,
# although the exact differences depend on the cancer type.

# In[10]:

# reload the pancancer sample freeze (sample -> cancer type/subtype table);
# load_pancancer_data() returns a tuple and element 0 is the sample freeze df
sample_freeze_df = du.load_pancancer_data()[0]
sample_freeze_df.head()

# In[11]:

# map each subtype of the current cancer type to a stable integer index
# (sorted so the index assignment is deterministic across runs)
subtypes = sample_freeze_df[sample_freeze_df.DISEASE == cancer_type].SUBTYPE.unique()
subtype_to_ix = {st: ix for ix, st in enumerate(sorted(subtypes))}
print(subtype_to_ix)

# In[12]:

sns.set({'figure.figsize': (10, 8)})

# color by stage at diagnosis
# NOTE(review): the comment above says "stage" but this cell builds
# subtype_to_ix -- presumably the curves are colored by subtype here; confirm.
# The loop body continues past this excerpt.
for ix in range(len(fns)):
# In[2]:

# Look up TCGA cancer-type and sample-type code tables from barcodes.
(cancer_types_df,
 cancertype_codes_dict,
 sample_types_df,
 sampletype_codes_dict) = tu.get_tcga_barcode_info()
cancer_types_df.head(2)

# In[3]:

sample_types_df.head(2)

# ### Load and process somatic mutation data

# In[4]:

pancan_data = du.load_pancancer_data(verbose=True)
sample_freeze_df = pancan_data[0]

# sample barcodes must be unique before they can become the index;
# compute the duplicate count once instead of twice (print + assert)
n_duplicates = sample_freeze_df.duplicated(['SAMPLE_BARCODE']).sum()
print(n_duplicates)
assert n_duplicates == 0

# re-key the sample freeze on barcode, renamed to the project-wide 'sample_id'
sample_freeze_df.set_index('SAMPLE_BARCODE', inplace=True)
sample_freeze_df.index.rename('sample_id', inplace=True)
sample_freeze_df.head()

# ### Process TCGA cancer type and sample type info from barcodes
#
# See https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tissue-source-site-codes for more details.

# In[5]:
import sys

# get sample list for each -omics data type
sample_lists = {}
for training_data, sample_info_file in cfg.sample_infos.items():
    # sample info files are TSVs whose first column is the sample id
    samples = pd.read_csv(sample_info_file, sep='\t', index_col=0).index
    try:
        sample_lists[data_map[training_data]] = set(samples)
    except KeyError:
        # bias-corrected results, ignore them here
        # (import hoisted out of the loop; log skipped entries to stderr
        # so stdout stays clean)
        print(training_data, file=sys.stderr)
        continue

# In[4]:

# add mutation data to sample list
pancan_data = du.load_pancancer_data()
(sample_freeze_df,
 mutation_df,
 copy_loss_df,
 copy_gain_df,
 mut_burden_df) = pancan_data

print(sample_freeze_df.shape)
print(mutation_df.shape)
print(copy_loss_df.shape)
print(copy_gain_df.shape)
print(mut_burden_df.shape)

# In[5]:

# all these dfs contain the same samples, so just use one of the indexes
sample_lists['mutation'] = set(mutation_df.index)

# In[6]:
def _load_data(self,
               train_data_type,
               compressed_data=False,
               standardize_input=False,
               n_dim=None,
               sample_info_df=None,
               debug=False,
               test=False):
    """Load and store relevant data.

    This data does not vary based on the gene/cancer type being considered
    (i.e. it can be loaded only once when the class is instantiated).

    Arguments:
    ----------
    train_data_type (str or list): which -omics data type(s) to load; a list
        triggers multi-modal loading with concatenated columns
    compressed_data (bool): load dimension-reduced data instead of raw data
    standardize_input (bool): passed through to the data loaders
        # presumably z-scores features -- semantics live in du.*; confirm there
    n_dim: target dimensionality for compressed/multi-modal loading
        (a list when train_data_type is a list)
    sample_info_df (pd.DataFrame or None): pre-loaded sample info, indexed by
        sample id; if None it is loaded here
    debug (bool): whether or not to subset data for faster debugging
    test (bool): whether or not to subset columns in mutation data, for testing
    """
    # first load and unpack pancancer mutation/CNV/TMB data
    # this data is described in more detail in the load_pancancer_data docstring
    if test:
        # for testing, just load a subset of pancancer data,
        # this is much faster than loading mutation data for all genes
        import mpmp.test_config as tcfg
        pancan_data = du.load_pancancer_data(
            verbose=self.verbose,
            test=True,
            subset_columns=tcfg.test_genes)
    else:
        pancan_data = du.load_pancancer_data(verbose=self.verbose)

    (self.sample_freeze_df,
     self.mutation_df,
     self.copy_loss_df,
     self.copy_gain_df,
     self.mut_burden_df) = pancan_data

    # now load training data
    if not isinstance(train_data_type, str):
        # if a list of train data types is provided, we have to load each
        # of them and concatenate columns
        # n_dim should be a list here
        self.data_df, self.data_types = du.load_multiple_data_types(
            train_data_type,
            n_dims=n_dim,
            standardize_input=standardize_input,
            verbose=self.verbose)
    elif compressed_data:
        self.data_df = du.load_compressed_data(
            train_data_type,
            n_dim=n_dim,
            verbose=self.verbose,
            standardize_input=standardize_input,
            load_subset=(debug or test))
    elif train_data_type == 'baseline':
        # we just want to use non-omics covariates as a baseline
        # so here, get sample list for expression data, then create an
        # empty data frame using it as an index
        if sample_info_df is None:
            sample_info_df = du.load_sample_info('expression',
                                                 verbose=self.verbose)
        self.data_df = pd.DataFrame(index=sample_info_df.index)
    else:
        # single (uncompressed) data type: dispatch on a few special names,
        # falling back to the generic raw-data loader
        if train_data_type == 'vogelstein_mutations':
            self.data_df = self._load_vogelstein_mutation_matrix()
        elif train_data_type == 'significant_mutations':
            data_df = self._load_vogelstein_mutation_matrix()
            sig_genes = du.load_significant_genes('methylation')
            # startswith() with a tuple argument returns True if
            # the string matches any of the prefixes in the tuple
            # https://stackoverflow.com/a/20461857
            self.data_df = data_df.loc[:, data_df.columns.str.
                                       startswith(tuple(sig_genes))]
        elif 'mutation_preds' in train_data_type:
            self.data_df = du.load_mutation_predictions(train_data_type)
        else:
            self.data_df = du.load_raw_data(train_data_type,
                                            verbose=self.verbose,
                                            load_subset=(debug or test))

    if sample_info_df is None:
        self.sample_info_df = du.load_sample_info(train_data_type,
                                                  verbose=self.verbose)
    else:
        # sometimes we load sample info in the calling script as part of
        # argument processing, etc
        # in that case, we don't need to load it again
        self.sample_info_df = sample_info_df