plot_metric = 'auroc' images_dir = Path(cfg.images_dirs['multimodal'], 'auroc') else: plot_metric = 'aupr' images_dir = Path(cfg.images_dirs['multimodal']) # ## Results with compressed features (figures in main paper) # # We'll also look at results with raw features later, those figures go in the supplement. # # ### Compare raw results # In[3]: # load raw data results_df = au.load_stratified_prediction_results(results_dir, 'gene') # drop TET2 for now results_df = results_df[~(results_df.identifier == 'TET2')].copy() # make sure that we have data for all data types and for two replicates (random seeds) print(results_df.shape) print(results_df.seed.unique()) print(results_df.identifier.unique()) print(results_df.training_data.unique()) results_df.head() # In[4]: # each subplot will show results for one gene sns.set({'figure.figsize': (24, 12)})
import mpmp.utilities.plot_utilities as plu # In[2]: # set results directory old_results_dir = Path(cfg.results_dirs['mutation'], 'methylation_results', 'gene').resolve() new_results_dir = Path(cfg.results_dirs['mutation'], 'methylation_results_shuffle_cancer_type', 'gene').resolve() # In[3]: # load raw data old_results_df = au.load_stratified_prediction_results(old_results_dir, 'gene') new_results_df = au.load_stratified_prediction_results(new_results_dir, 'gene') # here we want to use compressed data for methylation datasets (27k and 450k) # the results in 02_classify_compressed/compressed_vs_raw_results.ipynb show that # performance is equal or slightly better for PCA compressed methylation data, # and it's much easier/faster to fit models on old_results_df = old_results_df[old_results_df.training_data.isin( ['expression'])].copy() new_results_df = new_results_df[new_results_df.training_data.isin( ['expression'])].copy() old_compressed_results_df = au.load_compressed_prediction_results( old_results_dir, 'gene', old_filenames=True) # load compressed data for me_27k and me_450k old_compressed_results_df = old_compressed_results_df[
# if True, save figures to ./images directory
SAVE_FIGS = True

# if True, plot AUROC instead of AUPR
PLOT_AUROC = False
if PLOT_AUROC:
    plot_metric = 'auroc'
    images_dir = Path(cfg.images_dirs['mutation'], 'auroc')
else:
    plot_metric = 'aupr'
    images_dir = Path(cfg.images_dirs['mutation'])


# In[3]:


# load raw data
vogelstein_df = au.load_stratified_prediction_results(vogelstein_results_dir, 'gene')
# keep only the rows trained on expression data; take an explicit copy so the
# 'gene_set' assignment below writes to an independent frame instead of a
# slice of the loaded results (otherwise pandas may emit SettingWithCopyWarning)
vogelstein_df = vogelstein_df[vogelstein_df.training_data.isin(['expression'])].copy()
vogelstein_df['gene_set'] = 'vogelstein'

# make sure that we're correctly pointing to raw data for non-methylation data types
# and that we have data for two replicates (two random seeds)
print(vogelstein_df.shape)
print(vogelstein_df.seed.unique())
print(vogelstein_df.training_data.unique())
vogelstein_df.head()


# In[4]:


# load raw data
top_50_df = au.load_stratified_prediction_results(top_50_results_dir, 'gene')
# same expression-only filter as vogelstein_df above; copied as well so this
# frame is also independent of the loaded results
top_50_df = top_50_df[top_50_df.training_data.isin(['expression'])].copy()
# set results directories (one per BMIQ experiment subdirectory)
# this is a mess, TODO move results into same location
results_dir1, results_dir2, results_dir3 = (
    Path(cfg.results_dirs['mutation'], subdir, 'gene').resolve()
    for subdir in ('bmiq_results', 'bmiq_results_2', 'bmiq_results_me_control')
)

# set significance cutoff after FDR correction
SIG_ALPHA = 0.001


# In[3]:


# load raw data from each of the three results directories
results_df1, results_df2, results_df3 = (
    au.load_stratified_prediction_results(directory, 'gene')
    for directory in (results_dir1, results_dir2, results_dir3)
)

# in the first two result sets, relabel 'me_27k' rows as 'me_27k_corrected';
# the third result set keeps its original training_data labels
results_df1.loc[
    results_df1['training_data'].eq('me_27k'), 'training_data'
] = 'me_27k_corrected'
results_df2.loc[
    results_df2['training_data'].eq('me_27k'), 'training_data'
] = 'me_27k_corrected'

# stack the three result sets into a single dataframe
results_df = pd.concat([results_df1, results_df2, results_df3])

print(results_df.shape)
print(results_df['seed'].unique())
print(results_df['training_data'].unique())
results_df.head()


# In[4]:
from adjustText import adjust_text import mpmp.config as cfg import mpmp.utilities.analysis_utilities as au # In[2]: # set results directory results_dir = Path(cfg.results_dirs['cancer_type'], 'cancer_type').resolve() # set significance cutoff after FDR correction SIG_ALPHA = 0.001 # In[3]: # load raw data results_df = au.load_stratified_prediction_results(results_dir, 'cancer_type') print(results_df.shape) results_df.head() # In[4]: # set this variable to filter plot to certain cancer types # if not included, plot all 33 of them # FILTER_CANCER_TYPES = ['LUAD', 'LUSC', 'THCA'] FILTER_CANCER_TYPES = None filtered_df = results_df[(results_df.signal == 'signal') & (results_df.data_type == 'test')].copy() if FILTER_CANCER_TYPES is not None: filtered_df = filtered_df[( filtered_df.cancer_type.isin(FILTER_CANCER_TYPES))] else: