def main_cosmic(options):
    """Main function used to process COSMIC data.

    Reads every row of the ``mutations`` table from the sqlite database
    named in the '2020plus' db config, hands the resulting data frame to
    ``futils.generate_features`` and writes the returned features to a
    tab-delimited text file with ``gene`` as the first column.

    Parameters
    ----------
    options : dict
        Run options. ``options['output']`` is the destination path; when it
        is falsy, ``_utils.save_dir + in_opts['gene_features']`` (from the
        'classifier' input config) is used instead. The whole dict is also
        forwarded to ``futils.generate_features``.

    Raises
    ------
    ValueError
        If the generated features have no ``gene`` column.
    """
    # get configs
    in_opts = _utils.get_input_config('classifier')
    db_cfg = _utils.get_db_config('2020plus')

    # get mutations
    sql = ("SELECT Gene, Protein_Change as AminoAcid, "
           "       DNA_Change as Nucleotide, "
           "       Variant_Classification, "
           "       Tumor_Sample, Tumor_Type "
           "FROM mutations")
    conn = sqlite3.connect(db_cfg['db'])
    try:
        # read_sql is the supported replacement for the deprecated frame_query
        mut_df = psql.read_sql(sql, con=conn)
    finally:
        # release the database handle even when the query fails
        conn.close()

    # get features for classification
    all_features = futils.generate_features(mut_df, options)

    # make the gene name the first column
    cols = all_features.columns.tolist()
    gene_pos = cols.index('gene')
    new_order = ['gene'] + cols[:gene_pos] + cols[gene_pos + 1:]
    all_features = all_features[new_order]

    # save features to text file; an explicit output path wins over the config
    out_path = options['output'] or (_utils.save_dir + in_opts['gene_features'])
    all_features.to_csv(out_path, sep='\t', index=False)
def main(cli_opts):
    """Train the R random forest on the gene feature table and save it.

    Parameters
    ----------
    cli_opts : dict
        Command line options. Keys read here: 'min_count', 'features',
        'other_ratio', 'driver_rate', 'ntrees', 'random_seed', 'cv' and
        'output'. When 'features' is falsy the feature table path is
        ``_utils.save_dir + in_opts['gene_feature']`` from the
        'classifier' input config. 'cv' selects ``train_cv``/``save_cv``
        over ``train``/``save``.
    """
    cfg_opts = _utils.get_output_config('classifier')
    in_opts = _utils.get_input_config('classifier')
    minimum_ct = cli_opts['min_count']

    # path to the features used for classification: CLI value wins,
    # otherwise fall back to the configured default
    feature_path = cli_opts['features']
    if not feature_path:
        feature_path = _utils.save_dir + in_opts['gene_feature']

    # read in features
    features = pd.read_csv(feature_path, sep='\t', index_col=0)

    logger.info("Training R's Random forest . . .")
    forest = RRandomForest(
        features,
        other_sample_ratio=cli_opts['other_ratio'],
        driver_sample=cli_opts['driver_rate'],
        ntrees=cli_opts['ntrees'],
        seed=cli_opts['random_seed'],
    )

    # train on entire data
    fit = forest.train_cv if cli_opts['cv'] else forest.train
    fit()
    logger.info('Finished training.')

    logger.info('Saving classifier to . . .')
    store = forest.clf.save_cv if cli_opts['cv'] else forest.clf.save
    store(cli_opts['output'])
    logger.info('Finished saving classifier.')
def main_cosmic(options):
    """Main function used to process COSMIC data.

    Loads the ``mutations`` table of the sqlite database configured under
    '2020plus', generates features with ``futils.generate_features`` and
    saves them as a tab-delimited file whose first column is ``gene``.

    Parameters
    ----------
    options : dict
        Run options, passed through to ``futils.generate_features``.
        ``options['output']`` gives the output path; if it is falsy the
        path ``_utils.save_dir + in_opts['gene_features']`` from the
        'classifier' input config is used.

    Raises
    ------
    ValueError
        If the generated features have no ``gene`` column.
    """
    # get configs
    in_opts = _utils.get_input_config('classifier')
    db_cfg = _utils.get_db_config('2020plus')

    # get mutations
    sql = ("SELECT Gene, Protein_Change as AminoAcid, "
           "       DNA_Change as Nucleotide, "
           "       Variant_Classification, "
           "       Tumor_Sample, Tumor_Type "
           "FROM mutations")
    conn = sqlite3.connect(db_cfg['db'])
    try:
        # read_sql supersedes the deprecated frame_query
        mut_df = psql.read_sql(sql, con=conn)
    finally:
        # always close the connection, including on a failed query
        conn.close()

    # get features for classification
    all_features = futils.generate_features(mut_df, options)

    # move 'gene' to the front, keeping the other columns in their order
    cols = all_features.columns.tolist()
    gene_ix = cols.index('gene')
    new_order = ['gene'] + cols[:gene_ix] + cols[gene_ix + 1:]
    all_features = all_features[new_order]

    # save features to text file
    if options['output']:
        out_path = options['output']
    else:
        out_path = _utils.save_dir + in_opts['gene_features']
    all_features.to_csv(out_path, sep='\t', index=False)
def main(hypermutator_count, mut_path, db_path, no_cosmic_flag, opts):
    """Concatenates all the mutation data from tab delimited files in
    the cosmic directory. Next, saves the results to a sqlite db.

    Parameters
    ----------
    hypermutator_count : int
        remove samples with too many mutations
    mut_path : str
        Either path to directory containing contents of COSMIC's
        genes.tgz file or decompressed CosmicMutantExport.tsv.
        If empty string, just use path from config file.
    db_path : str
        path to save sqlite database. If string is empty,
        use path from config.
    no_cosmic_flag : bool
        indicates not to use cosmic mutations
    opts : dict
        extra options; the 'only_genome_wide' and 'use_unknown_status'
        entries are forwarded to ``save_db``.

    Raises
    ------
    ValueError
        If COSMIC data is requested but the resolved path is neither a
        directory nor a file.
    """
    # get input/output configurations
    in_opts = _utils.get_input_config('input')
    cosmic_path = in_opts['cosmic_path']
    out_opts = _utils.get_output_config('gene_tsv')
    out_path = out_opts['gene_tsv']
    db_opts = _utils.get_db_config('2020plus')

    # check if user specifies non standard db path
    out_db = db_path if db_path else db_opts['db']

    if no_cosmic_flag:
        # create an empty table if cosmic not wanted
        create_empty_cosmic_mutation_table(out_db)
        return

    # save info into a txt file and sqlite3 database
    cosmic_path = mut_path if mut_path else cosmic_path
    if os.path.isdir(cosmic_path):
        # concatenate all gene files; the database is built from the result
        concatenate_genes(out_path, cosmic_path)
        mutation_file, is_genes_tgz = out_path, True
    elif os.path.isfile(cosmic_path):
        mutation_file, is_genes_tgz = cosmic_path, False
    else:
        raise ValueError('Please specify a valid path to COSMIC data')

    # save database
    save_db(hypermutator_count, mutation_file, out_db,
            is_genes_tgz=is_genes_tgz,
            only_genome_wide=opts['only_genome_wide'],
            use_unknown_status=opts['use_unknown_status'])
def main(db_path):
    """Assemble the gene_features table and save it to the database.

    Gene lengths come from ``recursive_gene_length`` when the configured
    COSMIC path is a directory, otherwise from the 'Gene name' and
    'Gene CDS length' columns of the tab-delimited file at that path. The
    lengths are left-merged with the MutSigCV feature file and with
    ``data/biogrid_stats.txt`` on the ``gene`` column, then passed to
    ``save_db``.

    Parameters
    ----------
    db_path : str
        Path of the database to write. If falsy, the 'db' entry of the
        '2020plus' db config is used.
    """
    # get config files
    in_opts = _utils.get_input_config('input')
    db_opts = _utils.get_db_config('2020plus')

    # get absolute path for cosmic data
    cosmic_path = os.path.join(_utils.proj_dir, in_opts['cosmic_path'])

    # get data for gene_features table
    logger.info('Processing features for gene_features table ...')
    if os.path.isdir(cosmic_path):
        gene_length = recursive_gene_length(in_opts['fasta_dir'])
        # build the columns from the pairs directly so an empty mapping
        # yields an empty frame instead of failing to unpack
        pairs = list(gene_length.items())
        gene_length_df = pd.DataFrame({
            'gene': [gene for gene, _ in pairs],
            'gene length': [length for _, length in pairs],
        })
    else:
        gene_length_df = pd.read_csv(cosmic_path, sep='\t')
        gene_length_df = gene_length_df[['Gene name', 'Gene CDS length']]
        gene_length_df = gene_length_df.rename(columns={
            'Gene name': 'gene',
            'Gene CDS length': 'gene length',
        })
        # `subset` is the supported keyword; the old `cols` spelling was
        # removed from pandas
        gene_length_df = gene_length_df.drop_duplicates(subset=['gene'])

    # merge in data from mutsig and biogrid
    mutsigcv_feature_path = os.path.join(_utils.proj_dir,
                                         in_opts['mutsigcv_features'])
    df = pd.read_csv(mutsigcv_feature_path, sep='\t')
    df = pd.merge(gene_length_df, df, how='left', on='gene')

    # merge the data frames
    biogrid_path = os.path.join(_utils.proj_dir, 'data/biogrid_stats.txt')
    biogrid_df = pd.read_csv(biogrid_path, sep='\t')
    df = pd.merge(df, biogrid_df, how='left', on='gene')

    # path to database
    db_path = db_path if db_path else db_opts['db']

    logger.info('Finished processing features for gene_features table.')

    # save database
    save_db(df, db_path)
def main(db_path):
    """Build the gene_features table and store it with ``save_db``.

    If the configured COSMIC path is a directory, gene lengths are taken
    from ``recursive_gene_length(in_opts['fasta_dir'])``; otherwise the
    path is read as a tab-delimited file and its 'Gene name' /
    'Gene CDS length' columns are used, keeping one row per gene. The
    MutSigCV features and ``data/biogrid_stats.txt`` are then left-merged
    onto the gene lengths by ``gene``.

    Parameters
    ----------
    db_path : str
        Database path handed to ``save_db``. If falsy, the 'db' entry of
        the '2020plus' db config is used instead.
    """
    # get config files
    in_opts = _utils.get_input_config('input')
    db_opts = _utils.get_db_config('2020plus')

    # get absolute path for cosmic data
    cosmic_path = os.path.join(_utils.proj_dir, in_opts['cosmic_path'])

    # get data for gene_features table
    logger.info('Processing features for gene_features table ...')
    if os.path.isdir(cosmic_path):
        gene_length = recursive_gene_length(in_opts['fasta_dir'])
        # iterate the pairs once; unlike zip(*items) this also copes with
        # an empty mapping
        genes, lengths = [], []
        for gene, length in gene_length.items():
            genes.append(gene)
            lengths.append(length)
        gene_length_df = pd.DataFrame({'gene': genes, 'gene length': lengths})
    else:
        gene_length_df = pd.read_csv(cosmic_path, sep='\t')
        gene_length_df = gene_length_df[['Gene name', 'Gene CDS length']]
        gene_length_df = gene_length_df.rename(columns={'Gene name': 'gene',
                                                        'Gene CDS length': 'gene length'})
        # the `cols` keyword was removed from pandas; `subset` replaces it
        gene_length_df = gene_length_df.drop_duplicates(subset=['gene'])

    # merge in data from mutsig and biogrid
    mutsigcv_feature_path = os.path.join(_utils.proj_dir, in_opts['mutsigcv_features'])
    df = pd.read_csv(mutsigcv_feature_path, sep='\t')
    df = pd.merge(gene_length_df, df, how='left', on='gene')

    # merge the data frames
    biogrid_path = os.path.join(_utils.proj_dir, 'data/biogrid_stats.txt')
    biogrid_df = pd.read_csv(biogrid_path, sep='\t')
    df = pd.merge(df, biogrid_df, how='left', on='gene')

    # path to database
    db_path = db_path if db_path else db_opts['db']

    logger.info('Finished processing features for gene_features table.')

    # save database
    save_db(df, db_path)
def main(opts):
    """Build the gene feature matrix and write it as a tab-delimited file.

    Starts from ``futils.process_features`` applied to the summary file,
    then left-merges on ``Gene``: the TSG test p-value, the oncogene test
    p-values, the covariate columns and (unless disabled) the BioGRID
    features. Remaining missing values are filled with the column mean and
    the ``Gene`` column is renamed to ``gene`` before saving.

    Parameters
    ----------
    opts : dict
        Options. Keys read here:

        - 'summary', 'tsg_test', 'og_test': input file paths
        - 'covariates': covariate file; if falsy the 'mutsigcv_features'
          path of the 'input' config (relative to ``_utils.proj_dir``)
        - 'biogrid': BioGRID feature file; the string "no" (any case)
          skips BioGRID features, a falsy value uses the configured
          'biogrid_features' path
        - 'permute_biogrid': if truthy, shuffle the BioGRID feature rows
        - 'random_seed': seed for that shuffle
        - 'output': destination path
    """
    # read in config file
    in_opts = _utils.get_input_config('input')

    # read in prob 20/20 files
    count_df = pd.read_csv(opts['summary'], sep='\t')
    tsg_test_df = pd.read_csv(opts['tsg_test'], sep='\t')
    og_test_df = pd.read_csv(opts['og_test'], sep='\t')
    og_test_df = og_test_df.rename(columns={'gene': 'Gene'})
    tsg_test_df = tsg_test_df.rename(columns={'gene': 'Gene'})

    # make feature matrix
    feature_df = futils.process_features(count_df)
    tsg_test_cols = ['Gene', 'inactivating p-value']
    feature_df = pd.merge(feature_df, tsg_test_df[tsg_test_cols],
                          how='left', on='Gene')
    og_test_cols = ['Gene', 'entropy p-value', 'vest p-value',
                    'combined p-value']
    feature_df = pd.merge(feature_df, og_test_df[og_test_cols],
                          how='left', on='Gene')

    # add covariate feature columns
    if opts['covariates']:
        covar_file = opts['covariates']
    else:
        covar_file = os.path.join(_utils.proj_dir,
                                  in_opts['mutsigcv_features'])
    covar_df = pd.read_csv(covar_file, sep='\t')
    covar_cols = ['gene', 'expression_CCLE', 'replication_time',
                  'HiC_compartment']
    covar_df = covar_df[covar_cols].rename(columns={'gene': 'Gene'})
    feature_df = pd.merge(feature_df, covar_df, how='left', on='Gene')

    # add biogrid features if present
    if str(opts['biogrid']).lower() != "no":
        # set biogrid features from config path if not set by user
        if opts['biogrid']:
            biogrid_file = opts['biogrid']
        else:
            biogrid_file = os.path.join(_utils.proj_dir,
                                        in_opts['biogrid_features'])

        # read in biogrid data
        biogrid_df = pd.read_csv(biogrid_file, sep='\t')
        biogrid_df = biogrid_df.rename(columns={'gene': 'Gene'})

        # permute feature if toggled
        if opts['permute_biogrid']:
            prng = np.random.RandomState(opts['random_seed'])
            bg_feats = ['gene_degree', 'gene_betweeness']
            permute_order = prng.choice(len(biogrid_df),
                                        size=len(biogrid_df),
                                        replace=False)
            # .values drops the index so rows are reassigned by position
            biogrid_df.loc[:, bg_feats] = biogrid_df[bg_feats].loc[permute_order].values

        # merge in biogrid features; genes without an entry get zeros
        feature_df = pd.merge(feature_df, biogrid_df, how='left', on='Gene')
        feature_df['gene_degree'] = feature_df['gene_degree'].fillna(0)
        feature_df['gene_betweeness'] = feature_df['gene_betweeness'].fillna(0)

    # fill na values
    rename_dict = {'Gene': 'gene'}
    feature_df = feature_df.rename(columns=rename_dict)
    # numeric_only=True: the frame also holds the string gene column, which
    # current pandas refuses to average
    feature_df = feature_df.fillna(feature_df.mean(numeric_only=True))

    # setup output cols reflecting feature selection
    feature_df.to_csv(opts['output'], sep='\t', index=False)
def main(opts):
    """Create the gene feature matrix and save it to ``opts['output']``.

    The matrix produced by ``futils.process_features`` from the summary
    file is left-merged on ``Gene`` with the TSG and oncogene test
    p-values, the covariates and, unless ``opts['biogrid']`` is the string
    "no" (any case), the BioGRID features. Missing BioGRID values become 0,
    all other missing values are replaced by the column mean, and the key
    column is written out as ``gene``.

    Parameters
    ----------
    opts : dict
        Options. Keys read here: 'summary', 'tsg_test', 'og_test',
        'covariates', 'biogrid', 'permute_biogrid', 'random_seed' and
        'output'. A falsy 'covariates' or 'biogrid' falls back to the
        'mutsigcv_features' / 'biogrid_features' path of the 'input'
        config, resolved against ``_utils.proj_dir``. 'random_seed' is only
        used when 'permute_biogrid' is truthy.
    """
    # read in config file
    in_opts = _utils.get_input_config('input')

    # read in prob 20/20 files
    count_df = pd.read_csv(opts['summary'], sep='\t')
    tsg_test_df = pd.read_csv(opts['tsg_test'], sep='\t')
    og_test_df = pd.read_csv(opts['og_test'], sep='\t')
    og_test_df = og_test_df.rename(columns={'gene': 'Gene'})
    tsg_test_df = tsg_test_df.rename(columns={'gene': 'Gene'})

    # make feature matrix
    feature_df = futils.process_features(count_df)
    tsg_test_cols = ['Gene', 'inactivating p-value']
    feature_df = pd.merge(feature_df, tsg_test_df[tsg_test_cols],
                          how='left', on='Gene')
    og_test_cols = [
        'Gene', 'entropy p-value', 'vest p-value', 'combined p-value'
    ]
    feature_df = pd.merge(feature_df, og_test_df[og_test_cols],
                          how='left', on='Gene')

    # add covariate feature columns
    if opts['covariates']:
        covar_file = opts['covariates']
    else:
        covar_file = os.path.join(_utils.proj_dir,
                                  in_opts['mutsigcv_features'])
    covar_df = pd.read_csv(covar_file, sep='\t')
    covar_cols = [
        'gene',
        'expression_CCLE',
        'replication_time',
        'HiC_compartment',
    ]
    covar_df = covar_df[covar_cols].rename(columns={'gene': 'Gene'})
    feature_df = pd.merge(feature_df, covar_df, how='left', on='Gene')

    # add biogrid features if present
    if str(opts['biogrid']).lower() != "no":
        # set biogrid features from config path if not set by user
        if opts['biogrid']:
            biogrid_file = opts['biogrid']
        else:
            biogrid_file = os.path.join(_utils.proj_dir,
                                        in_opts['biogrid_features'])

        # read in biogrid data
        biogrid_df = pd.read_csv(biogrid_file, sep='\t')
        biogrid_df = biogrid_df.rename(columns={'gene': 'Gene'})

        # permute feature if toggled
        if opts['permute_biogrid']:
            prng = np.random.RandomState(opts['random_seed'])
            bg_feats = ['gene_degree', 'gene_betweeness']
            permute_order = prng.choice(len(biogrid_df),
                                        size=len(biogrid_df),
                                        replace=False)
            # assign the raw values so the shuffled rows are not re-aligned
            # back onto their original index
            biogrid_df.loc[:, bg_feats] = biogrid_df[bg_feats].loc[
                permute_order].values

        # merge in biogrid features
        feature_df = pd.merge(feature_df, biogrid_df, how='left', on='Gene')
        for bg_col in ('gene_degree', 'gene_betweeness'):
            feature_df[bg_col] = feature_df[bg_col].fillna(0)

    # fill na values
    rename_dict = {'Gene': 'gene'}
    feature_df = feature_df.rename(columns=rename_dict)
    # restrict the mean to numeric columns; averaging the string gene
    # column raises on current pandas
    column_means = feature_df.mean(numeric_only=True)
    feature_df = feature_df.fillna(column_means)

    # setup output cols reflecting feature selection
    feature_df.to_csv(opts['output'], sep='\t', index=False)
def _score_null_pvals(scores):
    """Return, for each distinct score, the fraction of scores >= it."""
    counts = scores.value_counts().sort_index(ascending=False)
    return counts.cumsum() / float(counts.sum())


def main(cli_opts):
    """Classify genes with the R random forest and write results and plots.

    The behaviour depends on ``cli_opts['trained_classifier']``:

    * truthy -- a saved classifier is loaded (``load_cv`` when
      ``cli_opts['cv']`` is truthy, else ``load``) and applied with
      ``trained_rand_forest_pred``. If ``cli_opts['simulated']`` is truthy,
      the resulting scores are turned into a null distribution of
      p-values that is written to ``cli_opts['null_distribution']``;
      otherwise the predictions are written to the configured result file
      and a QQ plot is attempted. The function then returns.
    * falsy -- k-fold validation is run, predictions are made with
      ``rand_forest_pred``, predicted/novel oncogene and TSG lists are
      written, and finally a best-effort block produces the plots, the
      dummy-classifier comparison and the performance metrics file.

    Parameters
    ----------
    cli_opts : dict
        Command line options. Keys read here: 'min_count', 'features',
        'simulated', 'null_distribution', 'trained_classifier',
        'other_ratio', 'driver_rate', 'ntrees', 'random_seed' and 'cv'.
    """
    cfg_opts = _utils.get_output_config('classifier')
    in_opts = _utils.get_input_config('classifier')
    minimum_ct = cli_opts['min_count']

    # get path to features used for classification
    if cli_opts['features']:
        feature_path = cli_opts['features']
    else:
        feature_path = _utils.save_dir + in_opts['gene_feature']

    # read in null distribution p-values
    if not cli_opts['simulated'] and cli_opts['null_distribution']:
        null_pvals = pd.read_csv(cli_opts['null_distribution'],
                                 sep='\t', index_col=0)
    else:
        null_pvals = None

    # use trained classifier if provided
    if cli_opts['trained_classifier']:
        # read in features
        df = pd.read_csv(feature_path, sep='\t', index_col=0)

        logger.info('Running Random forest . . .')

        # initialize R's random forest
        rrclf = RRandomForest(df,
                              other_sample_ratio=cli_opts['other_ratio'],
                              driver_sample=cli_opts['driver_rate'],
                              ntrees=cli_opts['ntrees'],
                              seed=cli_opts['random_seed'])

        # load classifier depending on whether it uses CV
        is_cv = cli_opts['cv']
        if is_cv:
            rrclf.clf.load_cv(cli_opts['trained_classifier'])
        else:
            rrclf.clf.load(cli_opts['trained_classifier'])

        if cli_opts['simulated']:
            # do classification
            result_df = trained_rand_forest_pred(rrclf, df, None,
                                                 null_pvals, is_cv)

            # empirical p-values for each kind of score
            driver_score_pvals = _score_null_pvals(result_df['driver score'])
            onco_score_pvals = _score_null_pvals(result_df['oncogene score'])
            tsg_score_pvals = _score_null_pvals(result_df['tsg score'])

            # construct null p-value score distribution
            score_ix = (set(driver_score_pvals.index)
                        | set(onco_score_pvals.index)
                        | set(tsg_score_pvals.index))
            score_pvals = pd.DataFrame(index=list(score_ix))
            score_pvals['oncogene p-value'] = onco_score_pvals
            score_pvals['tsg p-value'] = tsg_score_pvals
            score_pvals['driver p-value'] = driver_score_pvals
            score_pvals = score_pvals.sort_index(ascending=False)

            score_pvals.to_csv(cli_opts['null_distribution'], sep='\t',
                               index_label='score')
        else:
            # do classification
            pred_results_path = (_utils.clf_result_dir
                                 + cfg_opts['rrand_forest_pred'])
            logger.info('Saving results to {0}'.format(pred_results_path))
            result_df = trained_rand_forest_pred(rrclf, df, pred_results_path,
                                                 null_pvals, is_cv)
            result_df.to_csv(pred_results_path, sep='\t')

            # create qq plot (best effort; plotting may be unavailable)
            try:
                qq_plot_path = _utils.clf_plot_dir + cfg_opts['qq_plot']
                plot_data.create_qqplots(result_df, qq_plot_path)
            except Exception as err:
                logger.warning('Skipping QQ plot: {0}'.format(err))

        logger.info('Finished classification.')
        return

    df = pd.read_csv(feature_path, sep='\t', index_col=0)

    # R's random forest
    logger.info('Running Random forest . . .')

    # initialize R's random forest
    rrclf = RRandomForest(df,
                          other_sample_ratio=cli_opts['other_ratio'],
                          driver_sample=cli_opts['driver_rate'],
                          ntrees=cli_opts['ntrees'],
                          seed=cli_opts['random_seed'])

    # analyze classification metrics
    rrclf.kfold_validation()
    rrclf_onco_tpr, rrclf_onco_fpr, rrclf_onco_mean_roc_auc = rrclf.get_onco_roc_metrics()
    rrclf_onco_precision, rrclf_onco_recall, rrclf_onco_mean_pr_auc = rrclf.get_onco_pr_metrics()
    rrclf_tsg_tpr, rrclf_tsg_fpr, rrclf_tsg_mean_roc_auc = rrclf.get_tsg_roc_metrics()
    rrclf_tsg_precision, rrclf_tsg_recall, rrclf_tsg_mean_pr_auc = rrclf.get_tsg_pr_metrics()
    rrclf_driver_precision, rrclf_driver_recall, rrclf_driver_mean_pr_auc = rrclf.get_driver_pr_metrics()
    rrclf_driver_tpr, rrclf_driver_fpr, rrclf_driver_mean_roc_auc = rrclf.get_driver_roc_metrics()

    # plot feature importance (best effort; skipped if plotting fails)
    try:
        mean_df = rrclf.mean_importance
        std_df = rrclf.std_importance
        feat_path = _utils.clf_plot_dir + cfg_opts['r_feature_importance_plot']
        plot_data.feature_importance_barplot(mean_df, std_df, feat_path)
    except Exception as err:
        logger.warning('Skipping feature importance plot: {0}'.format(err))

    # run predictions using R's random forest
    pred_results_path = _utils.clf_result_dir + cfg_opts['rrand_forest_pred']
    result_df = rand_forest_pred(rrclf, df, result_path=pred_results_path,
                                 null_dist=null_pvals)

    # select predicted oncogenes/tsgs: majority vote without a null
    # distribution, otherwise a q-value cutoff of 0.1
    if null_pvals is None:
        is_onco = result_df['majority vote class'] == _utils.onco_label
        is_tsg = result_df['majority vote class'] == _utils.tsg_label
        log_prefix = 'Majority vote Random forest'
    else:
        is_onco = result_df['oncogene q-value'] <= .1
        is_tsg = result_df['tsg q-value'] <= .1
        log_prefix = 'Random forest significance test'

    # "novel" genes are predicted genes not labelled so in the training list
    not_train_onco = result_df['training list class'] != _utils.onco_label
    not_train_tsg = result_df['training list class'] != _utils.tsg_label
    pred_onco = result_df[is_onco].index.to_series()
    novel_onco = result_df[is_onco & not_train_onco].index.to_series()
    pred_tsg = result_df[is_tsg].index.to_series()
    novel_tsg = result_df[is_tsg & not_train_tsg].index.to_series()

    # save a list of oncogenes/tsgs in separate files
    gene_lists = ((pred_onco, 'rrf_onco'),
                  (novel_onco, 'rrf_novel_onco'),
                  (pred_tsg, 'rrf_tsg'),
                  (novel_tsg, 'rrf_novel_tsg'))
    for genes, cfg_key in gene_lists:
        genes.to_csv(_utils.clf_result_dir + cfg_opts[cfg_key],
                     sep='\t', index=False, header=None)
    log_str = ('{0}: {1} ({2} novel) oncogenes, '
               '{3} ({4} novel) tsg'.format(log_prefix,
                                            len(pred_onco), len(novel_onco),
                                            len(pred_tsg), len(novel_tsg)))
    logger.info(log_str)

    # Best-effort section: a failure anywhere below skips the remainder.
    # NOTE(review): the dummy classifier run and the performance metrics
    # file also live inside this block, so they are lost when plotting
    # fails -- confirm that this is intended.
    try:
        # plot r random forest results
        plot_data.prob_scatter(result_df,
                               plot_path=_utils.clf_plot_dir + cfg_opts['rrand_forest_plot'],
                               title='Sub-sampled Random Forest Predictions')
        plot_data.prob_kde(result_df,
                           col_name='oncogene score',
                           save_path=_utils.clf_plot_dir + cfg_opts['onco_kde_rrand_forest'],
                           title='Distribution of Oncogene Scores (sub-sampled random forest)')
        plot_data.prob_kde(result_df,
                           col_name='tsg score',
                           save_path=_utils.clf_plot_dir + cfg_opts['tsg_kde_rrand_forest'],
                           title='Distribution of TSG Scores (sub-sampled random forest)')
        logger.info('Finished running sub-sampled Random Forest')

        # dummy classifier, predict most frequent
        logger.debug('Running Dummy Classifier. . .')
        dclf = DummyClf(df,
                        strategy='most_frequent',
                        min_ct=minimum_ct,
                        weight=False)
        dclf.kfold_validation()
        dclf_onco_tpr, _, dclf_onco_mean_roc_auc = dclf.get_onco_roc_metrics()
        _, _, dclf_onco_mean_pr_auc = dclf.get_onco_pr_metrics()
        dclf_tsg_tpr, _, dclf_tsg_mean_roc_auc = dclf.get_tsg_roc_metrics()
        _, _, dclf_tsg_mean_pr_auc = dclf.get_tsg_pr_metrics()
        dclf_driver_tpr, _, dclf_driver_mean_roc_auc = dclf.get_driver_roc_metrics()
        logger.debug('Finished dummy classifier.')

        # plot oncogene roc figure
        forest_str = '20/20+ Classifier (AUC = %0.3f)' % rrclf_onco_mean_roc_auc
        dummy_str = 'dummy (AUC = %0.3f)' % dclf_onco_mean_roc_auc
        roc_df = pd.DataFrame({forest_str: np.mean(rrclf_onco_tpr, axis=0),
                               dummy_str: np.mean(dclf_onco_tpr, axis=0)},
                              index=rrclf_onco_fpr)
        line_style = {dummy_str: '--', forest_str: '-'}
        save_path = _utils.clf_plot_dir + cfg_opts['roc_plot_oncogene']
        plot_data.receiver_operator_curve(roc_df, save_path, line_style)

        # plot tsg roc figure
        forest_str = '20/20+ Classifier (AUC = %0.3f)' % rrclf_tsg_mean_roc_auc
        dummy_str = 'dummy (AUC = %0.3f)' % dclf_tsg_mean_roc_auc
        roc_df = pd.DataFrame({forest_str: np.mean(rrclf_tsg_tpr, axis=0),
                               dummy_str: np.mean(dclf_tsg_tpr, axis=0)},
                              index=rrclf_tsg_fpr)
        line_style = {dummy_str: '--', forest_str: '-'}
        save_path = _utils.clf_plot_dir + cfg_opts['roc_plot_tsg']
        plot_data.receiver_operator_curve(roc_df, save_path, line_style)

        # plot driver roc figure
        forest_str = '20/20+ Classifier (AUC = %0.3f)' % rrclf_driver_mean_roc_auc
        dummy_str = 'dummy (AUC = %0.3f)' % dclf_driver_mean_roc_auc
        roc_df = pd.DataFrame({forest_str: np.mean(rrclf_driver_tpr, axis=0),
                               dummy_str: np.mean(dclf_driver_tpr, axis=0)},
                              index=rrclf_driver_fpr)
        line_style = {dummy_str: '--', forest_str: '-'}
        save_path = _utils.clf_plot_dir + cfg_opts['roc_plot_driver']
        plot_data.receiver_operator_curve(roc_df, save_path, line_style)

        # plot oncogene pr figure (only the random forest curve is in the
        # data frame; the dummy label appears in the line styles only)
        forest_str = '20/20+ Classifier (AUC = %0.3f)' % rrclf_onco_mean_pr_auc
        dummy_str = 'dummy (AUC = %0.3f)' % dclf_onco_mean_pr_auc
        pr_df = pd.DataFrame({forest_str: np.mean(rrclf_onco_precision, axis=0)},
                             index=rrclf_onco_recall)
        line_style = {dummy_str: '--', forest_str: '-'}
        save_path = _utils.clf_plot_dir + cfg_opts['pr_plot_oncogene']
        plot_data.precision_recall_curve(pr_df, save_path, line_style,
                                         title='Oncogene Precision-Recall Curve')

        # plot tsg pr figure
        forest_str = '20/20+ Classifier (AUC = %0.3f)' % rrclf_tsg_mean_pr_auc
        dummy_str = 'dummy (AUC = %0.3f)' % dclf_tsg_mean_pr_auc
        pr_df = pd.DataFrame({forest_str: np.mean(rrclf_tsg_precision, axis=0)},
                             index=rrclf_tsg_recall)
        line_style = {dummy_str: '--', forest_str: '-'}
        save_path = _utils.clf_plot_dir + cfg_opts['pr_plot_tsg']
        plot_data.precision_recall_curve(pr_df, save_path, line_style,
                                         title='TSG Precision-Recall Curve')

        # plot driver gene pr figure
        # NOTE(review): no driver PR metrics are fetched for the dummy
        # classifier, so dummy_str still holds the TSG label here.
        forest_str = '20/20+ Classifier (AUC = %0.3f)' % rrclf_driver_mean_pr_auc
        pr_df = pd.DataFrame({forest_str: np.mean(rrclf_driver_precision, axis=0)},
                             index=rrclf_driver_recall)
        line_style = {dummy_str: '--', forest_str: '-'}
        save_path = _utils.clf_plot_dir + cfg_opts['pr_plot_driver']
        plot_data.precision_recall_curve(pr_df, save_path, line_style,
                                         title='Driver Precision-Recall Curve')

        # save performance metrics of ROC and PR AUC
        save_path = _utils.clf_result_dir + cfg_opts['performance']
        logger.info('Saving performance metrics ({0}) . . .'.format(save_path))
        metrics = [['TSG', rrclf_tsg_mean_roc_auc, rrclf_tsg_mean_pr_auc],
                   ['OG', rrclf_onco_mean_roc_auc, rrclf_onco_mean_pr_auc],
                   ['Driver', rrclf_driver_mean_roc_auc, rrclf_driver_mean_pr_auc]]
        perf_df = pd.DataFrame(metrics, columns=['Type', 'ROC AUC', 'PR AUC'])
        perf_df.to_csv(save_path, sep='\t', index=False)

        # make qq plot
        qq_plot_path = _utils.clf_plot_dir + cfg_opts['qq_plot']
        plot_data.create_qqplots(result_df, qq_plot_path)
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C and SystemExit still
        # propagate; report what was skipped instead of hiding it
        logger.warning('Skipping remaining plots and performance '
                       'metrics: {0}'.format(err))