Exemplo n.º 1
0
def main_cosmic(options):
    """Generate gene features from the mutations table and save them.

    Reads the ``mutations`` table of the sqlite database named by the
    '2020plus' database config, passes the resulting data frame to
    ``futils.generate_features`` and writes the returned features to a
    tab-delimited file with the ``gene`` column placed first.

    Parameters
    ----------
    options : dict
        Command line options. ``options['output']`` is the output path; when
        it is falsy the path is ``_utils.save_dir`` concatenated with the
        'gene_features' entry of the classifier input config. The whole
        dict is also forwarded to ``futils.generate_features``.
    """
    # get configs
    in_opts = _utils.get_input_config('classifier')
    db_cfg = _utils.get_db_config('2020plus')

    # get mutations
    sql = ("SELECT Gene, Protein_Change as AminoAcid, "
           "       DNA_Change as Nucleotide, "
           "       Variant_Classification, "
           "       Tumor_Sample, Tumor_Type "
           "FROM mutations")
    conn = sqlite3.connect(db_cfg['db'])
    try:
        mut_df = psql.frame_query(sql, con=conn)
    finally:
        # release the database handle even if the query raises
        conn.close()

    # get features for classification
    all_features = futils.generate_features(mut_df, options)

    # make the gene name the first column
    cols = all_features.columns.tolist()
    gene_ix = cols.index('gene')
    new_order = ['gene'] + cols[:gene_ix] + cols[gene_ix + 1:]
    all_features = all_features[new_order]

    # save features to text file; an explicit output path takes precedence
    if options['output']:
        out_path = options['output']
    else:
        out_path = _utils.save_dir + in_opts['gene_features']
    all_features.to_csv(out_path, sep='\t', index=False)
Exemplo n.º 2
0
def main(cli_opts):
    """Train the R random forest on a gene feature table and save it.

    Parameters
    ----------
    cli_opts : dict
        Command line options. Keys read here: 'min_count', 'features',
        'other_ratio', 'driver_rate', 'ntrees', 'random_seed', 'cv' and
        'output'. When 'features' is falsy the feature file path is
        ``_utils.save_dir`` plus the 'gene_feature' entry of the classifier
        input config. When 'cv' is truthy the cross-validation variants of
        train/save are used.
    """
    cfg_opts = _utils.get_output_config('classifier')
    in_opts = _utils.get_input_config('classifier')
    minimum_ct = cli_opts['min_count']

    # a user supplied feature file wins over the configured default
    feature_path = cli_opts['features']
    if not feature_path:
        feature_path = _utils.save_dir + in_opts['gene_feature']

    # load the feature table, first column becomes the index
    feature_df = pd.read_csv(feature_path, index_col=0, sep='\t')

    logger.info('Training R\'s Random forest . . .')
    forest_kwargs = dict(other_sample_ratio=cli_opts['other_ratio'],
                         driver_sample=cli_opts['driver_rate'],
                         ntrees=cli_opts['ntrees'],
                         seed=cli_opts['random_seed'])
    rrclf = RRandomForest(feature_df, **forest_kwargs)

    # train on entire data, picking the routine matching the cv flag
    train = rrclf.train_cv if cli_opts['cv'] else rrclf.train
    train()
    logger.info('Finished training.')

    logger.info('Saving classifier to . . .')
    save = rrclf.clf.save_cv if cli_opts['cv'] else rrclf.clf.save
    save(cli_opts['output'])
    logger.info('Finished saving classifier.')
Exemplo n.º 3
0
def main_cosmic(options):
    """Build the gene feature table from the sqlite ``mutations`` table.

    The mutations are loaded from the database configured under '2020plus',
    turned into features by ``futils.generate_features`` and written as a
    tab-delimited text file whose first column is ``gene``.

    Parameters
    ----------
    options : dict
        Command line options, forwarded unchanged to
        ``futils.generate_features``. ``options['output']`` selects the
        output file; if falsy, the file is ``_utils.save_dir`` plus the
        'gene_features' value of the classifier input config.
    """
    # local import keeps this block self-contained
    from contextlib import closing

    # get configs
    in_opts = _utils.get_input_config('classifier')
    db_cfg = _utils.get_db_config('2020plus')

    # get mutations; closing() guarantees the connection is closed even when
    # the query fails (a sqlite3 connection used directly as a context
    # manager only manages the transaction, it does not close)
    sql = ("SELECT Gene, Protein_Change as AminoAcid, "
           "       DNA_Change as Nucleotide, "
           "       Variant_Classification, "
           "       Tumor_Sample, Tumor_Type "
           "FROM mutations")
    with closing(sqlite3.connect(db_cfg['db'])) as conn:
        mut_df = psql.frame_query(sql, con=conn)

    # get features for classification
    all_features = futils.generate_features(mut_df, options)

    # move the gene name to the front, keeping the other columns in order
    cols = all_features.columns.tolist()
    gene_pos = cols.index('gene')
    new_order = ['gene'] + cols[:gene_pos] + cols[gene_pos + 1:]
    all_features = all_features[new_order]

    # save features to text file
    out_path = options['output'] or (_utils.save_dir + in_opts['gene_features'])
    all_features.to_csv(out_path, sep='\t', index=False)
Exemplo n.º 4
0
def main(cli_opts):
    """Fit an ``RRandomForest`` on the feature file and persist the result.

    Parameters
    ----------
    cli_opts : dict
        Parsed command line options. 'features' optionally overrides the
        feature file path taken from the classifier input config;
        'other_ratio', 'driver_rate', 'ntrees' and 'random_seed' are passed
        to ``RRandomForest``; 'cv' switches between the plain and the
        cross-validation train/save calls; 'output' is where the classifier
        is saved. 'min_count' is read as well.
    """
    cfg_opts = _utils.get_output_config('classifier')
    in_opts = _utils.get_input_config('classifier')
    minimum_ct = cli_opts['min_count']

    # resolve the path to the features used for classification
    if not cli_opts['features']:
        feature_path = _utils.save_dir + in_opts['gene_feature']
    else:
        feature_path = cli_opts['features']

    # read in features
    features = pd.read_csv(feature_path, sep='\t', index_col=0)

    logger.info('Training R\'s Random forest . . .')
    other_ratio = cli_opts['other_ratio']
    driver_rate = cli_opts['driver_rate']
    num_trees = cli_opts['ntrees']
    seed = cli_opts['random_seed']
    rrclf = RRandomForest(features,
                          other_sample_ratio=other_ratio,
                          driver_sample=driver_rate,
                          ntrees=num_trees,
                          seed=seed)

    # train on entire data
    if not cli_opts['cv']:
        rrclf.train()
    else:
        rrclf.train_cv()
    logger.info('Finished training.')

    logger.info('Saving classifier to . . .')
    out_file = cli_opts['output']
    if not cli_opts['cv']:
        rrclf.clf.save(out_file)
    else:
        rrclf.clf.save_cv(out_file)
    logger.info('Finished saving classifier.')
Exemplo n.º 5
0
def main(hypermutator_count, mut_path, db_path, no_cosmic_flag, opts):
    """Concatenates all the mutation data from tab delimited files in
    the cosmic directory. Next, saves the results to a sqlite db.

    Parameters
    ----------
    hypermutator_count : int
        remove samples with too many mutations
    mut_path : str
        Either path to directory containing contents of COSMIC's
        genes.tgz file or decompressed CosmicMutantExport.tsv.
        If empty string, just use path from config file.
    db_path : str
        path to save sqlite database. If string is empty,
        use path from config.
    no_cosmic_flag : bool
        indicates not to use cosmic mutations
    opts : dict
        additional options; the 'only_genome_wide' and
        'use_unknown_status' entries are forwarded to ``save_db``.

    Raises
    ------
    ValueError
        if COSMIC data is requested but the resolved path is neither an
        existing directory nor an existing file.
    """
    # get input/output configurations
    in_opts = _utils.get_input_config('input')
    cosmic_path = in_opts['cosmic_path']
    out_opts = _utils.get_output_config('gene_tsv')
    out_path = out_opts['gene_tsv']
    db_opts = _utils.get_db_config('2020plus')

    # check if user specifies non standard db path
    out_db = db_path if db_path else db_opts['db']

    if no_cosmic_flag:
        # create an empty table if cosmic not wanted
        create_empty_cosmic_mutation_table(out_db)
        return

    # save info into a txt file and sqlite3 database
    cosmic_path = mut_path if mut_path else cosmic_path
    if os.path.isdir(cosmic_path):
        # directory input: concatenate all gene files first, then load the
        # concatenated file
        concatenate_genes(out_path, cosmic_path)
        mutation_file, is_genes_tgz = out_path, True
    elif os.path.isfile(cosmic_path):
        # single file input is loaded directly
        mutation_file, is_genes_tgz = cosmic_path, False
    else:
        raise ValueError('Please specify a valid path to COSMIC data')

    # save database
    save_db(hypermutator_count,
            mutation_file,
            out_db,
            is_genes_tgz=is_genes_tgz,
            only_genome_wide=opts['only_genome_wide'],
            use_unknown_status=opts['use_unknown_status'])
Exemplo n.º 6
0
def main(db_path):
    """Assemble the gene_features table and save it to the database.

    Gene lengths come either from ``recursive_gene_length`` (when the
    configured COSMIC path is a directory) or from the 'Gene name' and
    'Gene CDS length' columns of the COSMIC file. They are left-merged with
    the MutSigCV feature file and the BioGrid statistics file on ``gene``
    and handed to ``save_db``.

    Parameters
    ----------
    db_path : str
        Path of the database to save to. If falsy, the 'db' entry of the
        '2020plus' database config is used.
    """
    # get config files
    in_opts = _utils.get_input_config('input')
    db_opts = _utils.get_db_config('2020plus')

    # get absolute path for cosmic data
    cosmic_path = os.path.join(_utils.proj_dir, in_opts['cosmic_path'])

    # get data for gene_features table
    logger.info('Processing features for gene_features table ...')
    if os.path.isdir(cosmic_path):
        gene_length = recursive_gene_length(in_opts['fasta_dir'])
        # build from (gene, length) pairs so an empty mapping yields an
        # empty frame instead of failing while unpacking zip()
        gene_length_df = pd.DataFrame(list(gene_length.items()),
                                      columns=['gene', 'gene length'])
    else:
        gene_length_df = pd.read_csv(cosmic_path, sep='\t')
        gene_length_df = gene_length_df[['Gene name', 'Gene CDS length']]
        gene_length_df = gene_length_df.rename(columns={
            'Gene name': 'gene',
            'Gene CDS length': 'gene length'
        })
        # `subset` replaces the `cols` keyword, which pandas removed
        gene_length_df = gene_length_df.drop_duplicates(subset=['gene'])

    # merge in data from mutsig and biogrid
    mutsigcv_feature_path = os.path.join(_utils.proj_dir,
                                         in_opts['mutsigcv_features'])
    df = pd.read_csv(mutsigcv_feature_path, sep='\t')
    df = pd.merge(gene_length_df, df, how='left', on='gene')
    biogrid_path = os.path.join(_utils.proj_dir, 'data/biogrid_stats.txt')
    biogrid_df = pd.read_csv(biogrid_path, sep='\t')
    df = pd.merge(df, biogrid_df, how='left', on='gene')

    # path to database
    db_path = db_path if db_path else db_opts['db']

    logger.info('Finished processing features for gene_features table.')

    # save database
    save_db(df, db_path)
Exemplo n.º 7
0
def main(db_path):
    """Create the gene_features data frame and store it via ``save_db``.

    The gene length column is taken from ``recursive_gene_length`` if the
    configured COSMIC path is a directory, otherwise from the COSMIC file
    itself ('Gene name' / 'Gene CDS length' columns, one row per gene).
    MutSigCV features and BioGrid statistics are then left-merged onto it
    by ``gene``.

    Parameters
    ----------
    db_path : str
        Database path. A falsy value selects the 'db' entry of the
        '2020plus' database config.
    """
    # get config files
    in_opts = _utils.get_input_config('input')
    db_opts = _utils.get_db_config('2020plus')

    # get absolute path for cosmic data
    cosmic_path = os.path.join(_utils.proj_dir, in_opts['cosmic_path'])

    # get data for gene_features table
    logger.info('Processing features for gene_features table ...')
    if os.path.isdir(cosmic_path):
        gene_length = recursive_gene_length(in_opts['fasta_dir'])
        # a list of (gene, length) rows also works for an empty mapping,
        # where unpacking zip(*items) would raise
        length_rows = list(gene_length.items())
        gene_length_df = pd.DataFrame(length_rows,
                                      columns=['gene', 'gene length'])
    else:
        column_names = {'Gene name': 'gene',
                        'Gene CDS length': 'gene length'}
        gene_length_df = pd.read_csv(cosmic_path, sep='\t')
        gene_length_df = gene_length_df[['Gene name', 'Gene CDS length']]
        gene_length_df = gene_length_df.rename(columns=column_names)
        # pandas removed the `cols` keyword of drop_duplicates; `subset`
        # is its replacement
        gene_length_df.drop_duplicates(subset=['gene'], inplace=True)

    # merge in data from mutsig and biogrid
    mutsigcv_feature_path = os.path.join(_utils.proj_dir, in_opts['mutsigcv_features'])
    df = pd.read_csv(mutsigcv_feature_path, sep='\t')
    df = pd.merge(gene_length_df, df, how='left', on='gene')
    biogrid_path = os.path.join(_utils.proj_dir, 'data/biogrid_stats.txt')
    biogrid_df = pd.read_csv(biogrid_path, sep='\t')
    df = pd.merge(df, biogrid_df, how='left', on='gene')

    # path to database
    db_path = db_path if db_path else db_opts['db']

    logger.info('Finished processing features for gene_features table.')

    # save database
    save_db(df, db_path)
Exemplo n.º 8
0
def main(opts):
    """Build the gene feature matrix and write it as a tab-delimited file.

    Features from ``futils.process_features`` are left-merged on ``Gene``
    with the TSG and oncogene test p-values, the covariate columns and
    (unless disabled) the BioGrid features. Remaining missing values are
    replaced by the column mean before saving.

    Parameters
    ----------
    opts : dict
        Options. Keys read here:

        - 'summary', 'tsg_test', 'og_test': paths of tab-delimited inputs
        - 'covariates': covariate file; if falsy the 'mutsigcv_features'
          entry of the input config (relative to ``_utils.proj_dir``) is used
        - 'biogrid': BioGrid feature file; the string "no" (any case)
          disables BioGrid features, a falsy value selects the
          'biogrid_features' entry of the input config
        - 'permute_biogrid', 'random_seed': if 'permute_biogrid' is truthy
          the BioGrid feature rows are shuffled with the given seed
        - 'output': path of the resulting feature file
    """
    # read in config file
    in_opts = _utils.get_input_config('input')

    # read in prob 20/20 files
    count_df = pd.read_csv(opts['summary'], sep='\t')
    tsg_test_df = pd.read_csv(opts['tsg_test'], sep='\t')
    og_test_df = pd.read_csv(opts['og_test'], sep='\t')
    og_test_df = og_test_df.rename(columns={'gene': 'Gene'})
    tsg_test_df = tsg_test_df.rename(columns={'gene': 'Gene'})

    # make feature matrix
    feature_df = futils.process_features(count_df)
    tsg_test_cols = ['Gene', 'inactivating p-value']
    feature_df = pd.merge(feature_df, tsg_test_df[tsg_test_cols],
                          how='left', on='Gene')
    og_test_cols = ['Gene', 'entropy p-value', 'vest p-value', 'combined p-value']
    feature_df = pd.merge(feature_df, og_test_df[og_test_cols],
                          how='left', on='Gene')

    # add covariate feature columns
    if opts['covariates']:
        covar_file = opts['covariates']
    else:
        covar_file = os.path.join(_utils.proj_dir, in_opts['mutsigcv_features'])
    covar_df = pd.read_csv(covar_file, sep='\t')
    covar_cols = ['gene',
                  'expression_CCLE',
                  'replication_time',
                  'HiC_compartment',
                  ]
    covar_df = covar_df[covar_cols].rename(columns={'gene': 'Gene'})
    feature_df = pd.merge(feature_df, covar_df,
                          how='left', on='Gene')

    # add biogrid features if present
    if str(opts['biogrid']).lower() != "no":
        # set biogrid features from config path if not set by user
        if opts['biogrid']:
            biogrid_file = opts['biogrid']
        else:
            biogrid_file = os.path.join(_utils.proj_dir, in_opts['biogrid_features'])
        # read in biogrid data
        biogrid_df = pd.read_csv(biogrid_file, sep='\t')
        biogrid_df = biogrid_df.rename(columns={'gene': 'Gene'})

        # permute feature if toggled
        if opts['permute_biogrid']:
            prng = np.random.RandomState(opts['random_seed'])
            bg_feats = ['gene_degree', 'gene_betweeness']
            permute_order = prng.choice(len(biogrid_df),
                                        size=len(biogrid_df),
                                        replace=False)
            # permute_order holds row positions, so index by position
            # (.iloc) rather than by label
            biogrid_df.loc[:, bg_feats] = biogrid_df[bg_feats].iloc[permute_order].values

        # merge in biogrid features
        feature_df = pd.merge(feature_df, biogrid_df, how='left', on='Gene')
        feature_df['gene_degree'] = feature_df['gene_degree'].fillna(0)
        feature_df['gene_betweeness'] = feature_df['gene_betweeness'].fillna(0)

    # fill na values with the column mean; numeric_only is required because
    # the frame also holds the non-numeric gene column, for which recent
    # pandas versions raise instead of silently skipping it
    rename_dict = {'Gene': 'gene'}
    feature_df = feature_df.rename(columns=rename_dict)
    feature_df = feature_df.fillna(feature_df.mean(numeric_only=True))

    # save the feature matrix
    feature_df.to_csv(opts['output'], sep='\t', index=False)
Exemplo n.º 9
0
def main(opts):
    """Create the feature matrix used for classification and save it.

    Starting from ``futils.process_features`` applied to the summary file,
    the function left-merges (on ``Gene``) the TSG test p-value, the
    oncogene test p-values, three covariate columns and optionally the
    BioGrid features, fills remaining gaps with column means and writes the
    result to ``opts['output']`` as tab-delimited text.

    Parameters
    ----------
    opts : dict
        Options. 'summary', 'tsg_test' and 'og_test' are input file paths.
        'covariates' and 'biogrid' optionally override the paths taken from
        the input config ('mutsigcv_features' and 'biogrid_features');
        setting 'biogrid' to "no" (case-insensitive) skips BioGrid features
        altogether. When 'permute_biogrid' is truthy the BioGrid feature
        rows are permuted using 'random_seed'. 'output' is the output path.
    """
    # read in config file
    in_opts = _utils.get_input_config('input')

    # read in prob 20/20 files
    count_df = pd.read_csv(opts['summary'], sep='\t')
    tsg_test_df = pd.read_csv(opts['tsg_test'], sep='\t')
    og_test_df = pd.read_csv(opts['og_test'], sep='\t')
    og_test_df = og_test_df.rename(columns={'gene': 'Gene'})
    tsg_test_df = tsg_test_df.rename(columns={'gene': 'Gene'})

    # make feature matrix
    feature_df = futils.process_features(count_df)
    tsg_test_cols = ['Gene', 'inactivating p-value']
    feature_df = pd.merge(feature_df,
                          tsg_test_df[tsg_test_cols],
                          how='left',
                          on='Gene')
    og_test_cols = [
        'Gene', 'entropy p-value', 'vest p-value', 'combined p-value'
    ]
    feature_df = pd.merge(feature_df,
                          og_test_df[og_test_cols],
                          how='left',
                          on='Gene')

    # add covariate feature columns
    if opts['covariates']:
        covar_file = opts['covariates']
    else:
        covar_file = os.path.join(_utils.proj_dir,
                                  in_opts['mutsigcv_features'])
    covar_df = pd.read_csv(covar_file, sep='\t')
    covar_cols = [
        'gene',
        'expression_CCLE',
        'replication_time',
        'HiC_compartment',
    ]
    covar_df = covar_df[covar_cols].rename(columns={'gene': 'Gene'})
    feature_df = pd.merge(feature_df, covar_df, how='left', on='Gene')

    # add biogrid features if present
    if str(opts['biogrid']).lower() != "no":
        # set biogrid features from config path if not set by user
        if opts['biogrid']:
            biogrid_file = opts['biogrid']
        else:
            biogrid_file = os.path.join(_utils.proj_dir,
                                        in_opts['biogrid_features'])
        # read in biogrid data
        biogrid_df = pd.read_csv(biogrid_file, sep='\t')
        biogrid_df = biogrid_df.rename(columns={'gene': 'Gene'})

        # permute feature if toggled
        if opts['permute_biogrid']:
            prng = np.random.RandomState(opts['random_seed'])
            bg_feats = ['gene_degree', 'gene_betweeness']
            permute_order = prng.choice(len(biogrid_df),
                                        size=len(biogrid_df),
                                        replace=False)
            # the permutation consists of row positions, hence .iloc;
            # label-based .loc only worked because of the default index
            shuffled = biogrid_df[bg_feats].iloc[permute_order].values
            biogrid_df.loc[:, bg_feats] = shuffled

        # merge in biogrid features
        feature_df = pd.merge(feature_df, biogrid_df, how='left', on='Gene')
        feature_df['gene_degree'] = feature_df['gene_degree'].fillna(0)
        feature_df['gene_betweeness'] = feature_df['gene_betweeness'].fillna(0)

    # fill na values
    rename_dict = {'Gene': 'gene'}
    feature_df = feature_df.rename(columns=rename_dict)
    # restrict the mean to numeric columns: with the string gene column in
    # the frame, recent pandas versions raise a TypeError otherwise
    column_means = feature_df.mean(numeric_only=True)
    feature_df = feature_df.fillna(column_means)

    # write out the finished feature matrix
    feature_df.to_csv(opts['output'], sep='\t', index=False)
Exemplo n.º 10
0
def _null_score_pvals(scores):
    """Return empirical p-values for every distinct value in `scores`.

    The result is indexed by the distinct scores in descending order; each
    entry is the fraction of counted (non-missing) scores that are greater
    than or equal to that score.
    """
    score_cts = scores.value_counts().sort_index(ascending=False)
    return score_cts.cumsum() / float(score_cts.sum())


def _save_gene_list(genes, path):
    """Write a Series of gene names to `path`, one per line, no header."""
    genes.to_csv(path, sep='\t', index=False, header=None)


def _plot_roc(clf_tpr, clf_fpr, clf_auc, dummy_tpr, dummy_auc, save_path):
    """Plot the mean ROC curve of the classifier next to the dummy one."""
    clf_label = '20/20+ Classifier (AUC = %0.3f)' % clf_auc
    dummy_label = 'dummy (AUC = %0.3f)' % dummy_auc
    roc_df = pd.DataFrame({clf_label: np.mean(clf_tpr, axis=0),
                           dummy_label: np.mean(dummy_tpr, axis=0)},
                          index=clf_fpr)
    line_style = {dummy_label: '--', clf_label: '-'}
    plot_data.receiver_operator_curve(roc_df, save_path, line_style)


def _plot_pr(clf_precision, clf_recall, clf_auc, dummy_auc, save_path, title):
    """Plot the mean precision-recall curve of the classifier.

    Only the classifier curve is part of the plotted data frame; the dummy
    label is passed along in the line style mapping only.
    """
    clf_label = '20/20+ Classifier (AUC = %0.3f)' % clf_auc
    dummy_label = 'dummy (AUC = %0.3f)' % dummy_auc
    pr_df = pd.DataFrame({clf_label: np.mean(clf_precision, axis=0)},
                         index=clf_recall)
    line_style = {dummy_label: '--', clf_label: '-'}
    plot_data.precision_recall_curve(pr_df, save_path, line_style,
                                     title=title)


def main(cli_opts):
    """Run the random forest classification pipeline.

    Two modes exist, selected by ``cli_opts['trained_classifier']``:

    * truthy: load the given trained classifier and predict. With
      ``cli_opts['simulated']`` set, the predicted scores are turned into a
      null p-value distribution written to ``cli_opts['null_distribution']``;
      otherwise predictions are saved and a QQ plot is attempted.
    * falsy: run k-fold validation, save predictions, the predicted and
      novel oncogene/TSG lists and the ROC/PR AUC performance metrics, then
      attempt the diagnostic plots (including a dummy-classifier baseline).

    Plotting is best-effort: failures are logged as warnings and do not
    abort the run.

    Parameters
    ----------
    cli_opts : dict
        Command line options. Keys read here: 'min_count', 'features',
        'simulated', 'null_distribution', 'trained_classifier',
        'other_ratio', 'driver_rate', 'ntrees', 'random_seed' and 'cv'.
    """
    cfg_opts = _utils.get_output_config('classifier')
    in_opts = _utils.get_input_config('classifier')
    minimum_ct = cli_opts['min_count']

    # get path to features used for classification
    if cli_opts['features']:
        feature_path = cli_opts['features']
    else:
        feature_path = _utils.save_dir + in_opts['gene_feature']

    # read in null distribution p-values
    if not cli_opts['simulated'] and cli_opts['null_distribution']:
        null_pvals = pd.read_csv(cli_opts['null_distribution'], sep='\t',
                                 index_col=0)
    else:
        null_pvals = None

    # read in features
    df = pd.read_csv(feature_path, sep='\t', index_col=0)

    logger.info('Running Random forest . . .')

    # initialize R's random forest
    rrclf = RRandomForest(df,
                          other_sample_ratio=cli_opts['other_ratio'],
                          driver_sample=cli_opts['driver_rate'],
                          ntrees=cli_opts['ntrees'],
                          seed=cli_opts['random_seed'])

    # use trained classifier if provided
    if cli_opts['trained_classifier']:
        # load classifier depending on whether it uses CV
        is_cv = cli_opts['cv']
        if is_cv:
            rrclf.clf.load_cv(cli_opts['trained_classifier'])
        else:
            rrclf.clf.load(cli_opts['trained_classifier'])

        if cli_opts['simulated']:
            # do classification
            result_df = trained_rand_forest_pred(rrclf, df, None, null_pvals, is_cv)

            # empirical p-value of each observed score
            driver_score_pvals = _null_score_pvals(result_df['driver score'])
            onco_score_pvals = _null_score_pvals(result_df['oncogene score'])
            tsg_score_pvals = _null_score_pvals(result_df['tsg score'])

            # construct null p-value score distribution
            score_ix = (set(driver_score_pvals.index)
                        | set(onco_score_pvals.index)
                        | set(tsg_score_pvals.index))
            score_pvals = pd.DataFrame(index=list(score_ix))
            score_pvals['oncogene p-value'] = onco_score_pvals
            score_pvals['tsg p-value'] = tsg_score_pvals
            score_pvals['driver p-value'] = driver_score_pvals
            score_pvals = score_pvals.sort_index(ascending=False)

            score_pvals.to_csv(cli_opts['null_distribution'], sep='\t',
                               index_label='score')
        else:
            # do classification
            pred_results_path = _utils.clf_result_dir + cfg_opts['rrand_forest_pred']
            logger.info('Saving results to {0}'.format(pred_results_path))
            result_df = trained_rand_forest_pred(rrclf, df, pred_results_path,
                                                 null_pvals, is_cv)
            result_df.to_csv(pred_results_path, sep='\t')

            # create qq plot (best effort, e.g. plotting may be unavailable)
            try:
                qq_plot_path = _utils.clf_plot_dir + cfg_opts['qq_plot']
                plot_data.create_qqplots(result_df, qq_plot_path)
            except Exception:
                logger.warning('Skipping QQ plot.', exc_info=True)

        logger.info('Finished classification.')
        return

    # analyze classification metrics
    rrclf.kfold_validation()
    rrclf_onco_tpr, rrclf_onco_fpr, rrclf_onco_mean_roc_auc = rrclf.get_onco_roc_metrics()
    rrclf_onco_precision, rrclf_onco_recall, rrclf_onco_mean_pr_auc = rrclf.get_onco_pr_metrics()
    rrclf_tsg_tpr, rrclf_tsg_fpr, rrclf_tsg_mean_roc_auc = rrclf.get_tsg_roc_metrics()
    rrclf_tsg_precision, rrclf_tsg_recall, rrclf_tsg_mean_pr_auc = rrclf.get_tsg_pr_metrics()
    rrclf_driver_precision, rrclf_driver_recall, rrclf_driver_mean_pr_auc = rrclf.get_driver_pr_metrics()
    rrclf_driver_tpr, rrclf_driver_fpr, rrclf_driver_mean_roc_auc = rrclf.get_driver_roc_metrics()

    # plot feature importance (best effort, e.g. no matplotlib)
    try:
        mean_df = rrclf.mean_importance
        std_df = rrclf.std_importance
        feat_path = _utils.clf_plot_dir + cfg_opts['r_feature_importance_plot']
        plot_data.feature_importance_barplot(mean_df, std_df, feat_path)
    except Exception:
        logger.warning('Skipping feature importance plot.', exc_info=True)

    # run predictions using R's random forest
    pred_results_path = _utils.clf_result_dir + cfg_opts['rrand_forest_pred']
    result_df = rand_forest_pred(rrclf, df, result_path=pred_results_path,
                                 null_dist=null_pvals)

    # select predicted genes either by majority vote or, when a null
    # distribution is available, by q-value
    if null_pvals is None:
        is_onco = result_df['majority vote class'] == _utils.onco_label
        is_tsg = result_df['majority vote class'] == _utils.tsg_label
        log_prefix = 'Majority vote Random forest'
    else:
        is_onco = result_df['oncogene q-value'] <= .1
        is_tsg = result_df['tsg q-value'] <= .1
        log_prefix = 'Random forest significance test'
    # "novel" genes are predicted ones not labelled as such in training
    not_train_onco = result_df['training list class'] != _utils.onco_label
    not_train_tsg = result_df['training list class'] != _utils.tsg_label
    pred_onco = result_df[is_onco].index.to_series()
    novel_onco = result_df[is_onco & not_train_onco].index.to_series()
    pred_tsg = result_df[is_tsg].index.to_series()
    novel_tsg = result_df[is_tsg & not_train_tsg].index.to_series()

    # save a list of oncogenes/tsgs in separate files
    _save_gene_list(pred_onco, _utils.clf_result_dir + cfg_opts['rrf_onco'])
    _save_gene_list(novel_onco, _utils.clf_result_dir + cfg_opts['rrf_novel_onco'])
    _save_gene_list(pred_tsg, _utils.clf_result_dir + cfg_opts['rrf_tsg'])
    _save_gene_list(novel_tsg, _utils.clf_result_dir + cfg_opts['rrf_novel_tsg'])
    log_str = ('{0}: {1} ({2} novel) oncogenes, '
               '{3} ({4} novel) tsg'.format(log_prefix,
                                            len(pred_onco), len(novel_onco),
                                            len(pred_tsg), len(novel_tsg)))
    logger.info(log_str)

    # save performance metrics of ROC and PR AUC; done before plotting so a
    # plotting failure can no longer prevent the metrics from being written
    save_path = _utils.clf_result_dir + cfg_opts['performance']
    logger.info('Saving performance metrics ({0}) . . .'.format(save_path))
    metrics = [['TSG', rrclf_tsg_mean_roc_auc, rrclf_tsg_mean_pr_auc],
               ['OG', rrclf_onco_mean_roc_auc, rrclf_onco_mean_pr_auc],
               ['Driver', rrclf_driver_mean_roc_auc, rrclf_driver_mean_pr_auc]]
    perf_df = pd.DataFrame(metrics, columns=['Type', 'ROC AUC', 'PR AUC'])
    perf_df.to_csv(save_path, sep='\t', index=False)

    # plotting is best effort (e.g. no matplotlib)
    try:
        # plot r random forest results
        plot_data.prob_scatter(result_df,
                               plot_path=_utils.clf_plot_dir + cfg_opts['rrand_forest_plot'],
                               title='Sub-sampled Random Forest Predictions')
        plot_data.prob_kde(result_df,
                           col_name='oncogene score',
                           save_path=_utils.clf_plot_dir + cfg_opts['onco_kde_rrand_forest'],
                           title='Distribution of Oncogene Scores (sub-sampled random forest)')
        plot_data.prob_kde(result_df,
                           col_name='tsg score',
                           save_path=_utils.clf_plot_dir + cfg_opts['tsg_kde_rrand_forest'],
                           title='Distribution of TSG Scores (sub-sampled random forest)')
        logger.info('Finished running sub-sampled Random Forest')

        # dummy classifier, predict most frequent
        logger.debug('Running Dummy Classifier. . .')
        dclf = DummyClf(df,
                        strategy='most_frequent',
                        min_ct=minimum_ct,
                        weight=False)
        dclf.kfold_validation()
        dclf_onco_tpr, _, dclf_onco_mean_roc_auc = dclf.get_onco_roc_metrics()
        _, _, dclf_onco_mean_pr_auc = dclf.get_onco_pr_metrics()
        dclf_tsg_tpr, _, dclf_tsg_mean_roc_auc = dclf.get_tsg_roc_metrics()
        _, _, dclf_tsg_mean_pr_auc = dclf.get_tsg_pr_metrics()
        dclf_driver_tpr, _, dclf_driver_mean_roc_auc = dclf.get_driver_roc_metrics()
        logger.debug('Finished dummy classifier.')

        # plot oncogene / tsg / driver roc figures
        _plot_roc(rrclf_onco_tpr, rrclf_onco_fpr, rrclf_onco_mean_roc_auc,
                  dclf_onco_tpr, dclf_onco_mean_roc_auc,
                  _utils.clf_plot_dir + cfg_opts['roc_plot_oncogene'])
        _plot_roc(rrclf_tsg_tpr, rrclf_tsg_fpr, rrclf_tsg_mean_roc_auc,
                  dclf_tsg_tpr, dclf_tsg_mean_roc_auc,
                  _utils.clf_plot_dir + cfg_opts['roc_plot_tsg'])
        _plot_roc(rrclf_driver_tpr, rrclf_driver_fpr, rrclf_driver_mean_roc_auc,
                  dclf_driver_tpr, dclf_driver_mean_roc_auc,
                  _utils.clf_plot_dir + cfg_opts['roc_plot_driver'])

        # plot oncogene / tsg / driver pr figures
        _plot_pr(rrclf_onco_precision, rrclf_onco_recall,
                 rrclf_onco_mean_pr_auc, dclf_onco_mean_pr_auc,
                 _utils.clf_plot_dir + cfg_opts['pr_plot_oncogene'],
                 'Oncogene Precision-Recall Curve')
        _plot_pr(rrclf_tsg_precision, rrclf_tsg_recall,
                 rrclf_tsg_mean_pr_auc, dclf_tsg_mean_pr_auc,
                 _utils.clf_plot_dir + cfg_opts['pr_plot_tsg'],
                 'TSG Precision-Recall Curve')
        # NOTE(review): no dummy driver PR metrics are computed; as before,
        # the dummy label of the driver plot reuses the TSG PR AUC.
        _plot_pr(rrclf_driver_precision, rrclf_driver_recall,
                 rrclf_driver_mean_pr_auc, dclf_tsg_mean_pr_auc,
                 _utils.clf_plot_dir + cfg_opts['pr_plot_driver'],
                 'Driver Precision-Recall Curve')

        # make qq plot
        qq_plot_path = _utils.clf_plot_dir + cfg_opts['qq_plot']
        plot_data.create_qqplots(result_df, qq_plot_path)
    except Exception:
        logger.warning('Skipping remaining classifier plots.', exc_info=True)