Пример #1
0
def main_cosmic(options):
    """Build the gene feature table from COSMIC mutations and save it.

    All rows of the ``mutations`` table are read from the configured sqlite
    database, passed to ``futils.generate_features`` and the resulting
    features are written as a tab-delimited file whose first column is
    ``gene``.

    Parameters
    ----------
    options : dict
        command line options. ``options['output']`` is the path of the
        feature file; when it is falsy the path is built from
        ``_utils.save_dir`` and the 'classifier' input config. The whole
        dict is also forwarded to ``futils.generate_features``.

    Raises
    ------
    ValueError
        if the generated features do not contain a ``gene`` column
    """
    # get configs
    in_opts = _utils.get_input_config('classifier')
    db_cfg = _utils.get_db_config('2020plus')

    # get mutations
    sql = ("SELECT Gene, Protein_Change as AminoAcid, "
           "       DNA_Change as Nucleotide, "
           "       Variant_Classification, "
           "       Tumor_Sample, Tumor_Type "
           "FROM mutations")
    conn = sqlite3.connect(db_cfg['db'])
    try:
        # read_sql is the supported replacement of the removed frame_query
        mut_df = psql.read_sql(sql, con=conn)
    finally:
        # release the database handle even if the query fails
        conn.close()

    # get features for classification
    all_features = futils.generate_features(mut_df, options)

    # make the gene name the first column
    cols = all_features.columns.tolist()
    gene_ix = cols.index('gene')
    new_order = ['gene'] + cols[:gene_ix] + cols[gene_ix + 1:]
    all_features = all_features[new_order]

    # save features to text file; an explicit output path wins over the config
    out_path = options['output'] or (_utils.save_dir + in_opts['gene_features'])
    all_features.to_csv(out_path, sep='\t', index=False)
Пример #2
0
def main_cosmic(options):
    """Main function used to process COSMIC data.

    Loads the ``mutations`` table from the configured sqlite database,
    generates gene level features with ``futils.generate_features`` and
    saves them to a tab-delimited text file with ``gene`` as first column.

    Parameters
    ----------
    options : dict
        command line options; forwarded to ``futils.generate_features``.
        ``options['output']`` overrides the default output path taken from
        ``_utils.save_dir`` and the 'classifier' input config.

    Raises
    ------
    ValueError
        if the generated features lack a ``gene`` column
    """
    from contextlib import closing

    # get configs
    in_opts = _utils.get_input_config('classifier')
    db_cfg = _utils.get_db_config('2020plus')

    # get mutations
    sql = ("SELECT Gene, Protein_Change as AminoAcid, "
           "       DNA_Change as Nucleotide, "
           "       Variant_Classification, "
           "       Tumor_Sample, Tumor_Type "
           "FROM mutations")
    # closing() guarantees the connection is closed when the query raises
    with closing(sqlite3.connect(db_cfg['db'])) as conn:
        # frame_query no longer exists in current pandas; read_sql does
        mut_df = psql.read_sql(sql, con=conn)

    # get features for classification
    all_features = futils.generate_features(mut_df, options)

    # save features to text file
    cols = all_features.columns.tolist()
    gene_pos = cols.index('gene')
    # make the gene name the first column
    all_features = all_features[['gene'] + cols[:gene_pos] + cols[gene_pos + 1:]]
    if options['output']:
        out_path = options['output']
    else:
        out_path = _utils.save_dir + in_opts['gene_features']
    all_features.to_csv(out_path, sep='\t', index=False)
Пример #3
0
def main(cli_opts):
    """Train R's random forest on the gene feature table and save it.

    Parameters
    ----------
    cli_opts : dict
        command line options. Keys read: 'min_count', 'features',
        'other_ratio', 'driver_rate', 'ntrees', 'random_seed', 'cv' and
        'output'. With a truthy 'cv' the cross-validation variants of
        train/save are used.
    """
    cfg_opts = _utils.get_output_config('classifier')
    in_opts = _utils.get_input_config('classifier')
    minimum_ct = cli_opts['min_count']

    # an explicitly given feature file wins over the configured default
    feature_path = cli_opts['features'] or (_utils.save_dir +
                                            in_opts['gene_feature'])

    # read in features
    feature_df = pd.read_csv(feature_path, sep='\t', index_col=0)

    logger.info('Training R\'s Random forest . . .')
    rrclf = RRandomForest(feature_df,
                          other_sample_ratio=cli_opts['other_ratio'],
                          driver_sample=cli_opts['driver_rate'],
                          ntrees=cli_opts['ntrees'],
                          seed=cli_opts['random_seed'])

    # train on entire data
    use_cv = cli_opts['cv']
    train = rrclf.train_cv if use_cv else rrclf.train
    train()
    logger.info('Finished training.')

    logger.info('Saving classifier to . . .')
    # look up the save method only after training has finished
    save = rrclf.clf.save_cv if use_cv else rrclf.clf.save
    save(cli_opts['output'])
    logger.info('Finished saving classifier.')
Пример #4
0
def main(cli_opts):
    """Fit the R random forest classifier and write it to disk.

    Parameters
    ----------
    cli_opts : dict
        parsed command line options. This function reads 'min_count',
        'features', 'other_ratio', 'driver_rate', 'ntrees', 'random_seed',
        'cv' and 'output'.
    """
    cfg_opts = _utils.get_output_config('classifier')
    in_opts = _utils.get_input_config('classifier')
    minimum_ct = cli_opts['min_count']

    # get path to features used for classification
    feature_path = cli_opts['features']
    if not feature_path:
        # fall back to the feature file named in the config
        feature_path = _utils.save_dir + in_opts['gene_feature']

    # read in features
    df = pd.read_csv(feature_path, index_col=0, sep='\t')

    logger.info('Training R\'s Random forest . . .')
    forest_args = {
        'other_sample_ratio': cli_opts['other_ratio'],
        'driver_sample': cli_opts['driver_rate'],
        'ntrees': cli_opts['ntrees'],
        'seed': cli_opts['random_seed'],
    }
    rrclf = RRandomForest(df, **forest_args)

    # train on entire data
    cross_validate = cli_opts['cv']
    if not cross_validate:
        rrclf.train()
    else:
        rrclf.train_cv()
    logger.info('Finished training.')

    logger.info('Saving classifier to . . .')
    if not cross_validate:
        rrclf.clf.save(cli_opts['output'])
    else:
        rrclf.clf.save_cv(cli_opts['output'])
    logger.info('Finished saving classifier.')
Пример #5
0
def main(hypermutator_count, mut_path, db_path, no_cosmic_flag, opts):
    """Concatenates all the mutation data from tab delimited files in
    the cosmic directory. Next, saves the results to a sqlite db.

    Parameters
    ----------
    hypermutator_count : int
        remove samples with too many mutations
    mut_path : str
        Either path to directory containing contents of COSMIC's
        genes.tgz file or decompressed CosmicMutantExport.tsv.
        If empty string, just use path from config file.
    db_path : str
        path to save sqlite database. If string is empty,
        use path from config.
    no_cosmic_flag : bool
        indicates not to use cosmic mutations
    opts : dict
        additional options; 'only_genome_wide' and 'use_unknown_status'
        are forwarded to ``save_db``

    Raises
    ------
    ValueError
        if COSMIC data is requested but the path is neither an existing
        directory nor an existing file
    """
    # get input/output configurations
    in_opts = _utils.get_input_config('input')
    cosmic_path = in_opts['cosmic_path']
    out_opts = _utils.get_output_config('gene_tsv')
    out_path = out_opts['gene_tsv']
    db_opts = _utils.get_db_config('2020plus')

    # check if user specifies non standard db path
    out_db = db_path if db_path else db_opts['db']

    if no_cosmic_flag:
        # create an empty table if cosmic not wanted
        create_empty_cosmic_mutation_table(out_db)
        return

    # save info into a txt file and sqlite3 database
    cosmic_path = mut_path if mut_path else cosmic_path
    if os.path.isdir(cosmic_path):
        # directory input: concatenate all gene files into one file first
        concatenate_genes(out_path, cosmic_path)
        mutation_file, is_genes_tgz = out_path, True
    elif os.path.isfile(cosmic_path):
        # single file input is loaded directly
        mutation_file, is_genes_tgz = cosmic_path, False
    else:
        raise ValueError('Please specify a valid path to COSMIC data')

    # save database
    save_db(hypermutator_count,
            mutation_file,
            out_db,
            is_genes_tgz=is_genes_tgz,
            only_genome_wide=opts['only_genome_wide'],
            use_unknown_status=opts['use_unknown_status'])
Пример #6
0
def sample_boxplot(pred_onco,
                   pred_tsg,
                   pred_driver,
                   save_path_type,
                   save_path_driver,
                   xlabel='',
                   ylabel='',
                   title=''):
    """Create a box plot for distribution of percentage of tumor samples
    containing a non-silent mutation in different categories of genes (ie
    oncogenes, tsgs, and drivers).

    The per-gene percentages are read from the file named by the 'sample'
    output config inside ``_utils.result_dir``. A gene listed in both
    `pred_onco` and `pred_tsg` is labeled 'TSG'; genes in `pred_tsg` that
    are absent from the table are ignored.

    Parameters
    ----------
    pred_onco : list
        list of genes predicted as oncogenes
    pred_tsg : list
        list of genes predicted as tsgs
    pred_driver : list
        list of genes predicted as drivers
    save_path_type : str
        path to save figure for comparing oncogenes, tsgs, and other
    save_path_driver : str
        path to save figure for comparing drivers vs other
    xlabel : str
        x-axis label
    ylabel : str
        y-axis label
    title : str
        title of figures
    """
    cfg = _utils.get_output_config('sample')
    df = pd.read_csv(_utils.result_dir + cfg['max_gene_pct_sample_out'],
                     sep='\t',
                     index_col=0)

    # sets give constant time membership tests for the per-gene labeling
    onco_genes = set(pred_onco)
    driver_genes = set(pred_driver)

    df['Predicted Type'] = [("oncogene" if g in onco_genes else "other")
                            for g in df.index]
    # DataFrame.ix no longer exists in pandas; a boolean mask with .loc
    # labels the TSGs present in the table
    df.loc[df.index.isin(pred_tsg), 'Predicted Type'] = 'TSG'
    df['Predicted Driver'] = [("driver" if g in driver_genes else "other")
                              for g in df.index]

    # set figure labels
    if not xlabel:
        xlabel = 'Predicted Type'
    if not ylabel:
        ylabel = 'Maximum Pct of Samples for a Tumor Type'
    if not title:
        title = 'Percentage of Samples with Non-Silent Mutation'

    # plot with oncogenes, tsgs, and other
    myplt.boxplot(df,
                  by='Predicted Type',
                  column=['all mutation sample pct', 'non-silent sample pct'],
                  save_path=save_path_type,
                  xlabel=xlabel,
                  ylabel=ylabel,
                  title=title)

    # plot with drivers vs other
    myplt.boxplot(df,
                  by='Predicted Driver',
                  column=['all mutation sample pct', 'non-silent sample pct'],
                  save_path=save_path_driver,
                  xlabel=xlabel,
                  ylabel=ylabel,
                  title=title)
Пример #7
0
def retrieve_gene_features(conn, opts, get_entropy=True):
    """Retrieve gene information from the gene_features table.

    See the gene_features module to understand the gene_features
    database table.

    Parameters
    ----------
    conn : mysql/sqlite connection
        connection to db with gene_features table
    opts : dict
        flags selecting which optional columns are retrieved. Keys read:
        'gene_length', 'mutation_rate', 'replication_time', 'expression',
        'hic', 'betweeness' and 'degree'.
    get_entropy : bool
        option to toggle the use of entropy features.
        Since entropy features are read from a file in this function, it may
        induce an unnecessary dependency on previously running commands.
        To avoid this, set get_entropy=False and then compute entropy features
        separately.

    Returns
    -------
    df : pd.DataFrame
        selected gene features indexed by gene name; the gene name is also
        kept as a 'gene' column
    """
    logger.info('Retrieving features of genes . . .')

    # (command line option, column expression) pairs in query order
    optional_cols = [
        ('gene_length', 'gene_length'),
        ('mutation_rate', 'noncoding_mutation_rate'),
        ('replication_time', 'replication_time'),
        ('expression', 'expression_CCLE as expression'),
        ('hic', 'HiC_compartment'),
        ('betweeness', 'gene_betweeness'),
        ('degree', 'gene_degree'),
    ]

    # retrieve more features if specified by command line
    selected_cols = ['gene'] + [col for opt, col in optional_cols if opts[opt]]

    # get info from gene_features table
    logger.info(
        'Retrieving gene feature information from gene_features table . . . ')
    sql = "SELECT %s FROM gene_features" % ', '.join(selected_cols)
    # read_sql replaces frame_query, which current pandas no longer provides
    df = psql.read_sql(sql, conn)
    df = df.set_index('gene')
    df['gene'] = df.index
    logger.info('Finished retrieving gene features from gene_features table.')

    # fill graph stats with zeros if gene not in Biogrid
    for graph_col in ('gene_betweeness', 'gene_degree'):
        if graph_col in df.columns:
            df[graph_col] = df[graph_col].fillna(0)

    # get position entropy features
    if get_entropy:
        entropy_cfg = _utils.get_output_config('position_entropy')
        # only the missense entropy file is needed; the overall mutation
        # entropy file was previously loaded without ever being used
        missense_pos_entropy = pd.read_csv(_utils.result_dir +
                                           entropy_cfg['missense_pos_entropy'],
                                           sep='\t',
                                           index_col=0)
        # column assignment aligns the entropy values on the index labels
        for entropy_col in ('missense position entropy',
                            'pct of uniform missense entropy'):
            df[entropy_col] = missense_pos_entropy[entropy_col]

    return df
Пример #8
0
def _score_pvalues(scores):
    """Return the empirical upper-tail p-value of every distinct score.

    The result is indexed by score in descending order and holds the
    fraction of entries in `scores` that are at least as large.
    """
    counts = scores.value_counts().sort_index(ascending=False)
    return counts.cumsum() / float(counts.sum())


def _plot_mean_roc(rf_tpr, dummy_tpr, fpr, rf_auc, dummy_auc, save_path):
    """Plot the mean ROC curve of the random forest next to the dummy one."""
    rf_label = '20/20+ Classifier (AUC = %0.3f)' % rf_auc
    dummy_label = 'dummy (AUC = %0.3f)' % dummy_auc
    roc_df = pd.DataFrame({rf_label: np.mean(rf_tpr, axis=0),
                           dummy_label: np.mean(dummy_tpr, axis=0)},
                          index=fpr)
    line_style = {dummy_label: '--',
                  rf_label: '-'}
    plot_data.receiver_operator_curve(roc_df, save_path, line_style)


def _plot_mean_pr(precision, recall, rf_auc, dummy_auc, save_path, title):
    """Plot the mean precision-recall curve of the random forest."""
    rf_label = '20/20+ Classifier (AUC = %0.3f)' % rf_auc
    dummy_label = 'dummy (AUC = %0.3f)' % dummy_auc
    pr_df = pd.DataFrame({rf_label: np.mean(precision, axis=0)},
                         index=recall)
    # the dummy entry stays in the style map although no dummy curve is
    # part of the plotted data
    line_style = {dummy_label: '--',
                  rf_label: '-'}
    plot_data.precision_recall_curve(pr_df, save_path, line_style,
                                     title=title)


def main(cli_opts):
    """Apply or evaluate the R random forest classifier on gene features.

    With ``cli_opts['trained_classifier']`` set, the saved classifier is
    loaded and applied: in simulated mode the scores are converted into a
    null p-value distribution written to ``cli_opts['null_distribution']``,
    otherwise the predictions are saved and a qq plot is attempted.

    Without a trained classifier, k-fold validation is run, predictions and
    the lists of predicted oncogenes/tsgs are saved, and performance metrics
    and plots are produced on a best-effort basis: a failure there is logged
    as a warning instead of aborting the run.

    Parameters
    ----------
    cli_opts : dict
        command line options. Keys read: 'min_count', 'features',
        'simulated', 'null_distribution', 'trained_classifier',
        'other_ratio', 'driver_rate', 'ntrees', 'random_seed' and 'cv'.
    """
    cfg_opts = _utils.get_output_config('classifier')
    in_opts = _utils.get_input_config('classifier')
    minimum_ct = cli_opts['min_count']

    # get path to features used for classification
    if cli_opts['features']:
        feature_path = cli_opts['features']
    else:
        feature_path = _utils.save_dir + in_opts['gene_feature']

    # read in null distribution p-values
    if not cli_opts['simulated'] and cli_opts['null_distribution']:
        null_pvals = pd.read_csv(cli_opts['null_distribution'], sep='\t',
                                 index_col=0)
    else:
        null_pvals = None

    # read in features
    df = pd.read_csv(feature_path, sep='\t', index_col=0)

    # initialize R's random forest (needed with and without a saved model)
    logger.info('Running Random forest . . .')
    rrclf = RRandomForest(df,
                          other_sample_ratio=cli_opts['other_ratio'],
                          driver_sample=cli_opts['driver_rate'],
                          ntrees=cli_opts['ntrees'],
                          seed=cli_opts['random_seed'])

    # use trained classifier if provided
    if cli_opts['trained_classifier']:
        # load classifier depending on whether it uses CV
        is_cv = cli_opts['cv']
        if is_cv:
            rrclf.clf.load_cv(cli_opts['trained_classifier'])
        else:
            rrclf.clf.load(cli_opts['trained_classifier'])

        if cli_opts['simulated']:
            # do classification
            result_df = trained_rand_forest_pred(rrclf, df, None, null_pvals, is_cv)

            # empirical p-value of each observed score
            driver_score_pvals = _score_pvalues(result_df['driver score'])
            onco_score_pvals = _score_pvalues(result_df['oncogene score'])
            tsg_score_pvals = _score_pvalues(result_df['tsg score'])

            # construct null p-value score distribution
            score_ix = (set(driver_score_pvals.index)
                        | set(onco_score_pvals.index)
                        | set(tsg_score_pvals.index))
            score_pvals = pd.DataFrame(index=list(score_ix))
            score_pvals['oncogene p-value'] = onco_score_pvals
            score_pvals['tsg p-value'] = tsg_score_pvals
            score_pvals['driver p-value'] = driver_score_pvals
            score_pvals = score_pvals.sort_index(ascending=False)

            score_pvals.to_csv(cli_opts['null_distribution'], sep='\t',
                               index_label='score')
        else:
            # do classification
            pred_results_path = _utils.clf_result_dir + cfg_opts['rrand_forest_pred']
            logger.info('Saving results to {0}'.format(pred_results_path))
            result_df = trained_rand_forest_pred(rrclf, df, pred_results_path,
                                                 null_pvals, is_cv)
            result_df.to_csv(pred_results_path, sep='\t')

            # create qq plot (best effort, e.g. plotting may be unavailable)
            try:
                qq_plot_path = _utils.clf_plot_dir + cfg_opts['qq_plot']
                plot_data.create_qqplots(result_df, qq_plot_path)
            except Exception:
                logger.warning('Could not create qq plot', exc_info=True)

        logger.info('Finished classification.')
        return

    # analyze classification metrics
    rrclf.kfold_validation()
    rrclf_onco_tpr, rrclf_onco_fpr, rrclf_onco_mean_roc_auc = rrclf.get_onco_roc_metrics()
    rrclf_onco_precision, rrclf_onco_recall, rrclf_onco_mean_pr_auc = rrclf.get_onco_pr_metrics()
    rrclf_tsg_tpr, rrclf_tsg_fpr, rrclf_tsg_mean_roc_auc = rrclf.get_tsg_roc_metrics()
    rrclf_tsg_precision, rrclf_tsg_recall, rrclf_tsg_mean_pr_auc = rrclf.get_tsg_pr_metrics()
    rrclf_driver_precision, rrclf_driver_recall, rrclf_driver_mean_pr_auc = rrclf.get_driver_pr_metrics()
    rrclf_driver_tpr, rrclf_driver_fpr, rrclf_driver_mean_roc_auc = rrclf.get_driver_roc_metrics()

    # skip if no matplotlib
    try:
        # plot feature importance
        mean_df = rrclf.mean_importance
        std_df = rrclf.std_importance
        feat_path = _utils.clf_plot_dir + cfg_opts['r_feature_importance_plot']
        plot_data.feature_importance_barplot(mean_df, std_df, feat_path)
    except Exception:
        logger.warning('Could not plot feature importance', exc_info=True)

    # run predictions using R's random forest
    pred_results_path = _utils.clf_result_dir + cfg_opts['rrand_forest_pred']
    result_df = rand_forest_pred(rrclf, df, result_path=pred_results_path,
                                 null_dist=null_pvals)

    # select predicted genes by majority vote, or by q-value if a null
    # distribution is available
    if null_pvals is None:
        is_onco = result_df['majority vote class'] == _utils.onco_label
        is_tsg = result_df['majority vote class'] == _utils.tsg_label
        method = 'Majority vote Random forest'
    else:
        is_onco = result_df['oncogene q-value'] <= .1
        is_tsg = result_df['tsg q-value'] <= .1
        method = 'Random forest significance test'

    # novel genes are predicted genes not labeled as such in the training list
    train_class = result_df['training list class']
    pred_onco = result_df[is_onco].index.to_series()
    novel_onco = result_df[is_onco & (train_class != _utils.onco_label)].index.to_series()
    pred_tsg = result_df[is_tsg].index.to_series()
    novel_tsg = result_df[is_tsg & (train_class != _utils.tsg_label)].index.to_series()

    # save a list of oncogenes/tsgs in separate files
    gene_lists = ((pred_onco, 'rrf_onco'),
                  (novel_onco, 'rrf_novel_onco'),
                  (pred_tsg, 'rrf_tsg'),
                  (novel_tsg, 'rrf_novel_tsg'))
    for genes, cfg_key in gene_lists:
        genes.to_csv(_utils.clf_result_dir + cfg_opts[cfg_key],
                     sep='\t', index=False, header=None)
    log_str = ('{0}: {1} ({2} novel) oncogenes, '
               '{3} ({4} novel) tsg'.format(method,
                                            len(pred_onco), len(novel_onco),
                                            len(pred_tsg), len(novel_tsg)))
    logger.info(log_str)

    # save performance metrics of ROC and PR AUC. This is independent of the
    # plots, so it must not be skipped just because plotting is unavailable.
    try:
        save_path = _utils.clf_result_dir + cfg_opts['performance']
        logger.info('Saving performance metrics ({0}) . . .'.format(save_path))
        metrics = [['TSG', rrclf_tsg_mean_roc_auc, rrclf_tsg_mean_pr_auc],
                   ['OG', rrclf_onco_mean_roc_auc, rrclf_onco_mean_pr_auc],
                   ['Driver', rrclf_driver_mean_roc_auc, rrclf_driver_mean_pr_auc]]
        perf_df = pd.DataFrame(metrics, columns=['Type', 'ROC AUC', 'PR AUC'])
        perf_df.to_csv(save_path, sep='\t', index=False)
    except Exception:
        logger.warning('Could not save performance metrics', exc_info=True)

    # only plot if matplotlib
    try:
        # plot r random forest results
        plot_data.prob_scatter(result_df,
                               plot_path=_utils.clf_plot_dir + cfg_opts['rrand_forest_plot'],
                               title='Sub-sampled Random Forest Predictions')
        plot_data.prob_kde(result_df,
                           col_name='oncogene score',
                           save_path=_utils.clf_plot_dir + cfg_opts['onco_kde_rrand_forest'],
                           title='Distribution of Oncogene Scores (sub-sampled random forest)')
        plot_data.prob_kde(result_df,
                           col_name='tsg score',
                           save_path=_utils.clf_plot_dir + cfg_opts['tsg_kde_rrand_forest'],
                           title='Distribution of TSG Scores (sub-sampled random forest)')
        logger.info('Finished running sub-sampled Random Forest')

        # dummy classifier, predict most frequent
        logger.debug('Running Dummy Classifier. . .')
        dclf = DummyClf(df,
                        strategy='most_frequent',
                        min_ct=minimum_ct,
                        weight=False)
        dclf.kfold_validation()
        dclf_onco_tpr, dclf_onco_fpr, dclf_onco_mean_roc_auc = dclf.get_onco_roc_metrics()
        dclf_onco_precision, dclf_onco_recall, dclf_onco_mean_pr_auc = dclf.get_onco_pr_metrics()
        dclf_tsg_tpr, dclf_tsg_fpr, dclf_tsg_mean_roc_auc = dclf.get_tsg_roc_metrics()
        dclf_tsg_precision, dclf_tsg_recall, dclf_tsg_mean_pr_auc = dclf.get_tsg_pr_metrics()
        dclf_driver_tpr, dclf_driver_fpr, dclf_driver_mean_roc_auc = dclf.get_driver_roc_metrics()
        logger.debug('Finished dummy classifier.')

        # plot oncogene, tsg and driver roc figures
        _plot_mean_roc(rrclf_onco_tpr, dclf_onco_tpr, rrclf_onco_fpr,
                       rrclf_onco_mean_roc_auc, dclf_onco_mean_roc_auc,
                       _utils.clf_plot_dir + cfg_opts['roc_plot_oncogene'])
        _plot_mean_roc(rrclf_tsg_tpr, dclf_tsg_tpr, rrclf_tsg_fpr,
                       rrclf_tsg_mean_roc_auc, dclf_tsg_mean_roc_auc,
                       _utils.clf_plot_dir + cfg_opts['roc_plot_tsg'])
        _plot_mean_roc(rrclf_driver_tpr, dclf_driver_tpr, rrclf_driver_fpr,
                       rrclf_driver_mean_roc_auc, dclf_driver_mean_roc_auc,
                       _utils.clf_plot_dir + cfg_opts['roc_plot_driver'])

        # plot oncogene pr figure
        _plot_mean_pr(rrclf_onco_precision, rrclf_onco_recall,
                      rrclf_onco_mean_pr_auc, dclf_onco_mean_pr_auc,
                      _utils.clf_plot_dir + cfg_opts['pr_plot_oncogene'],
                      'Oncogene Precision-Recall Curve')

        # plot tsg pr figure
        _plot_mean_pr(rrclf_tsg_precision, rrclf_tsg_recall,
                      rrclf_tsg_mean_pr_auc, dclf_tsg_mean_pr_auc,
                      _utils.clf_plot_dir + cfg_opts['pr_plot_tsg'],
                      'TSG Precision-Recall Curve')

        # plot driver gene pr figure
        # NOTE(review): dummy driver PR metrics are never computed, so the
        # dummy style label reuses the TSG PR AUC here, as it did before.
        _plot_mean_pr(rrclf_driver_precision, rrclf_driver_recall,
                      rrclf_driver_mean_pr_auc, dclf_tsg_mean_pr_auc,
                      _utils.clf_plot_dir + cfg_opts['pr_plot_driver'],
                      'Driver Precision-Recall Curve')

        # make qq plot
        qq_plot_path = _utils.clf_plot_dir + cfg_opts['qq_plot']
        plot_data.create_qqplots(result_df, qq_plot_path)
    except Exception:
        logger.warning('Skipping remaining classifier plots', exc_info=True)
Пример #9
0
def sample_boxplot(pred_onco,
                   pred_tsg,
                   pred_driver,
                   save_path_type,
                   save_path_driver,
                   xlabel='',
                   ylabel='',
                   title=''):
    """Create box plots of the percentage of tumor samples containing a
    non-silent mutation, grouped by predicted gene category.

    Two figures are made from the per-gene table found in
    ``_utils.result_dir`` (file name taken from the 'sample' output config):
    one grouped by oncogene/TSG/other and one grouped by driver/other.
    TSG labels override oncogene labels, and genes in `pred_tsg` that are
    not in the table are skipped.

    Parameters
    ----------
    pred_onco : list
        list of genes predicted as oncogenes
    pred_tsg : list
        list of genes predicted as tsgs
    pred_driver : list
        list of genes predicted as drivers
    save_path_type : str
        path to save figure for comparing oncogenes, tsgs, and other
    save_path_driver : str
        path to save figure for comparing drivers vs other
    xlabel : str
        x-axis label
    ylabel : str
        y-axis label
    title : str
        title of figures
    """
    cfg = _utils.get_output_config('sample')
    df = pd.read_csv(_utils.result_dir + cfg['max_gene_pct_sample_out'],
                     sep='\t', index_col=0)

    # build the lookup sets once instead of scanning the lists per gene
    onco_lookup = set(pred_onco)
    driver_lookup = set(pred_driver)
    df['Predicted Type'] = [("oncogene" if g in onco_lookup else "other") for g in df.index]
    # .ix is not available in current pandas; select TSG rows with a mask
    tsg_rows = df.index.isin(pred_tsg)
    df.loc[tsg_rows, 'Predicted Type'] = 'TSG'
    df['Predicted Driver'] = [("driver" if g in driver_lookup else "other") for g in df.index]

    # set figure labels
    xlabel = xlabel or 'Predicted Type'
    ylabel = ylabel or 'Maximum Pct of Samples for a Tumor Type'
    title = title or 'Percentage of Samples with Non-Silent Mutation'

    # first plot: oncogenes, tsgs, and other; second plot: drivers vs other
    figures = (('Predicted Type', save_path_type),
               ('Predicted Driver', save_path_driver))
    for group_col, fig_path in figures:
        myplt.boxplot(df,
                      by=group_col,
                      column=['all mutation sample pct', 'non-silent sample pct'],
                      save_path=fig_path,
                      xlabel=xlabel,
                      ylabel=ylabel,
                      title=title)
Пример #10
0
def retrieve_gene_features(conn, opts,
                           get_entropy=True):
    """Retrieve gene information from the gene_features table.

    See the gene_features module to understand the gene_features
    database table.

    Parameters
    ----------
    conn : mysql/sqlite connection
        connection to db with gene_features table
    opts : dict
        options for getting info. Each of the keys 'gene_length',
        'mutation_rate', 'replication_time', 'expression', 'hic',
        'betweeness' and 'degree' switches one column on when truthy.
    get_entropy : bool
        option to toggle the use of entropy features.
        Since entropy features are read from a file in this function, it may
        induce an unnecessary dependency on previously running commands.
        To avoid this, set get_entropy=False and then compute entropy features
        separately.

    Returns
    -------
    df : pd.DataFrame
        requested gene features, indexed by gene name and also carrying the
        gene name in a 'gene' column
    """
    logger.info('Retrieving features of genes . . .')

    selected_cols = ['gene']

    # retrieve more features if specified by command line
    column_for_option = (
        ('gene_length', 'gene_length'),
        ('mutation_rate', 'noncoding_mutation_rate'),
        ('replication_time', 'replication_time'),
        ('expression', 'expression_CCLE as expression'),
        ('hic', 'HiC_compartment'),
        ('betweeness', 'gene_betweeness'),
        ('degree', 'gene_degree'),
    )
    for opt_name, column in column_for_option:
        if opts[opt_name]:
            selected_cols.append(column)

    # get info from gene_features table
    logger.info('Retrieving gene feature information from gene_features table . . . ')
    sql = "SELECT %s FROM gene_features" % ', '.join(selected_cols)
    # frame_query is gone from current pandas; read_sql is its replacement
    df = psql.read_sql(sql, conn)
    df = df.set_index('gene')
    df['gene'] = df.index
    logger.info('Finished retrieving gene features from gene_features table.')

    # fill graph stats with zeros if gene not in Biogrid
    if 'gene_betweeness' in df.columns:
        df['gene_betweeness'] = df['gene_betweeness'].fillna(0)
    if 'gene_degree' in df.columns:
        df['gene_degree'] = df['gene_degree'].fillna(0)

    # get position entropy features
    if get_entropy:
        entropy_cfg = _utils.get_output_config('position_entropy')
        # the overall mutation entropy file is not loaded anymore since
        # none of its columns were used
        missense_pos_entropy = pd.read_csv(_utils.result_dir + entropy_cfg['missense_pos_entropy'],
                                           sep='\t', index_col=0)
        # values are matched to genes through the index of both tables
        df['missense position entropy'] = missense_pos_entropy['missense position entropy']
        df['pct of uniform missense entropy'] = missense_pos_entropy['pct of uniform missense entropy']

    return df