示例#1
0
def main(opts, mutation_df=None, frameshift_df=None):
    """Run the randomization-based test and post-process its result table.

    Parameters
    ----------
    opts : dict
        Options. This function reads ``'kind'``, ``'output'`` and, for the
        ``'oncogene'`` kind, ``'num_iterations'``. ``opts['output']`` is
        overwritten with ``''`` before ``rt.main`` is called and is left
        that way.
    mutation_df : optional
        Forwarded unchanged to ``rt.main``.
    frameshift_df : optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    result_df : pd.DataFrame
        Post-processed results, indexed by the ``'gene'`` column (which is
        also kept as a regular column).

    Raises
    ------
    ValueError
        If ``opts['kind']`` is not one of the supported kinds.
    """
    # p-value / q-value column names produced for each supported test kind
    pval_columns = {
        'tsg': ('inactivating p-value', 'inactivating BH q-value'),
        'effect': ('entropy-on-effect p-value',
                   'entropy-on-effect BH q-value'),
        'oncogene': ('entropy p-value', 'entropy BH q-value'),
        'protein': ('normalized graph-smoothed position entropy p-value',
                    'normalized graph-smoothed position entropy BH q-value'),
    }
    kind = opts['kind']
    if kind not in pval_columns:
        # fail before the expensive test instead of hitting an unbound
        # local variable afterwards
        raise ValueError('Unsupported kind: {0!r}'.format(kind))
    p_val_col, q_val_col = pval_columns[kind]

    # get output file; blank it so rt.main receives an empty output path
    myoutput_path = opts['output']
    opts['output'] = ''

    # perform randomization-based test
    result_df = rt.main(opts, mutation_df)

    # clean up p-values for combined p-value calculation
    result_df[p_val_col] = result_df[p_val_col].fillna(1)
    result_df[q_val_col] = result_df[q_val_col].fillna(1)

    if kind == 'tsg':
        # drop genes that never occur
        no_ssvs = (result_df['Total SNV Mutations'] == 0)
        result_df = result_df[~no_ssvs]

        result_df = result_df.sort_values(by=p_val_col)
    elif kind == 'oncogene':
        # keep mutated genes only; copy so the column assignments below act
        # on an independent frame rather than on a slice of the original
        result_df = result_df[result_df['Total Mutations'] > 0].copy()

        # get FDR
        result_df['entropy BH q-value'] = mypval.bh_fdr(
            result_df['entropy p-value'])

        # combine p-values; p-values of exactly zero are replaced by
        # 1 / num_iterations in temporary columns before combining
        min_pval = 1. / opts['num_iterations']
        result_df['tmp entropy p-value'] = result_df['entropy p-value']
        result_df['tmp vest p-value'] = result_df['vest p-value']
        result_df.loc[result_df['entropy p-value'] == 0,
                      'tmp entropy p-value'] = min_pval
        result_df.loc[result_df['vest p-value'] == 0,
                      'tmp vest p-value'] = min_pval
        result_df['combined p-value'] = result_df[[
            'tmp entropy p-value', 'tmp vest p-value'
        ]].apply(mypval.fishers_method, axis=1)
        result_df['combined BH q-value'] = mypval.bh_fdr(
            result_df['combined p-value'])
        del result_df['tmp vest p-value']
        del result_df['tmp entropy p-value']

    if myoutput_path:
        # write output if specified
        result_df.to_csv(myoutput_path, sep='\t', index=False)

    result_df = result_df.set_index('gene', drop=False)

    return result_df
def main(opts,
         mutation_df=None,
         frameshift_df=None):
    """Run the randomization-based test and post-process its result table.

    Parameters
    ----------
    opts : dict
        Options. This function reads ``'kind'``, ``'output'`` and, for the
        ``'oncogene'`` kind, ``'num_iterations'``. ``opts['output']`` is
        overwritten with ``''`` before ``rt.main`` is called and is left
        that way.
    mutation_df : optional
        Forwarded unchanged to ``rt.main``.
    frameshift_df : optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    result_df : pd.DataFrame
        Post-processed results, indexed by the ``'gene'`` column (which is
        also kept as a regular column).

    Raises
    ------
    ValueError
        If ``opts['kind']`` is not one of the supported kinds.
    """
    # p-value / q-value column names produced for each supported test kind
    pval_columns = {
        'tsg': ('inactivating p-value', 'inactivating BH q-value'),
        'effect': ('entropy-on-effect p-value',
                   'entropy-on-effect BH q-value'),
        'oncogene': ('entropy p-value', 'entropy BH q-value'),
        'protein': ('normalized graph-smoothed position entropy p-value',
                    'normalized graph-smoothed position entropy BH q-value'),
        'hotmaps1d': ('p-value', 'q-value'),
    }
    kind = opts['kind']
    if kind not in pval_columns:
        # fail before the expensive test instead of hitting an unbound
        # local variable afterwards
        raise ValueError('Unsupported kind: {0!r}'.format(kind))
    p_val_col, q_val_col = pval_columns[kind]

    # get output file; blank it so rt.main receives an empty output path
    myoutput_path = opts['output']
    opts['output'] = ''

    # perform randomization-based test
    result_df = rt.main(opts, mutation_df)

    # clean up p-values for combined p-value calculation
    result_df[p_val_col] = result_df[p_val_col].fillna(1)
    result_df[q_val_col] = result_df[q_val_col].fillna(1)

    if kind == 'tsg':
        # drop genes that never occur
        no_ssvs = (result_df['Total SNV Mutations'] == 0)
        result_df = result_df[~no_ssvs]

        result_df = result_df.sort_values(by=p_val_col)
    elif kind == 'oncogene':
        # keep mutated genes only; copy so the column assignments below act
        # on an independent frame rather than on a slice of the original
        result_df = result_df[result_df['Total Mutations'] > 0].copy()

        # get FDR
        result_df['entropy BH q-value'] = mypval.bh_fdr(
            result_df['entropy p-value'])

        # combine p-values; p-values of exactly zero are replaced by
        # 1 / num_iterations in temporary columns before combining
        min_pval = 1. / opts['num_iterations']
        result_df['tmp entropy p-value'] = result_df['entropy p-value']
        result_df['tmp vest p-value'] = result_df['vest p-value']
        result_df.loc[result_df['entropy p-value'] == 0,
                      'tmp entropy p-value'] = min_pval
        result_df.loc[result_df['vest p-value'] == 0,
                      'tmp vest p-value'] = min_pval
        result_df['combined p-value'] = result_df[[
            'tmp entropy p-value', 'tmp vest p-value'
        ]].apply(mypval.fishers_method, axis=1)
        result_df['combined BH q-value'] = mypval.bh_fdr(
            result_df['combined p-value'])
        del result_df['tmp vest p-value']
        del result_df['tmp entropy p-value']

    if myoutput_path:
        # write output if specified
        result_df.to_csv(myoutput_path, sep='\t', index=False)

    result_df = result_df.set_index('gene', drop=False)

    return result_df
def handle_oncogene_results(permutation_result, non_tested_genes,
                            num_permutations):
    """Format oncogene permutation results as a DataFrame.

    Adds Benjamini-Hochberg q-values for the entropy and vest p-values,
    combines the two p-values per gene with ``mypval.fishers_method`` and
    sorts the rows by the combined p-value.

    Parameters
    ----------
    permutation_result : list
        output from multiprocess_permutation; each row supplies, in order,
        gene, num recurrent, position entropy, mean vest score,
        entropy p-value, vest p-value, Total Mutations, Unmapped to Ref Tx
    non_tested_genes
        Accepted for interface compatibility; not used by this function.
    num_permutations : int
        p-values of exactly zero are replaced by ``1 / num_permutations``
        before the two p-values are combined.

    Returns
    -------
    permutation_df : pd.DataFrame
        formatted output suitable to save, indexed by gene
    """
    mycols = [
        'gene', 'num recurrent', 'position entropy', 'mean vest score',
        'entropy p-value', 'vest p-value', 'Total Mutations',
        'Unmapped to Ref Tx'
    ]
    permutation_df = pd.DataFrame(permutation_result, columns=mycols)

    # get benjamani hochberg adjusted p-values
    permutation_df['entropy BH q-value'] = mypval.bh_fdr(
        permutation_df['entropy p-value'])
    permutation_df['vest BH q-value'] = mypval.bh_fdr(
        permutation_df['vest p-value'])

    # combine p-values: zero p-values are floored in temporary columns, and
    # the floored columns (not the raw ones) are what gets combined
    min_pval = 1. / num_permutations
    permutation_df['tmp entropy p-value'] = permutation_df['entropy p-value']
    permutation_df['tmp vest p-value'] = permutation_df['vest p-value']
    permutation_df.loc[permutation_df['entropy p-value'] == 0,
                       'tmp entropy p-value'] = min_pval
    permutation_df.loc[permutation_df['vest p-value'] == 0,
                       'tmp vest p-value'] = min_pval
    permutation_df['combined p-value'] = permutation_df[[
        'tmp entropy p-value', 'tmp vest p-value'
    ]].apply(mypval.fishers_method, axis=1)
    permutation_df['combined BH q-value'] = mypval.bh_fdr(
        permutation_df['combined p-value'])
    del permutation_df['tmp vest p-value']
    del permutation_df['tmp entropy p-value']

    # order output
    permutation_df = permutation_df.set_index(
        'gene', drop=False)  # make sure genes are indices
    permutation_df['num recurrent'] = permutation_df['num recurrent'].fillna(
        -1).astype(int)  # missing counts become -1 so the column can be int
    col_order = [
        'gene', 'Total Mutations', 'Unmapped to Ref Tx', 'num recurrent',
        'position entropy', 'mean vest score', 'entropy p-value',
        'vest p-value', 'combined p-value', 'entropy BH q-value',
        'vest BH q-value', 'combined BH q-value'
    ]
    permutation_df = permutation_df.sort_values(by=['combined p-value'])
    return permutation_df[col_order]
def handle_hotmaps_results(permutation_result):
    """Format hotmaps permutation results as a DataFrame.

    Benjamini-Hochberg q-values are computed separately for each distinct
    value of ``'window length'``, and the rows are sorted by window length
    and then p-value.

    Parameters
    ----------
    permutation_result : list
        output from multiprocess_permutation. Must be non-empty: the length
        of the first row decides whether an ``'index'`` column is present
        (6 fields: no index column; otherwise the 7-column layout is used).

    Returns
    -------
    permutation_df : pd.DataFrame
        formatted output suitable to save
    """
    if len(permutation_result[0]) == 6:
        mycols = ['gene', 'window length', 'codon position', 'mutation count',
                  'windowed sum', 'p-value']
    else:
        mycols = ['gene', 'window length', 'codon position', 'index',
                  'mutation count', 'windowed sum', 'p-value']

    permutation_df = pd.DataFrame(permutation_result, columns=mycols)

    # get benjamani hochberg adjusted p-values, one correction per window
    # length. The column starts out as float so that writing the adjusted
    # values through .loc does not assign floats into an integer column.
    permutation_df['q-value'] = 1.0
    for w in permutation_df['window length'].unique():
        is_window = permutation_df['window length'] == w
        permutation_df.loc[is_window, 'q-value'] = mypval.bh_fdr(
            permutation_df.loc[is_window, 'p-value'])

    # order output
    col_order = mycols + ['q-value']
    permutation_df = permutation_df.sort_values(
        by=['window length', 'p-value'])
    return permutation_df[col_order]
def handle_effect_results(permutation_result):
    """Format entropy-on-effect permutation results as a DataFrame.

    Parameters
    ----------
    permutation_result : list
        output from multiprocess_permutation; field 4 of each row is the
        entropy-on-effect p-value (may be None)

    Returns
    -------
    permutation_df : pd.DataFrame
        formatted output suitable to save, indexed by gene
    """
    pval_col = 'entropy-on-effect p-value'
    qval_col = 'entropy-on-effect BH q-value'

    def _pvalue_or_last(row):
        # rows without a p-value sort after every real p-value
        pval = row[4]
        return 1.1 if pval is None else pval

    rows_by_pvalue = sorted(permutation_result, key=_pvalue_or_last)
    permutation_df = pd.DataFrame(
        rows_by_pvalue,
        columns=['gene', 'num recurrent', 'num inactivating',
                 'entropy-on-effect', pval_col,
                 'Total Mutations', 'Unmapped to Ref Tx'])

    # Benjamini-Hochberg adjustment of the p-values
    adjusted = mypval.bh_fdr(permutation_df[pval_col])
    permutation_df[qval_col] = adjusted

    # genes become the index while remaining available as a column
    permutation_df = permutation_df.set_index('gene', drop=False)

    # missing recurrent counts are stored as -1 so the column can be int
    recurrent = permutation_df['num recurrent'].fillna(-1)
    permutation_df['num recurrent'] = recurrent.astype(int)

    output_cols = ['gene', 'Total Mutations', 'Unmapped to Ref Tx',
                   'num recurrent', 'num inactivating', 'entropy-on-effect',
                   pval_col, qval_col]
    return permutation_df[output_cols]
def handle_protein_results(permutation_result):
    """Format protein permutation results as a DataFrame.

    Parameters
    ----------
    permutation_result : list
        output from multiprocess_permutation

    Returns
    -------
    permutation_df : pd.DataFrame
        formatted output suitable to save, indexed by gene and sorted by
        the entropy p-value
    """
    stat_col = 'normalized graph-smoothed position entropy'
    pval_col = 'normalized graph-smoothed position entropy p-value'
    qval_col = 'normalized graph-smoothed position entropy BH q-value'

    permutation_df = pd.DataFrame(
        permutation_result,
        columns=['gene', 'num recurrent', stat_col, pval_col,
                 'Total Mutations', 'Unmapped to Ref Tx'])

    # Benjamini-Hochberg adjustment of the p-values
    adjusted = mypval.bh_fdr(permutation_df[pval_col])
    permutation_df[qval_col] = adjusted

    # genes become the index while remaining available as a column
    permutation_df = permutation_df.set_index('gene', drop=False)

    # most significant genes first
    permutation_df = permutation_df.sort_values(by=[pval_col])

    output_cols = ['gene', 'Total Mutations', 'Unmapped to Ref Tx',
                   'num recurrent', stat_col, pval_col, qval_col]
    return permutation_df[output_cols]
def handle_hotmaps_results(permutation_result):
    """Format hotmaps permutation results as a DataFrame.

    Parameters
    ----------
    permutation_result : list
        output from multiprocess_permutation. The length of the first row
        decides the layout: 5 fields means there is no 'index' column,
        anything else uses the layout that includes it.

    Returns
    -------
    permutation_df : pd.DataFrame
        formatted output suitable to save, sorted by p-value
    """
    mycols = ['gene', 'codon position', 'mutation count', 'windowed sum',
              'p-value']
    if len(permutation_result[0]) != 5:
        # wider rows carry an extra 'index' field after the codon position
        mycols.insert(2, 'index')

    permutation_df = pd.DataFrame(permutation_result, columns=mycols)

    # Benjamini-Hochberg adjustment of the p-values
    adjusted = mypval.bh_fdr(permutation_df['p-value'])
    permutation_df['q-value'] = adjusted

    # most significant positions first, q-value as the last column
    permutation_df = permutation_df.sort_values(by=['p-value'])
    output_cols = mycols + ['q-value']
    return permutation_df[output_cols]
def handle_oncogene_results(permutation_result, num_permutations):
    """Format oncogene permutation results as a DataFrame.

    Adds Benjamini-Hochberg q-values for the entropy and vest p-values,
    combines the two p-values per gene with ``mypval.fishers_method`` and
    sorts the rows by the combined p-value.

    Parameters
    ----------
    permutation_result : list
        output from multiprocess_permutation; each row supplies, in order,
        gene, num recurrent, position entropy, mean vest score,
        entropy p-value, vest p-value, Total Mutations, Unmapped to Ref Tx
    num_permutations : int
        p-values of exactly zero are replaced by ``1 / num_permutations``
        before the two p-values are combined.

    Returns
    -------
    permutation_df : pd.DataFrame
        formatted output suitable to save, indexed by gene
    """
    mycols = ['gene', 'num recurrent', 'position entropy',
              'mean vest score', 'entropy p-value',
              'vest p-value', 'Total Mutations', 'Unmapped to Ref Tx']
    permutation_df = pd.DataFrame(permutation_result, columns=mycols)

    # get benjamani hochberg adjusted p-values
    permutation_df['entropy BH q-value'] = mypval.bh_fdr(
        permutation_df['entropy p-value'])
    permutation_df['vest BH q-value'] = mypval.bh_fdr(
        permutation_df['vest p-value'])

    # combine p-values: zero p-values are floored in temporary columns, and
    # the floored columns (not the raw ones) are what gets combined
    min_pval = 1. / num_permutations
    permutation_df['tmp entropy p-value'] = permutation_df['entropy p-value']
    permutation_df['tmp vest p-value'] = permutation_df['vest p-value']
    permutation_df.loc[permutation_df['entropy p-value'] == 0,
                       'tmp entropy p-value'] = min_pval
    permutation_df.loc[permutation_df['vest p-value'] == 0,
                       'tmp vest p-value'] = min_pval
    permutation_df['combined p-value'] = permutation_df[[
        'tmp entropy p-value', 'tmp vest p-value'
    ]].apply(mypval.fishers_method, axis=1)
    permutation_df['combined BH q-value'] = mypval.bh_fdr(
        permutation_df['combined p-value'])
    del permutation_df['tmp vest p-value']
    del permutation_df['tmp entropy p-value']

    # order output
    permutation_df = permutation_df.set_index(
        'gene', drop=False)  # make sure genes are indices
    permutation_df['num recurrent'] = permutation_df['num recurrent'].fillna(
        -1).astype(int)  # missing counts become -1 so the column can be int
    col_order = ['gene', 'Total Mutations', 'Unmapped to Ref Tx',
                 'num recurrent', 'position entropy',
                 'mean vest score', 'entropy p-value',
                 'vest p-value', 'combined p-value', 'entropy BH q-value',
                 'vest BH q-value', 'combined BH q-value']
    permutation_df = permutation_df.sort_values(by=['combined p-value'])
    return permutation_df[col_order]
def handle_tsg_results(permutation_result):
    """Format TSG permutation results as a DataFrame.

    Parameters
    ----------
    permutation_result : list
        output from multiprocess_permutation; field 2 of each row is the
        inactivating p-value (may be None)

    Returns
    -------
    permutation_df : pd.DataFrame
        formatted output suitable to save, indexed by gene
    """
    pval_col = 'inactivating p-value'
    qval_col = 'inactivating BH q-value'

    def _pvalue_or_last(row):
        # rows without a p-value sort after every real p-value
        pval = row[2]
        return 1.1 if pval is None else pval

    rows_by_pvalue = sorted(permutation_result, key=_pvalue_or_last)
    permutation_df = pd.DataFrame(
        rows_by_pvalue,
        columns=['gene', 'inactivating count', pval_col,
                 'Total SNV Mutations', 'SNVs Unmapped to Ref Tx'])
    permutation_df[pval_col] = permutation_df[pval_col].astype(float)

    # Benjamini-Hochberg adjustment, restricted to rows that have a
    # p-value; all other rows keep NaN as their q-value
    has_pval = permutation_df[pval_col].notnull()
    permutation_df[qval_col] = np.nan
    permutation_df.loc[has_pval, qval_col] = mypval.bh_fdr(
        permutation_df.loc[has_pval, pval_col])

    # sort by p-value in descending order, then reverse the row order
    descending = permutation_df.sort_values(by=pval_col, ascending=False)
    permutation_df = descending.iloc[::-1]

    # genes become the index while remaining available as a column
    permutation_df = permutation_df.set_index('gene', drop=False)
    output_cols = ['gene', 'Total SNV Mutations', 'SNVs Unmapped to Ref Tx',
                   'inactivating count', pval_col, qval_col]
    return permutation_df[output_cols]
def handle_protein_results(permutation_result):
    """Build the formatted result table for the protein test.

    Parameters
    ----------
    permutation_result : list
        output from multiprocess_permutation

    Returns
    -------
    permutation_df : pd.DataFrame
        formatted output suitable to save, indexed by gene and sorted by
        the entropy p-value
    """
    entropy = 'normalized graph-smoothed position entropy'
    entropy_pval = 'normalized graph-smoothed position entropy p-value'
    entropy_qval = 'normalized graph-smoothed position entropy BH q-value'

    input_cols = ['gene', 'num recurrent', entropy, entropy_pval,
                  'Total Mutations', 'Unmapped to Ref Tx']
    permutation_df = pd.DataFrame(permutation_result, columns=input_cols)

    # Benjamini-Hochberg adjusted p-values
    permutation_df[entropy_qval] = mypval.bh_fdr(permutation_df[entropy_pval])

    # index by gene (column kept), then order rows by p-value
    indexed_df = permutation_df.set_index('gene', drop=False)
    sorted_df = indexed_df.sort_values(by=[entropy_pval])

    final_cols = ['gene', 'Total Mutations', 'Unmapped to Ref Tx',
                  'num recurrent', entropy, entropy_pval, entropy_qval]
    return sorted_df[final_cols]
def handle_tsg_results(permutation_result):
    """Build the formatted result table for the TSG test.

    Parameters
    ----------
    permutation_result : list
        output from multiprocess_permutation; field 2 of each row is the
        inactivating p-value (may be None)

    Returns
    -------
    permutation_df : pd.DataFrame
        formatted output suitable to save, indexed by gene
    """
    p_col = 'inactivating p-value'
    q_col = 'inactivating BH q-value'
    input_cols = ['gene', 'inactivating count', p_col,
                  'Total SNV Mutations', 'SNVs Unmapped to Ref Tx']

    def _sort_key(record):
        # a missing p-value is ranked as 1.1, i.e. after any real p-value
        return record[2] if record[2] is not None else 1.1

    ordered_records = sorted(permutation_result, key=_sort_key)
    permutation_df = pd.DataFrame(ordered_records, columns=input_cols)
    permutation_df[p_col] = permutation_df[p_col].astype(float)

    # Benjamini-Hochberg adjusted p-values for the rows that have a
    # p-value; every other row is left with a NaN q-value
    tested = permutation_df[p_col].notnull()
    permutation_df[q_col] = np.nan
    permutation_df.loc[tested, q_col] = mypval.bh_fdr(
        permutation_df.loc[tested, p_col])

    # descending sort on the p-value followed by a full row reversal
    permutation_df = permutation_df.sort_values(by=p_col, ascending=False)
    permutation_df = permutation_df.iloc[::-1]

    # index by gene while keeping the gene column
    permutation_df = permutation_df.set_index('gene', drop=False)
    final_cols = ['gene', 'Total SNV Mutations', 'SNVs Unmapped to Ref Tx',
                  'inactivating count', p_col, q_col]
    return permutation_df[final_cols]
def handle_effect_results(permutation_result):
    """Build the formatted result table for the entropy-on-effect test.

    Parameters
    ----------
    permutation_result : list
        output from multiprocess_permutation; field 4 of each row is the
        entropy-on-effect p-value (may be None)

    Returns
    -------
    permutation_df : pd.DataFrame
        formatted output suitable to save, indexed by gene
    """
    p_col = 'entropy-on-effect p-value'
    q_col = 'entropy-on-effect BH q-value'
    input_cols = ['gene', 'num recurrent', 'num inactivating',
                  'entropy-on-effect', p_col,
                  'Total Mutations', 'Unmapped to Ref Tx']

    def _sort_key(record):
        # a missing p-value is ranked as 1.1, i.e. after any real p-value
        return record[4] if record[4] is not None else 1.1

    ordered_records = sorted(permutation_result, key=_sort_key)
    permutation_df = pd.DataFrame(ordered_records, columns=input_cols)

    # Benjamini-Hochberg adjusted p-values
    permutation_df[q_col] = mypval.bh_fdr(permutation_df[p_col])

    # index by gene while keeping the gene column
    permutation_df = permutation_df.set_index('gene', drop=False)

    # store missing recurrent counts as -1 so the column can be integer
    filled_counts = permutation_df['num recurrent'].fillna(-1)
    permutation_df['num recurrent'] = filled_counts.astype(int)

    final_cols = ['gene', 'Total Mutations', 'Unmapped to Ref Tx',
                  'num recurrent', 'num inactivating', 'entropy-on-effect',
                  p_col, q_col]
    return permutation_df[final_cols]