Exemplo n.º 1
0
def position_permutation(obs_stat,
                         context_counts,
                         context_to_mut,
                         seq_context,
                         gene_seq,
                         gene_vest=None,
                         num_permutations=10000,
                         stop_criteria=100,
                         pseudo_count=0,
                         max_batch=25000):
    """Performs null-permutations for position-based mutation statistics
    in a single gene.

    Parameters
    ----------
    obs_stat : tuple, (recur ct, entropy, delta entropy, mean vest)
        tuple containing the observed statistics. Only the entropy and
        the vest entries are compared against the null distribution.
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    gene_vest : object or None, default: None
        VEST scores for the gene, forwarded to scores.compute_vest_stat.
        If falsy, the vest statistic of every permutation is 0.0.
    num_permutations : int, default: 10000
        number of permutations to create for null
    stop_criteria : int
        stop simulating once both the entropy and the vest null counts
        have reached stop_criteria.
    pseudo_count : int, default: 0
        Pseudo-count for number of recurrent missense mutations for each
        permutation for the null distribution. Increasing pseudo_count
        makes the statistical test more stringent.
    max_batch : int, default: 25000
        maximum number of permutations requested from
        seq_context.random_pos in a single call.

    Returns
    -------
    ent_pval : float
        fraction of simulations whose position entropy was less than or
        equal to the observed entropy (within utils.epsilon)
    vest_pval : float
        fraction of simulations whose vest statistic was greater than or
        equal to the observed one (within utils.epsilon)

    Notes
    -----
    If no simulation is performed (e.g. num_permutations <= 0) both
    p-values are reported as 1.0 instead of dividing by zero.
    """
    # nothing to simulate: report non-significant p-values
    if num_permutations <= 0:
        return 1.0, 1.0

    # get contexts and somatic base
    somatic_base = [base
                    for one_context in context_counts.index.tolist()
                    for base in context_to_mut[one_context]]

    # calculate the # of batches for simulations
    max_batch = min(num_permutations, max_batch)
    num_batches, remainder = divmod(num_permutations, max_batch)
    batch_sizes = [max_batch] * num_batches
    if remainder:
        batch_sizes.append(remainder)

    _, obs_ent, _, obs_vest = obs_stat
    num_sim = 0  # number of simulations actually evaluated
    null_entropy_ct, null_vest_ct = 0, 0
    for batch_size in batch_sizes:
        # stop iterations if reached sufficient precision
        if null_vest_ct >= stop_criteria and null_entropy_ct >= stop_criteria:
            break

        # get random positions determined by sequence context
        # (Series.items() replaces iteritems(), removed in pandas 2.0)
        tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                                batch_size)
        tmp_mut_pos = np.hstack([pos_array
                                 for base, pos_array in tmp_contxt_pos])

        # calculate position-based statistics as a result of random positions
        # (presumably one row per simulated permutation -- TODO confirm)
        for row in tmp_mut_pos:
            # count each simulation as it is evaluated so that an empty
            # batch never reuses a stale loop index
            num_sim += 1

            # get info about mutations
            tmp_mut_info = mc.get_aa_mut_info(row,
                                              somatic_base,
                                              gene_seq)

            # calculate position info
            _, tmp_entropy, _, _ = cutils.calc_pos_info(
                tmp_mut_info['Codon Pos'],
                tmp_mut_info['Reference AA'],
                tmp_mut_info['Somatic AA'],
                pseudo_count=pseudo_count,
                is_obs=0)

            # get vest scores
            if gene_vest:
                tmp_vest = scores.compute_vest_stat(
                    gene_vest,
                    tmp_mut_info['Reference AA'],
                    tmp_mut_info['Somatic AA'],
                    tmp_mut_info['Codon Pos'])
            else:
                tmp_vest = 0.0

            # update empirical null distribution counts
            if tmp_entropy - utils.epsilon <= obs_ent:
                null_entropy_ct += 1
            if tmp_vest + utils.epsilon >= obs_vest:
                null_vest_ct += 1

            # stop iterations if reached sufficient precision
            if null_vest_ct >= stop_criteria and null_entropy_ct >= stop_criteria:
                break

    # no simulation was run (e.g. stop_criteria <= 0): avoid dividing by zero
    if num_sim == 0:
        return 1.0, 1.0

    # calculate p-value from empirical null-distribution
    ent_pval = float(null_entropy_ct) / num_sim
    vest_pval = float(null_vest_ct) / num_sim

    return ent_pval, vest_pval
Exemplo n.º 2
0
def position_permutation(obs_stat,
                         context_counts,
                         context_to_mut,
                         seq_context,
                         gene_seq,
                         gene_vest=None,
                         num_permutations=10000,
                         stop_criteria=100,
                         pseudo_count=0,
                         max_batch=25000):
    """Performs null-permutations for position-based mutation statistics
    in a single gene.

    Parameters
    ----------
    obs_stat : tuple, (recur ct, entropy, delta entropy, mean vest)
        tuple containing the observed statistics. Only the entropy and
        the vest entries are compared against the null distribution.
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    gene_vest : object or None, default: None
        VEST scores for the gene, forwarded to scores.compute_vest_stat.
        If falsy, the vest statistic of every permutation is 0.0.
    num_permutations : int, default: 10000
        number of permutations to create for null
    stop_criteria : int
        stop simulating once both the entropy and the vest null counts
        have reached stop_criteria.
    pseudo_count : int, default: 0
        Pseudo-count for number of recurrent missense mutations for each
        permutation for the null distribution. Increasing pseudo_count
        makes the statistical test more stringent.
    max_batch : int, default: 25000
        maximum number of permutations requested from
        seq_context.random_pos in a single call.

    Returns
    -------
    ent_pval : float
        fraction of simulations whose position entropy was less than or
        equal to the observed entropy (within utils.epsilon)
    vest_pval : float
        fraction of simulations whose vest statistic was greater than or
        equal to the observed one (within utils.epsilon)

    Notes
    -----
    If no simulation is performed (e.g. num_permutations <= 0) both
    p-values are reported as 1.0 instead of dividing by zero.
    """
    # nothing to simulate: report non-significant p-values
    if num_permutations <= 0:
        return 1.0, 1.0

    # get contexts and somatic base
    somatic_base = [base
                    for one_context in context_counts.index.tolist()
                    for base in context_to_mut[one_context]]

    # calculate the # of batches for simulations
    max_batch = min(num_permutations, max_batch)
    num_batches, remainder = divmod(num_permutations, max_batch)
    batch_sizes = [max_batch] * num_batches
    if remainder:
        batch_sizes.append(remainder)

    _, obs_ent, _, obs_vest = obs_stat
    num_sim = 0  # number of simulations actually evaluated
    null_entropy_ct, null_vest_ct = 0, 0
    for batch_size in batch_sizes:
        # stop iterations if reached sufficient precision
        if null_vest_ct >= stop_criteria and null_entropy_ct >= stop_criteria:
            break

        # get random positions determined by sequence context
        # (Series.items() replaces iteritems(), removed in pandas 2.0)
        tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                                batch_size)
        # np.hstack needs a real sequence; generators are rejected by
        # recent NumPy releases, so materialize a list first
        tmp_mut_pos = np.hstack([pos_array
                                 for base, pos_array in tmp_contxt_pos])

        # calculate position-based statistics as a result of random positions
        # (presumably one row per simulated permutation -- TODO confirm)
        for row in tmp_mut_pos:
            # count each simulation as it is evaluated so that an empty
            # batch never reuses a stale loop index
            num_sim += 1

            # get info about mutations
            tmp_mut_info = mc.get_aa_mut_info(row,
                                              somatic_base,
                                              gene_seq)

            # calculate position info
            _, tmp_entropy, _, _ = cutils.calc_pos_info(
                tmp_mut_info['Codon Pos'],
                tmp_mut_info['Reference AA'],
                tmp_mut_info['Somatic AA'],
                pseudo_count=pseudo_count,
                is_obs=0)

            # get vest scores
            if gene_vest:
                tmp_vest = scores.compute_vest_stat(
                    gene_vest,
                    tmp_mut_info['Reference AA'],
                    tmp_mut_info['Somatic AA'],
                    tmp_mut_info['Codon Pos'])
            else:
                tmp_vest = 0.0

            # update empirical null distribution counts
            if tmp_entropy - utils.epsilon <= obs_ent:
                null_entropy_ct += 1
            if tmp_vest + utils.epsilon >= obs_vest:
                null_vest_ct += 1

            # stop iterations if reached sufficient precision
            if null_vest_ct >= stop_criteria and null_entropy_ct >= stop_criteria:
                break

    # no simulation was run (e.g. stop_criteria <= 0): avoid dividing by zero
    if num_sim == 0:
        return 1.0, 1.0

    # calculate p-value from empirical null-distribution
    ent_pval = float(null_entropy_ct) / num_sim
    vest_pval = float(null_vest_ct) / num_sim

    return ent_pval, vest_pval
Exemplo n.º 3
0
def calc_position_p_value(mut_info, unmapped_mut_info, sc, gs, bed, score_dir,
                          num_permutations, stop_thresh, pseudo_count,
                          min_recurrent, min_fraction):
    """Compute position-based statistics and permutation p-values for a gene.

    Parameters
    ----------
    mut_info : table of mapped mutations (used like a pd.DataFrame)
        needs 'Coding Position' and 'Tumor_Allele' columns. Modified in
        place: 'Coding Position' is cast to int and a 'Context' column
        is added.
    unmapped_mut_info : dict
        needs the keys 'Context', 'Tumor_Allele', 'Codon Pos',
        'Reference AA' and 'Somatic AA'; its values are appended to the
        ones derived from mut_info.
    sc : sequence context object
        provides the pos2context lookup; forwarded to the permutation.
    gs : gene sequence object
        forwarded to mc.get_aa_mut_info and to the permutation.
    bed : object with a gene_name attribute
    score_dir : str or None
        if truthy, VEST scores are read with scores.read_vest_pickle.
    num_permutations, stop_thresh, pseudo_count
        forwarded to pm.position_permutation.
    min_recurrent, min_fraction
        forwarded to cutils.calc_pos_info as min_recur / min_frac.

    Returns
    -------
    list
        [gene name, number of recurrent mutations, position entropy,
        vest score, entropy p-value, vest p-value]
    """
    # no mutations at all: report null statistics and p-values of one
    if len(mut_info) == 0:
        return [bed.gene_name, 0, 0, 0.0, 1.0, 1.0]

    def _context_at(pos):
        return sc.pos2context[pos]

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(_context_at)

    # pool mapped and unmapped mutations, then group them by context
    wanted = ['Context', 'Tumor_Allele']
    unmapped_df = pd.DataFrame(unmapped_mut_info)
    pooled = pd.concat([mut_info[wanted], unmapped_df[wanted]])
    context_cts = pooled['Context'].value_counts()
    context_to_mutations = {
        ctx: members['Tumor_Allele']
        for ctx, members in pooled.groupby('Context')
    }

    # VEST scores are only looked up when a score directory was given
    gene_vest = None
    if score_dir:
        gene_vest = scores.read_vest_pickle(bed.gene_name, score_dir)
        if gene_vest is None:
            message = 'Could not find VEST scores for {0}, skipping . . .'
            logger.warning(message.format(bed.gene_name))

    # amino acid level info for the observed mutations
    aa_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                 mut_info['Tumor_Allele'].tolist(),
                                 gs)
    all_codon_pos = aa_info['Codon Pos'] + unmapped_mut_info['Codon Pos']
    all_ref_aa = aa_info['Reference AA'] + unmapped_mut_info['Reference AA']
    all_somatic_aa = aa_info['Somatic AA'] + unmapped_mut_info['Somatic AA']
    pos_stats = cutils.calc_pos_info(all_codon_pos,
                                     all_ref_aa,
                                     all_somatic_aa,
                                     min_frac=min_fraction,
                                     min_recur=min_recurrent)
    num_recurrent, pos_ent, delta_pos_ent, _pos_ct = pos_stats

    # observed vest statistic (computed from the mapped mutations only)
    vest_score = scores.compute_vest_stat(gene_vest,
                                          aa_info['Reference AA'],
                                          aa_info['Somatic AA'],
                                          aa_info['Codon Pos'])

    # simulate the null distribution to obtain both p-values
    ent_p_value, vest_p_value = pm.position_permutation(
        (num_recurrent, pos_ent, delta_pos_ent, vest_score),
        context_cts,
        context_to_mutations,
        sc,
        gs,
        gene_vest,
        num_permutations,
        stop_thresh,
        pseudo_count)

    return [bed.gene_name, num_recurrent, pos_ent, vest_score,
            ent_p_value, vest_p_value]
Exemplo n.º 4
0
def calc_position_p_value(mut_info,
                          unmapped_mut_info,
                          sc,
                          gs,
                          bed,
                          score_dir,
                          num_permutations,
                          stop_thresh,
                          pseudo_count,
                          min_recurrent,
                          min_fraction):
    """Compute position-based statistics and permutation p-values for a gene.

    Parameters
    ----------
    mut_info : table of mapped mutations (used like a pd.DataFrame)
        needs 'Coding Position' and 'Tumor_Allele' columns. Modified in
        place: 'Coding Position' is cast to int and a 'Context' column
        is added.
    unmapped_mut_info : dict
        needs the keys 'Context', 'Tumor_Allele', 'Codon Pos',
        'Reference AA' and 'Somatic AA'; its values are appended to the
        ones derived from mut_info.
    sc : sequence context object
        provides the pos2context lookup; forwarded to the permutation.
    gs : gene sequence object
        forwarded to mc.get_aa_mut_info and to the permutation.
    bed : object with a gene_name attribute
    score_dir : str or None
        if truthy, VEST scores are read with scores.read_vest_pickle.
    num_permutations, stop_thresh, pseudo_count
        forwarded to pm.position_permutation.
    min_recurrent, min_fraction
        forwarded to cutils.calc_pos_info as min_recur / min_frac.

    Returns
    -------
    list
        [gene name, number of recurrent mutations, position entropy,
        vest score, entropy p-value, vest p-value]
    """
    # values reported when the gene has no mutations to evaluate
    num_recurrent, pos_ent, vest_score = 0, 0, 0.0
    ent_p_value, vest_p_value = 1.0, 1.0

    if len(mut_info) > 0:
        int_positions = mut_info['Coding Position'].astype(int)
        mut_info['Coding Position'] = int_positions
        mut_info['Context'] = mut_info['Coding Position'].apply(
            lambda position: sc.pos2context[position])

        # group mutations (mapped and unmapped) by context
        group_cols = ['Context', 'Tumor_Allele']
        unmapped_frame = pd.DataFrame(unmapped_mut_info)
        all_muts = pd.concat([mut_info[group_cols],
                              unmapped_frame[group_cols]])
        context_cts = all_muts['Context'].value_counts()
        context_to_mutations = {}
        for context_name, context_group in all_muts.groupby('Context'):
            context_to_mutations[context_name] = context_group['Tumor_Allele']

        # get vest scores for gene if directory provided
        if not score_dir:
            gene_vest = None
        else:
            gene_vest = scores.read_vest_pickle(bed.gene_name, score_dir)
            if gene_vest is None:
                warning_text = 'Could not find VEST scores for {0}, skipping . . .'.format(bed.gene_name)
                logger.warning(warning_text)

        # get recurrent info for actual mutations
        observed_aa = mc.get_aa_mut_info(mut_info['Coding Position'],
                                         mut_info['Tumor_Allele'].tolist(),
                                         gs)
        codon_pos, ref_aa, somatic_aa = [
            observed_aa[key] + unmapped_mut_info[key]
            for key in ('Codon Pos', 'Reference AA', 'Somatic AA')
        ]
        num_recurrent, pos_ent, delta_pos_ent, _ = cutils.calc_pos_info(
            codon_pos,
            ref_aa,
            somatic_aa,
            min_frac=min_fraction,
            min_recur=min_recurrent)

        # get vest score for actual mutations (mapped ones only)
        vest_score = scores.compute_vest_stat(gene_vest,
                                              observed_aa['Reference AA'],
                                              observed_aa['Somatic AA'],
                                              observed_aa['Codon Pos'])

        # perform simulations to get p-value
        ent_p_value, vest_p_value = pm.position_permutation(
            (num_recurrent, pos_ent, delta_pos_ent, vest_score),
            context_cts,
            context_to_mutations,
            sc,  # sequence context obj
            gs,  # gene sequence obj
            gene_vest,
            num_permutations,
            stop_thresh,
            pseudo_count)

    return [bed.gene_name, num_recurrent, pos_ent, vest_score,
            ent_p_value, vest_p_value]