def test_ctnnb1_get_aa_mut_info():
    """Check the codon annotation of a substitution at the first CTNNB1 coding base.

    Loads the CTNNB1 FASTA and BED fixtures located under ``file_dir``,
    substitutes a 'C' at coding position 0 and verifies the reference codon,
    the mutated codon and the codon position reported by
    ``mc.get_aa_mut_info``.
    """
    import pysam
    from prob2020.python.gene_sequence import GeneSequence

    # read fasta
    ctnnb1_fasta = os.path.join(file_dir, 'data/CTNNB1.fa')
    gene_fa = pysam.Fastafile(ctnnb1_fasta)
    try:
        gs = GeneSequence(gene_fa, nuc_context=1)

        # read CTNNB1 bed file; only the first record is used
        ctnnb1_bed = os.path.join(file_dir, 'data/CTNNB1.bed')
        bed_list = list(utils.bed_generator(ctnnb1_bed))
        gs.set_gene(bed_list[0])

        # specify mutation: coding position 0 mutated to 'C'
        coding_pos = [0]
        somatic_base = ['C']

        # check mutation info
        aa_info = mc.get_aa_mut_info(coding_pos, somatic_base, gs)
        ref_codon = aa_info['Reference Codon'][0]
        ref_codon_msg = 'First codon should be start codon ({0})'.format(
            ref_codon)
        assert ref_codon == 'ATG', ref_codon_msg
        assert aa_info['Somatic Codon'][0] == 'CTG', \
            'First "A" should be replaced with a "C"'
        assert aa_info['Codon Pos'][0] == 0, \
            'Start codon should be position 0'
    finally:
        # release the FASTA handle even when an assertion above fails
        gene_fa.close()
Exemplo n.º 2
0
def annotate_maf(coding_pos, somatic_base, gene_seq):
    """Build MAF-like annotation rows for a set of single-base substitutions.

    Parameters
    ----------
    coding_pos : sequence of int
        positions within the gene's coding sequence; used as keys into
        ``gene_seq.bed.seqpos2genome`` (0-based internally, per the
        reporting logic below).
    somatic_base : sequence of str
        mutated base for each entry of `coding_pos`.
    gene_seq : GeneSequence
        gene sequence object; its ``bed`` attribute supplies gene name,
        strand, chromosome and the sequence-to-genome position map.

    Returns
    -------
    list of list
        one row per mutation:
        ``[gene, strand, chrom, start, end, ref, alt, dna_change,
        protein_change, variant_class]``.
    """
    positions = np.array(coding_pos)

    # gene level information
    bed = gene_seq.bed
    gene_name, strand, chrom = bed.gene_name, bed.strand, bed.chrom
    bed.init_genome_coordinates()  # builds the seq pos -> genome map

    # translate every coding position to a genome coordinate (+1 offset)
    def _to_genome(seq_pos):
        return gene_seq.bed.seqpos2genome[seq_pos] + 1

    genome_coord = np.vectorize(_to_genome)(positions)

    # codon / amino acid consequences of each substitution
    mut_info = mc.get_aa_mut_info(positions, somatic_base, gene_seq)
    var_class = cutils.get_variant_classification(mut_info['Reference AA'],
                                                  mut_info['Somatic AA'],
                                                  mut_info['Codon Pos'])

    on_minus_strand = strand == '-'
    rows = []
    for idx, alt_base in enumerate(somatic_base):
        # DNA/protein changes are reported 1-based although the internal
        # representation is 0-based
        ref_base = mut_info['Reference Nuc'][idx]
        dna_change = 'c.{0}{1}>{2}'.format(ref_base, positions[idx] + 1,
                                           alt_base)

        aa_index = mut_info['Codon Pos'][idx]
        if aa_index is None:
            aa_index_1_based = None
        else:
            aa_index_1_based = aa_index + 1
        protein_change = 'p.{0}{1}{2}'.format(mut_info['Reference AA'][idx],
                                              aa_index_1_based,
                                              mut_info['Somatic AA'][idx])

        # the change strings above use the coding-strand bases; the
        # ref/alt columns are reverse complemented for '-' strand genes
        if on_minus_strand:
            ref_base = utils.rev_comp(ref_base)
            alt_base = utils.rev_comp(alt_base)

        rows.append([
            gene_name, strand, chrom, genome_coord[idx], genome_coord[idx],
            ref_base, alt_base, dna_change, protein_change, var_class[idx]
        ])

    return rows
Exemplo n.º 3
0
def annotate_maf(coding_pos, somatic_base, gene_seq):
    """Describe each substitution as a MAF-style row.

    Parameters
    ----------
    coding_pos : sequence of int
        coding-sequence positions of the mutations (keys of
        ``gene_seq.bed.seqpos2genome``).
    somatic_base : sequence of str
        substituted base, one per position.
    gene_seq : GeneSequence
        sequence object whose ``bed`` attribute carries the gene name,
        strand, chromosome and coordinate mapping.

    Returns
    -------
    list of list
        rows of the form ``[gene, strand, chrom, start, end, ref, alt,
        dna_change, protein_change, variant_class]``.
    """
    pos_arr = np.array(coding_pos)

    # gene description taken from the BED record
    bed_line = gene_seq.bed
    gene_name = bed_line.gene_name
    strand = bed_line.strand
    chrom = bed_line.chrom
    bed_line.init_genome_coordinates()  # map seq pos to genome

    # genome coordinate for every mutated position (map value + 1)
    def _genome_position(seq_pos):
        return gene_seq.bed.seqpos2genome[seq_pos] + 1

    to_genome = np.vectorize(_genome_position)
    genome_coord = to_genome(pos_arr)

    # amino acid level information and variant classification
    aa_info = mc.get_aa_mut_info(pos_arr, somatic_base, gene_seq)
    ref_aas = aa_info['Reference AA']
    som_aas = aa_info['Somatic AA']
    codon_positions = aa_info['Codon Pos']
    var_class = cutils.get_variant_classification(ref_aas,
                                                  som_aas,
                                                  codon_positions)

    maf_rows = []
    for i, new_base in enumerate(somatic_base):
        # positions are stored 0-based but reported 1-based
        old_base = aa_info['Reference Nuc'][i]
        dna_change = 'c.{0}{1}>{2}'.format(old_base, pos_arr[i] + 1, new_base)

        cpos = codon_positions[i]
        reported_cpos = None if cpos is None else cpos + 1
        protein_change = 'p.{0}{1}{2}'.format(ref_aas[i], reported_cpos,
                                              som_aas[i])

        # ref/alt columns are reverse complemented for '-' strand genes
        if strand == '-':
            old_base = utils.rev_comp(old_base)
            new_base = utils.rev_comp(new_base)

        start = end = genome_coord[i]
        maf_rows.append([gene_name, strand, chrom, start, end,
                         old_base, new_base, dna_change,
                         protein_change, var_class[i]])

    return maf_rows
Exemplo n.º 4
0
def calc_effect_p_value(mut_info, unmapped_mut_info, sc, gs, bed,
                        num_permutations, pseudo_count, min_recurrent,
                        min_fraction):
    """Compute the permutation p-value of the mutation-effect entropy of a gene.

    `mut_info` is indexed like a DataFrame with 'Coding Position' and
    'Tumor_Allele' columns; a 'Context' column is added to it in place.
    `unmapped_mut_info` is converted with ``pd.DataFrame`` and its
    'Codon Pos', 'Reference AA' and 'Somatic AA' entries are appended to
    the mapped ones.

    Returns
    -------
    list
        ``[gene_name, num_recur, num_inactivating, effect_ent, ent_p_value]``;
        zero counts and a p-value of 1.0 when `mut_info` is empty.
    """
    # nothing observed: report the neutral result immediately
    if len(mut_info) == 0:
        return [bed.gene_name, 0, 0, 0, 1.0]

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(
        lambda pos: sc.pos2context[pos])

    # pool mapped and unmapped mutations, then group them by context
    wanted = ['Context', 'Tumor_Allele']
    unmapped_df = pd.DataFrame(unmapped_mut_info)
    pooled = pd.concat([mut_info[wanted], unmapped_df[wanted]])
    context_cts = pooled['Context'].value_counts()
    context_to_mutations = {
        ctx: grp['Tumor_Allele']
        for ctx, grp in pooled.groupby('Context')
    }

    # null distribution; only the entropy list is used below
    null_entropies, _null_recur, _null_inactivating = pm.effect_permutation(
        context_cts,
        context_to_mutations,
        sc,  # sequence context obj
        gs,  # gene sequence obj
        num_permutations,
        pseudo_count)

    # effect statistics of the observed mutations
    observed = mc.get_aa_mut_info(mut_info['Coding Position'],
                                  mut_info['Tumor_Allele'].tolist(), gs)
    all_codon_pos = observed['Codon Pos'] + unmapped_mut_info['Codon Pos']
    all_ref_aa = observed['Reference AA'] + unmapped_mut_info['Reference AA']
    all_som_aa = observed['Somatic AA'] + unmapped_mut_info['Somatic AA']
    effect_ent, num_recur, num_inactivating = cutils.calc_effect_info(
        all_codon_pos,
        all_ref_aa,
        all_som_aa,
        min_frac=min_fraction,
        min_recur=min_recurrent)

    # fraction of null entropies that are <= observed (within epsilon)
    hits = sum(1 for null_ent in null_entropies
               if null_ent - utils.epsilon <= effect_ent)
    ent_p_value = hits / float(num_permutations)

    return [
        bed.gene_name, num_recur, num_inactivating, effect_ent, ent_p_value
    ]
Exemplo n.º 5
0
def calc_effect_p_value(mut_info,
                        unmapped_mut_info,
                        sc,
                        gs,
                        bed,
                        num_permutations,
                        pseudo_count,
                        min_recurrent,
                        min_fraction):
    """Permutation test on the entropy of mutation effects for one gene.

    The observed effect entropy (from ``cutils.calc_effect_info``) is
    compared with the null entropies produced by ``pm.effect_permutation``.
    `mut_info` gains a 'Context' column and has its 'Coding Position'
    column cast to int as a side effect.

    Returns
    -------
    list
        ``[gene_name, num_recur, num_inactivating, effect_ent, ent_p_value]``.
        When `mut_info` has no rows the counts are 0 and the p-value is 1.0.
    """
    has_mutations = len(mut_info) > 0
    if not has_mutations:
        num_recur, num_inactivating, effect_ent, ent_p_value = 0, 0, 0, 1.0
        return [bed.gene_name, num_recur, num_inactivating,
                effect_ent, ent_p_value]

    def _context_of(position):
        return sc.pos2context[position]

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(_context_of)

    # group mutations (mapped + unmapped) by nucleotide context
    key_cols = ['Context', 'Tumor_Allele']
    combined = pd.concat([mut_info[key_cols],
                          pd.DataFrame(unmapped_mut_info)[key_cols]])
    context_cts = combined['Context'].value_counts()
    context_to_mutations = {}
    for context_name, context_group in combined.groupby('Context'):
        context_to_mutations[context_name] = context_group['Tumor_Allele']

    # simulate the null distribution
    null_result = pm.effect_permutation(context_cts,
                                        context_to_mutations,
                                        sc,  # sequence context obj
                                        gs,  # gene sequence obj
                                        num_permutations,
                                        pseudo_count)
    null_entropy, _recur_null, _inactivating_null = null_result

    # summarize the mutations that were actually observed
    mapped = mc.get_aa_mut_info(mut_info['Coding Position'],
                                mut_info['Tumor_Allele'].tolist(),
                                gs)
    codon_pos = mapped['Codon Pos'] + unmapped_mut_info['Codon Pos']
    ref_aa = mapped['Reference AA'] + unmapped_mut_info['Reference AA']
    somatic_aa = mapped['Somatic AA'] + unmapped_mut_info['Somatic AA']
    observed_stats = cutils.calc_effect_info(codon_pos,
                                             ref_aa,
                                             somatic_aa,
                                             min_frac=min_fraction,
                                             min_recur=min_recurrent)
    effect_ent, num_recur, num_inactivating = observed_stats

    # count null entropies not exceeding the observed one (epsilon slack)
    num_at_most_observed = 0
    for null_ent in null_entropy:
        if null_ent - utils.epsilon <= effect_ent:
            num_at_most_observed += 1
    ent_p_value = num_at_most_observed / float(num_permutations)

    return [bed.gene_name, num_recur, num_inactivating,
            effect_ent, ent_p_value]
Exemplo n.º 6
0
def non_silent_ratio_permutation(context_counts,
                                 context_to_mut,
                                 seq_context,
                                 gene_seq,
                                 num_permutations=10000):
    """Performs null-permutations for non-silent ratio across all genes.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        number of permutations to create for null

    Returns
    -------
    non_silent_count_list : list of tuples
        list of non-silent and silent mutation counts under the null
    """
    # somatic bases flattened in the same context order as context_counts
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # get random positions determined by sequence context
    # NOTE: Series.iteritems() was removed in pandas 2.0; items() yields the
    # same (index, value) pairs and exists in older versions as well.
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    tmp_mut_pos = np.hstack([pos_array for base, pos_array in tmp_contxt_pos])

    # determine result of random positions; each row of tmp_mut_pos is
    # passed as the positions of one permutation
    non_silent_count_list = []
    for row in tmp_mut_pos:
        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row,
                                          somatic_base,
                                          gene_seq)

        # calc non-silent/silent mutation info
        tmp_non_silent = cutils.calc_non_silent_info(tmp_mut_info['Reference AA'],
                                                     tmp_mut_info['Somatic AA'],
                                                     tmp_mut_info['Codon Pos'])
        non_silent_count_list.append(tmp_non_silent)
    return non_silent_count_list
Exemplo n.º 7
0
def non_silent_ratio_permutation(context_counts,
                                 context_to_mut,
                                 seq_context,
                                 gene_seq,
                                 num_permutations=10000):
    """Performs null-permutations for non-silent ratio across all genes.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        number of permutations to create for null

    Returns
    -------
    non_silent_count_list : list of tuples
        list of non-silent and silent mutation counts under the null
    """
    # somatic bases flattened in the same context order as context_counts
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # get random positions determined by sequence context
    # NOTE: Series.iteritems() was removed in pandas 2.0; items() yields the
    # same (index, value) pairs and exists in older versions as well.
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    # np.hstack needs a real sequence of arrays: passing a generator is
    # deprecated since NumPy 1.16 and rejected by recent releases.
    tmp_mut_pos = np.hstack([pos_array for base, pos_array in tmp_contxt_pos])

    # determine result of random positions; each row of tmp_mut_pos is
    # passed as the positions of one permutation
    non_silent_count_list = []
    for row in tmp_mut_pos:
        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row,
                                          somatic_base,
                                          gene_seq)

        # calc non-silent/silent mutation info
        tmp_non_silent = cutils.calc_non_silent_info(tmp_mut_info['Reference AA'],
                                                     tmp_mut_info['Somatic AA'],
                                                     tmp_mut_info['Codon Pos'])
        non_silent_count_list.append(tmp_non_silent)
    return non_silent_count_list
def test_ctnnb1_get_aa_mut_info():
    """Verify ``mc.get_aa_mut_info`` on the first coding base of CTNNB1.

    Uses the CTNNB1 FASTA and BED fixtures under ``file_dir``, mutates
    coding position 0 to 'C' and checks the reference codon, the somatic
    codon and the reported codon position.
    """
    import pysam
    from prob2020.python.gene_sequence import GeneSequence

    # read fasta
    ctnnb1_fasta = os.path.join(file_dir, 'data/CTNNB1.fa')
    gene_fa = pysam.Fastafile(ctnnb1_fasta)
    try:
        gs = GeneSequence(gene_fa, nuc_context=1)

        # read CTNNB1 bed file; the first record defines the gene
        ctnnb1_bed = os.path.join(file_dir, 'data/CTNNB1.bed')
        bed_list = list(utils.bed_generator(ctnnb1_bed))
        gs.set_gene(bed_list[0])

        # specify mutation
        coding_pos = [0]
        somatic_base = ['C']

        # check mutation info
        aa_info = mc.get_aa_mut_info(coding_pos, somatic_base, gs)
        first_ref_codon = aa_info['Reference Codon'][0]
        ref_codon_msg = 'First codon should be start codon ({0})'.format(
            first_ref_codon)
        assert first_ref_codon == 'ATG', ref_codon_msg
        assert aa_info['Somatic Codon'][0] == 'CTG', \
            'First "A" should be replaced with a "C"'
        assert aa_info['Codon Pos'][0] == 0, \
            'Start codon should be position 0'
    finally:
        # close the FASTA handle whether or not the checks passed
        gene_fa.close()
Exemplo n.º 9
0
def calc_position_p_value(mut_info, unmapped_mut_info, sc, gs, bed, score_dir,
                          num_permutations, stop_thresh, pseudo_count,
                          min_recurrent, min_fraction):
    """Compute position-entropy and VEST-score permutation p-values for a gene.

    `mut_info` is indexed like a DataFrame ('Coding Position',
    'Tumor_Allele'); its 'Coding Position' column is cast to int and a
    'Context' column is added in place. Entries of `unmapped_mut_info` are
    pooled with the mapped mutations for the positional statistics, while
    the observed VEST statistic uses the mapped mutations only.

    Returns
    -------
    list
        ``[gene_name, num_recurrent, pos_ent, vest_score, ent_p_value,
        vest_p_value]``; zeros and p-values of 1.0 when `mut_info` is empty.
    """
    # no mutations: nothing to simulate
    if len(mut_info) == 0:
        return [bed.gene_name, 0, 0, 0.0, 1.0, 1.0]

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(
        lambda pos: sc.pos2context[pos])

    # group mutations by context
    wanted = ['Context', 'Tumor_Allele']
    unmapped_df = pd.DataFrame(unmapped_mut_info)
    pooled = pd.concat([mut_info[wanted], unmapped_df[wanted]])
    context_cts = pooled['Context'].value_counts()
    context_to_mutations = {
        ctx: grp['Tumor_Allele']
        for ctx, grp in pooled.groupby('Context')
    }

    # VEST scores are optional and only read when a directory is given
    gene_vest = None
    if score_dir:
        gene_vest = scores.read_vest_pickle(bed.gene_name, score_dir)
        if gene_vest is None:
            logger.warning(
                'Could not find VEST scores for {0}, skipping . . .'.
                format(bed.gene_name))

    # positional statistics of the observed mutations
    observed = mc.get_aa_mut_info(mut_info['Coding Position'],
                                  mut_info['Tumor_Allele'].tolist(), gs)
    all_codon_pos = observed['Codon Pos'] + unmapped_mut_info['Codon Pos']
    all_ref_aa = observed['Reference AA'] + unmapped_mut_info['Reference AA']
    all_som_aa = observed['Somatic AA'] + unmapped_mut_info['Somatic AA']
    pos_info = cutils.calc_pos_info(all_codon_pos,
                                    all_ref_aa,
                                    all_som_aa,
                                    min_frac=min_fraction,
                                    min_recur=min_recurrent)
    num_recurrent, pos_ent, delta_pos_ent, _pos_ct = pos_info

    # VEST statistic of the observed (mapped) mutations
    vest_score = scores.compute_vest_stat(gene_vest,
                                          observed['Reference AA'],
                                          observed['Somatic AA'],
                                          observed['Codon Pos'])

    # perform simulations to get p-values
    ent_p_value, vest_p_value = pm.position_permutation(
        (num_recurrent, pos_ent, delta_pos_ent, vest_score),
        context_cts,
        context_to_mutations,
        sc,  # sequence context obj
        gs,  # gene sequence obj
        gene_vest,
        num_permutations,
        stop_thresh,
        pseudo_count)

    return [
        bed.gene_name, num_recurrent, pos_ent, vest_score, ent_p_value,
        vest_p_value
    ]
def singleprocess_permutation(info):
    """Accumulate observed and permuted mutation summary counts over genes.

    Parameters
    ----------
    info : tuple
        ``(bed_list, mut_df, opts)``. `bed_list` holds the genes to process
        (the chromosome of the first one is used for logging), `mut_df` is
        the mutation table and `opts` is a dict read with the keys
        'num_permutations', 'input', 'context', 'by_sample' and 'score_dir'
        (it is also forwarded to ``mc.compute_mutation_context``).

    Returns
    -------
    result : list of list
        one row per permutation holding counts summed over all genes
        (7 columns, or 9 when ``opts['score_dir']`` is set).
    obs_result : list or pd.DataFrame
        observed counts summed over all genes; a per-sample DataFrame when
        ``opts['by_sample']`` is set, otherwise a flat list.
    """
    bed_list, mut_df, opts = info
    current_chrom = bed_list[0].chrom
    logger.info('Working on chromosome: {0} . . .'.format(current_chrom))
    num_permutations = opts['num_permutations']
    gene_fa = pysam.Fastafile(opts['input'])
    gs = GeneSequence(gene_fa, nuc_context=opts['context'])

    # variables for recording the actual observed number of non-silent
    # vs. silent mutations
    if not opts['by_sample']:
        obs_silent = 0
        obs_non_silent = 0
        obs_nonsense = 0
        obs_loststop = 0
        obs_splice_site = 0
        obs_loststart = 0
        obs_missense = 0
        obs_vest = 0
        obs_mga_entropy = 0
    else:
        # NOTE(review): `cols` is not defined in this function; presumably
        # a module-level list of column names -- confirm it exists.
        uniq_samp = mut_df['Tumor_Sample'].unique()
        obs_df = pd.DataFrame(np.zeros((len(uniq_samp), len(cols))),
                              index=uniq_samp,
                              columns=cols)

    # go through each gene to perform simulation
    # `result` has one accumulator row per permutation: 7 count columns,
    # plus 2 more when a score directory is provided
    if opts['score_dir']:
        result = [[0, 0, 0, 0, 0, 0, 0, 0, 0] for k in range(num_permutations)]
    else:
        result = [[0, 0, 0, 0, 0, 0, 0] for k in range(num_permutations)]
    for bed in bed_list:
        # compute context counts and somatic bases for each context
        # (note that `gs` is rebound to the object returned here)
        gene_tuple = mc.compute_mutation_context(bed, gs, mut_df, opts)
        context_cts, context_to_mutations, mutations_df, gs, sc = gene_tuple

        if context_to_mutations:
            ## get information about observed non-silent counts
            # get info about mutations
            tmp_mut_info = mc.get_aa_mut_info(
                mutations_df['Coding Position'],
                mutations_df['Tumor_Allele'].tolist(), gs)
            # update the observed count
            if not opts['by_sample']:
                # calc deleterious mutation info
                #tmp_non_silent = cutils.calc_non_silent_info(tmp_mut_info['Reference AA'],
                #tmp_mut_info['Somatic AA'],
                #tmp_mut_info['Codon Pos'])
                # calc mutation info summarizing observed mutations
                # (min_frac / min_recur are fixed here instead of being
                # taken from opts, see the commented-out keywords)
                tmp_result = cutils.calc_summary_info(
                    tmp_mut_info['Reference AA'],
                    tmp_mut_info['Somatic AA'],
                    tmp_mut_info['Codon Pos'],
                    bed.gene_name,
                    opts['score_dir'],
                    #min_frac=opts['fraction'],
                    min_frac=0.0,
                    #min_recur=opts['recurrent']
                    min_recur=3)
                # the first seven entries are added to the observed counters
                # in the order given by the variable names below
                obs_non_silent += tmp_result[0]
                obs_silent += tmp_result[1]
                obs_nonsense += tmp_result[2]
                obs_loststop += tmp_result[3]
                obs_splice_site += tmp_result[4]
                obs_loststart += tmp_result[5]
                obs_missense += tmp_result[6]
                if opts['score_dir']:
                    # score related entries are read from the end of the row
                    obs_vest += tmp_result[-2]
                    obs_mga_entropy += tmp_result[-3]
            else:
                for tsamp in mutations_df['Tumor_Sample'].unique():
                    # positional indices of this sample's mutations
                    ixs = np.where(mutations_df['Tumor_Sample'] == tsamp)[0]
                    ref_aa = [
                        r for i, r in enumerate(tmp_mut_info['Reference AA'])
                        if i in ixs
                    ]
                    somatic_aa = [
                        s for i, s in enumerate(tmp_mut_info['Somatic AA'])
                        if i in ixs
                    ]
                    codon_pos = [
                        c for i, c in enumerate(tmp_mut_info['Codon Pos'])
                        if i in ixs
                    ]
                    #tmp_non_silent = cutils.calc_non_silent_info(ref_aa,
                    #somatic_aa,
                    #codon_pos)
                    # get summary info
                    tmp_result = cutils.calc_summary_info(ref_aa,
                                                          somatic_aa,
                                                          codon_pos,
                                                          bed.gene_name,
                                                          opts['score_dir'],
                                                          min_frac=0.0,
                                                          min_recur=3)
                    if opts['score_dir']:
                        # drop three entries (two around index -4, then the
                        # last one); presumably so the row matches `cols`
                        # -- TODO confirm against calc_summary_info
                        tmp_result.pop(-4)
                        tmp_result.pop(-4)
                        tmp_result.pop(-1)
                    # update df
                    #obs_df.loc[tsamp,:] = obs_df.loc[tsamp,:] + np.array(tmp_non_silent)
                    obs_df.loc[
                        tsamp, :] = obs_df.loc[tsamp, :] + np.array(tmp_result)

            ## Do permutations
            # calculate non silent count
            #tmp_result = pm.non_silent_ratio_permutation(context_cts,
            #context_to_mutations,
            #sc,  # sequence context obj
            #gs,  # gene sequence obj
            #num_permutations)
            tmp_result = pm.summary_permutation(
                context_cts,
                context_to_mutations,
                sc,  # sequence context obj
                gs,  # gene sequence obj
                opts['score_dir'],
                num_permutations)
        else:
            # gene without mutations: all-zero placeholder rows wide enough
            # for the offset indexing below (14 or 10 columns)
            if opts['score_dir']:
                tmp_result = [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
                              for k in range(num_permutations)]
            else:
                tmp_result = [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
                              for k in range(num_permutations)]

        # increment the non-silent/silent counts for each permutation
        # the first `offset` columns of each permutation row are skipped
        offset = 3
        for j in range(num_permutations):
            result[j][0] += tmp_result[j][0 + offset]
            result[j][1] += tmp_result[j][1 + offset]
            result[j][2] += tmp_result[j][2 + offset]
            result[j][3] += tmp_result[j][3 + offset]
            result[j][4] += tmp_result[j][4 + offset]
            result[j][5] += tmp_result[j][5 + offset]
            result[j][6] += tmp_result[j][6 + offset]
            if opts['score_dir']:
                # the two score columns sit at indices 12 and 13 of the row
                result[j][7] += tmp_result[j][9 + offset]
                result[j][8] += tmp_result[j][10 + offset]

    gene_fa.close()
    if not opts['by_sample']:
        obs_result = [
            obs_non_silent, obs_silent, obs_nonsense, obs_loststop,
            obs_splice_site, obs_loststart, obs_missense
        ]
        if opts['score_dir']:
            obs_result.extend([obs_mga_entropy, obs_vest])
    else:
        obs_result = obs_df
    logger.info('Finished working on chromosome: {0}.'.format(current_chrom))
    return result, obs_result
Exemplo n.º 11
0
def calc_hotmaps_p_value(mut_info,
                         unmapped_mut_info,
                         sc,
                         gs,
                         bed,
                         window_size,
                         num_permutations,
                         stop_thresh,
                         report_index=False,
                         null_save_path=None):
    """Compute per-position windowed-sum p-values for one gene.

    Observed windowed sums come from ``utils.calc_windowed_sum`` and the
    p-values from ``pm.hotmaps_permutation``. `mut_info` is modified in
    place: 'Coding Position' is cast to int, a 'Context' column is added
    and, when `report_index` is true, a 'Codon Pos' column as well.

    Returns
    -------
    list of list
        one row per (window, codon position):
        ``[gene_name, window, position + 1, count, window_sum, p_value]``.
        With `report_index` the first index of `mut_info` found at that
        codon position is inserted after the position. An empty list is
        returned when `mut_info` is empty or no position counts exist.
    """
    # nothing to do without mutations
    if len(mut_info) == 0:
        return []

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(
        lambda pos: sc.pos2context[pos])

    # group mutations by context
    wanted = ['Context', 'Tumor_Allele']
    unmapped_df = pd.DataFrame(unmapped_mut_info)
    pooled = pd.concat([mut_info[wanted], unmapped_df[wanted]])
    context_cts = pooled['Context'].value_counts()
    context_to_mutations = {
        ctx: grp['Tumor_Allele']
        for ctx, grp in pooled.groupby('Context')
    }

    # windowed sums for the mutations actually observed
    observed = mc.get_aa_mut_info(mut_info['Coding Position'],
                                  mut_info['Tumor_Allele'].tolist(),
                                  gs)
    all_codon_pos = observed['Codon Pos'] + unmapped_mut_info['Codon Pos']
    all_ref_aa = observed['Reference AA'] + unmapped_mut_info['Reference AA']
    all_som_aa = observed['Somatic AA'] + unmapped_mut_info['Somatic AA']
    pos_ct, window_sum_dict = utils.calc_windowed_sum(all_codon_pos,
                                                      all_ref_aa,
                                                      all_som_aa,
                                                      window_size)

    # no missense mutations
    if not pos_ct:
        return []

    # remember which rows of the original data frame hit each codon
    if report_index:
        mut_info['Codon Pos'] = observed['Codon Pos']
        pos2ix = mut_info.groupby('Codon Pos').groups

    # perform simulations to get p-value
    pval_dict = pm.hotmaps_permutation(window_sum_dict,
                                       context_cts,
                                       context_to_mutations,
                                       sc,  # sequence context obj
                                       gs,  # gene sequence obj
                                       window_size,
                                       num_permutations,
                                       stop_thresh,
                                       null_save_path=null_save_path)

    # NOTE: codon positions are 0-based internally, output is 1-based
    rows = []
    for win in window_sum_dict:
        win_sums = window_sum_dict[win]
        for pos in win_sums:
            row = [bed.gene_name, win, pos + 1]
            if report_index:
                row.append(pos2ix[pos][0])
            row.extend([pos_ct[pos], win_sums[pos], pval_dict[win][pos]])
            rows.append(row)
    return rows
Exemplo n.º 12
0
def calc_deleterious_p_value(mut_info,
                             unmapped_mut_info,
                             sc,
                             gs,
                             bed,
                             num_permutations,
                             stop_thresh,
                             del_threshold,
                             pseudo_count,
                             seed=None):
    """Compute the permutation p-value for the number of deleterious mutations.

    Parameters
    ----------
    mut_info : DataFrame-like
        mutations mappable to the reference transcript; indexed by the
        'Coding Position' and 'Tumor_Allele' columns. Modified in place
        ('Coding Position' cast to int, 'Context' column added).
    unmapped_mut_info : dict-like
        information for mutations that are NOT mappable; converted with
        ``pd.DataFrame`` and read through the 'Reference AA',
        'Somatic AA' and 'Codon Pos' keys.
    sc : SequenceContext
        provides ``pos2context`` and is forwarded to the permutation.
    gs : GeneSequence
        contains gene sequence
    bed : BedLine
        just used to return gene name
    num_permutations : int
        number of permutations forwarded to ``pm.deleterious_permutation``.
    stop_thresh : forwarded unchanged to ``pm.deleterious_permutation``.
    del_threshold : number
        the permutation test only runs when the observed deleterious count
        is at least this value.
    pseudo_count : forwarded unchanged to ``pm.deleterious_permutation``.
    seed : int (Default: None)
        currently unused by this function.

    Returns
    -------
    list
        ``[gene_name, num_del, del_p_value]``; `del_p_value` is None when
        there are no mutations or the threshold is not reached.
    """
    # without mutations there is no count and no test
    if len(mut_info) == 0:
        return [bed.gene_name, 0, None]

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(
        lambda pos: sc.pos2context[pos])

    # group mutations by context
    wanted = ['Context', 'Tumor_Allele']
    unmapped_df = pd.DataFrame(unmapped_mut_info)
    pooled = pd.concat([mut_info[wanted], unmapped_df[wanted]])
    context_cts = pooled['Context'].value_counts()
    context_to_mutations = {
        ctx: grp['Tumor_Allele']
        for ctx, grp in pooled.groupby('Context')
    }

    # count deleterious mutations among the observed ones
    observed = mc.get_aa_mut_info(mut_info['Coding Position'],
                                  mut_info['Tumor_Allele'].tolist(),
                                  gs)
    all_ref_aa = observed['Reference AA'] + unmapped_mut_info['Reference AA']
    all_som_aa = observed['Somatic AA'] + unmapped_mut_info['Somatic AA']
    all_codon_pos = observed['Codon Pos'] + unmapped_mut_info['Codon Pos']
    num_del = cutils.calc_deleterious_info(all_ref_aa, all_som_aa,
                                           all_codon_pos)

    # the permutation test is skipped unless the user-specified minimum
    # number of deleterious mutations is reached
    del_p_value = None
    if num_del >= del_threshold:
        del_p_value = pm.deleterious_permutation(num_del,
                                                 context_cts,
                                                 context_to_mutations,
                                                 sc,  # sequence context obj
                                                 gs,  # gene sequence obj
                                                 num_permutations,
                                                 stop_thresh,
                                                 pseudo_count)

    return [bed.gene_name, num_del, del_p_value]
Exemplo n.º 13
0
def effect_permutation(context_counts,
                       context_to_mut,
                       seq_context,
                       gene_seq,
                       num_permutations=10000,
                       pseudo_count=0):
    """Performs null-permutations for effect-based mutation statistics
    in a single gene.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        number of permutations to create for null
    pseudo_count : int, default: 0
        Pseudo-count for number of recurrent missense mutations for each
        permutation for the null distribution. Increasing pseudo_count
        makes the statistical test more stringent.

    Returns
    -------
    effect_entropy_list : list
        list of entropy of effect values under the null
    recur_list : list
        number of recurrent missense mutations
    inactivating_list : list
        number of inactivating mutations
    """
    # flatten the observed somatic bases, walking contexts in index order
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # get random positions determined by sequence context.
    # Series.items() is used because Series.iteritems() no longer exists
    # in pandas >= 2.0; both iterate (context, count) pairs.
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    # np.hstack needs a real sequence of arrays, hence the list
    # NOTE(review): presumably each array is (num_permutations x count) so
    # that every stacked row is one simulated gene -- confirm in random_pos
    tmp_mut_pos = np.hstack([pos_array for _, pos_array in tmp_contxt_pos])

    # calculate effect-based statistics as a result of random positions
    effect_entropy_list, recur_list, inactivating_list = [], [], []
    for row in tmp_mut_pos:
        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row,
                                          somatic_base,
                                          gene_seq)

        # calculate effect info for this simulation (is_obs=0: not observed)
        effect_info = cutils.calc_effect_info(tmp_mut_info['Codon Pos'],
                                              tmp_mut_info['Reference AA'],
                                              tmp_mut_info['Somatic AA'],
                                              pseudo_count=pseudo_count,
                                              is_obs=0)
        tmp_entropy, tmp_recur, tmp_inactivating = effect_info
        effect_entropy_list.append(tmp_entropy)
        recur_list.append(tmp_recur)
        inactivating_list.append(tmp_inactivating)

    return effect_entropy_list, recur_list, inactivating_list
Exemplo n.º 14
0
def hotmaps_permutation(obs_stat,
                        context_counts,
                        context_to_mut,
                        seq_context,
                        gene_seq,
                        window,
                        num_permutations=10000,
                        stop_criteria=100,
                        max_batch=25000,
                        null_save_path=None):
    """Performs null-permutations for position-based mutation statistics
    in a single gene.

    Parameters
    ----------
    obs_stat : dict
        maps each window size to a dictionary mapping codons to the
        observed sum of mutations in that window
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    window : iterable of int
        Window sizes to evaluate (number of codons to the left/right of a
        mutated position to consider). Iterated several times, so it must
        be a re-iterable container such as a list.
    num_permutations : int, default: 10000
        number of permutations to create for null
    stop_criteria : int
        stop after stop_criteria iterations are more significant
        then the observed statistic.
    max_batch : int
        maximum number of whole gene simulations to do at once.
        For large number of simulations holding a matrix of M x N,
        where M is the number of mutations and N is the number of simulations,
        can get quite large.
    null_save_path : str or None
        File path to save null distribution. If None, don't save it.
        The path is expanded with ``str.format(w)`` for every window size.

    Returns
    -------
    pvals : dict
        Maps each window size to a dictionary mapping the mutated codon
        position to the calculated p-value
    """
    # get contexts and somatic base
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # calculate the # of batches for simulations
    max_batch = min(num_permutations, max_batch)
    num_batches, remainder = divmod(num_permutations, max_batch)
    batch_sizes = [max_batch] * num_batches
    if remainder:
        batch_sizes += [remainder]

    # figure out which position has highest value
    max_key = {w: max(obs_stat[w], key=(lambda key: obs_stat[w][key]))
               for w in window}

    # setup null dist counts
    null_cts = {w: {k: 0 for k in obs_stat[w]}
                for w in window}

    # empirical null distribution (saved if file path provided)
    empirical_null = {w: {} for w in window}

    def reached_precision():
        # the most extreme observed position of every window has been
        # matched or exceeded by the null at least stop_criteria times
        return all(null_cts[w][max_key[w]] >= stop_criteria for w in window)

    num_sim = 0  # number of simulations
    for batch_size in batch_sizes:
        # stop iterations if reached sufficient precision
        if reached_precision():
            break

        # get random positions determined by sequence context.
        # Series.items() is used because Series.iteritems() no longer
        # exists in pandas >= 2.0; both iterate (context, count) pairs.
        tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                                batch_size)
        # np.hstack needs a real sequence of arrays, hence the list
        tmp_mut_pos = np.hstack([pos_array for _, pos_array in tmp_contxt_pos])

        # calculate position-based statistics as a result of random positions
        for row in tmp_mut_pos:
            # get info about mutations
            tmp_mut_info = mc.get_aa_mut_info(row,
                                              somatic_base,
                                              gene_seq)

            # calculate position info
            tmp_pos, tmp_sim = utils.calc_windowed_sum(tmp_mut_info['Codon Pos'],
                                                       tmp_mut_info['Reference AA'],
                                                       tmp_mut_info['Somatic AA'],
                                                       window)

            # update the counts when the empirical null passes the observed
            for tmp_w in tmp_sim:
                for tmp_key in tmp_sim[tmp_w]:
                    # get mutation count for simulation
                    val = tmp_sim[tmp_w][tmp_key]

                    # add to empirical null distribution
                    empirical_null[tmp_w].setdefault(val, 0)
                    empirical_null[tmp_w][val] += 1

                    # update counts used for p-value
                    for key in null_cts[tmp_w]:
                        if val >= obs_stat[tmp_w][key]:
                            null_cts[tmp_w][key] += 1

            # the denominator grows by the number of simulated positions,
            # not by one per simulated gene
            num_sim += len(tmp_pos)

            # stop iterations if reached sufficient precision; this only
            # leaves the current batch, the check at the top of the batch
            # loop then ends the simulation
            if reached_precision():
                break

    # calculate p-value from empirical null-distribution
    # NOTE(review): num_sim stays 0 (ZeroDivisionError below) when no
    # simulation reports any position -- confirm whether that can happen
    pvals = {w: {k: float(null_cts[w][k]) / (num_sim) for k in obs_stat[w]}
             for w in window}

    # save empirical distribution
    if null_save_path:
        for w in window:
            # cumulative tail of the null, from the largest count downwards
            output = [['mutation_count', 'p-value']]
            tmp_sum = 0
            for ct in sorted(empirical_null[w], reverse=True):
                tmp_sum += empirical_null[w][ct]
                output.append([ct, tmp_sum / float(num_sim)])
            # save output
            with open(null_save_path.format(w), 'w') as handle:
                mywriter = csv.writer(handle, delimiter='\t', lineterminator='\n')
                mywriter.writerows(output)

    return pvals
Exemplo n.º 15
0
def calc_position_p_value(mut_info,
                          unmapped_mut_info,
                          sc,
                          gs,
                          bed,
                          score_dir,
                          num_permutations,
                          stop_thresh,
                          pseudo_count,
                          min_recurrent,
                          min_fraction):
    """Computes the observed position statistics of one gene and their
    permutation-based p-values.

    Parameters
    ----------
    mut_info : pd.DataFrame
        mutations of the gene; the 'Coding Position' and 'Tumor_Allele'
        columns are read. The frame is modified in place: 'Coding Position'
        is cast to int and a 'Context' column is added.
    unmapped_mut_info : dict
        column-wise info for additional mutations; the 'Context',
        'Tumor_Allele', 'Codon Pos', 'Reference AA' and 'Somatic AA'
        entries are read and appended to those of mut_info.
    sc : SequenceContext
        provides pos2context and is forwarded to the permutation routine
    gs : GeneSequence
        gene sequence forwarded to the helper routines
    bed : BedLine
        only its gene_name is used here
    score_dir : str or None
        directory with VEST scores; if falsy no scores are loaded
    num_permutations, stop_thresh, pseudo_count
        forwarded unchanged to pm.position_permutation
    min_recurrent, min_fraction
        forwarded to cutils.calc_pos_info as min_recur / min_frac

    Returns
    -------
    result : list
        [gene name, number of recurrent mutations, position entropy,
        VEST score, entropy p-value, VEST p-value]
    """
    # nothing mapped to this gene: report neutral statistics right away
    if len(mut_info) == 0:
        return [bed.gene_name, 0, 0, 0.0, 1.0, 1.0]

    def _context_of(pos):
        return sc.pos2context[pos]

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(_context_of)

    # pool mapped and unmapped mutations, then group them by context
    shared_cols = ['Context', 'Tumor_Allele']
    pooled = pd.concat([mut_info[shared_cols],
                        pd.DataFrame(unmapped_mut_info)[shared_cols]])
    context_cts = pooled['Context'].value_counts()
    context_to_mutations = {ctx: grp['Tumor_Allele']
                            for ctx, grp in pooled.groupby('Context')}

    # VEST scores are optional and only looked up when a directory is given
    gene_vest = None
    if score_dir:
        gene_vest = scores.read_vest_pickle(bed.gene_name, score_dir)
        if gene_vest is None:
            logger.warning('Could not find VEST scores for {0}, skipping . . .'.format(bed.gene_name))

    # amino acid consequences of the mapped mutations
    aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                     mut_info['Tumor_Allele'].tolist(),
                                     gs)

    # recurrence statistics use mapped plus unmapped mutations
    all_codon_pos = aa_mut_info['Codon Pos'] + unmapped_mut_info['Codon Pos']
    all_ref_aa = aa_mut_info['Reference AA'] + unmapped_mut_info['Reference AA']
    all_somatic_aa = aa_mut_info['Somatic AA'] + unmapped_mut_info['Somatic AA']
    pos_stats = cutils.calc_pos_info(all_codon_pos,
                                     all_ref_aa,
                                     all_somatic_aa,
                                     min_frac=min_fraction,
                                     min_recur=min_recurrent)
    num_recurrent, pos_ent, delta_pos_ent, _ = pos_stats

    # the VEST statistic only considers the mapped mutations
    vest_score = scores.compute_vest_stat(gene_vest,
                                          aa_mut_info['Reference AA'],
                                          aa_mut_info['Somatic AA'],
                                          aa_mut_info['Codon Pos'])

    # simulate the null to obtain both p-values
    ent_p_value, vest_p_value = pm.position_permutation(
        (num_recurrent, pos_ent, delta_pos_ent, vest_score),
        context_cts,
        context_to_mutations,
        sc,  # sequence context obj
        gs,  # gene sequence obj
        gene_vest,
        num_permutations,
        stop_thresh,
        pseudo_count)

    return [bed.gene_name, num_recurrent, pos_ent, vest_score,
            ent_p_value, vest_p_value]
Exemplo n.º 16
0
def maf_permutation(context_counts,
                    context_to_mut,
                    seq_context,
                    gene_seq,
                    num_permutations=10000,
                    drop_silent=False):
    """Performs null-permutations across all genes and records the results in
    a format like a MAF file. This could be useful for examining the null
    permutations because the alternative approaches always summarize the results.
    With the simulated null-permutations, novel metrics can be applied to create
    an empirical null-distribution.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        number of permutations to create for null
    drop_silent : bool, default=False
        Flag on whether to drop all silent mutations. Some data sources
        do not report silent mutations, and the simulations should match this.

    Returns
    -------
    maf_list : list of lists
        list of null mutations with mutation info in a MAF like format:
        gene name, strand, chromosome, start and end genome coordinate,
        reference base, somatic base, context, DNA change, protein change
        and variant classification. Empty if there are no observed
        somatic bases to permute.
    """
    mycontexts = context_counts.index.tolist()
    base_pairs = [(base, one_context)
                  for one_context in mycontexts
                  for base in context_to_mut[one_context]]
    if not base_pairs:
        # zip(*[]) cannot be unpacked into two names; without observed
        # mutations there is nothing to simulate
        return []
    somatic_base, base_context = zip(*base_pairs)

    # get random positions determined by sequence context.
    # Series.items() is used because Series.iteritems() no longer exists
    # in pandas >= 2.0; both iterate (context, count) pairs.
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    # np.hstack needs a real sequence of arrays; passing a generator is
    # deprecated since NumPy 1.16 and rejected by newer releases
    tmp_mut_pos = np.hstack([pos_array for _, pos_array in tmp_contxt_pos])

    # info about gene
    gene_name = gene_seq.bed.gene_name
    strand = gene_seq.bed.strand
    chrom = gene_seq.bed.chrom
    gene_seq.bed.init_genome_coordinates()  # map seq pos to genome

    # sequence position -> genome coordinate (+1 as in the original code);
    # built once because it does not depend on the simulation
    pos2genome = np.vectorize(lambda x: gene_seq.bed.seqpos2genome[x]+1)

    # determine result of random positions
    maf_list = []
    for row in tmp_mut_pos:
        # get genome coordinate
        genome_coord = pos2genome(row)

        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row,
                                          somatic_base,
                                          gene_seq)

        # get string describing variant
        var_class = cutils.get_variant_classification(tmp_mut_info['Reference AA'],
                                                      tmp_mut_info['Somatic AA'],
                                                      tmp_mut_info['Codon Pos'])

        # prepare output
        for k, mysomatic_base in enumerate(somatic_base):
            # variant classes are decoded once per mutation
            var_label = var_class[k].decode()
            if drop_silent and var_label == 'Silent':
                continue

            # format DNA change
            # NOTE(review): nuc_pos is the raw simulated position, whereas
            # the genome coordinate gets +1 -- confirm the intended base
            ref_nuc = tmp_mut_info['Reference Nuc'][k]
            nuc_pos = row[k]
            dna_change = 'c.{0}{1}>{2}'.format(ref_nuc, nuc_pos, mysomatic_base)

            # format protein change
            ref_aa = tmp_mut_info['Reference AA'][k]
            somatic_aa = tmp_mut_info['Somatic AA'][k]
            codon_pos = tmp_mut_info['Codon Pos'][k]
            protein_change = 'p.{0}{1}{2}'.format(ref_aa, codon_pos, somatic_aa)

            # reverse complement if on negative strand (done after the
            # DNA change string was built, so that string is unaffected)
            if strand == '-':
                ref_nuc = utils.rev_comp(ref_nuc)
                mysomatic_base = utils.rev_comp(mysomatic_base)

            # append results
            maf_line = [gene_name, strand, chrom, genome_coord[k], genome_coord[k],
                        ref_nuc, mysomatic_base, base_context[k], dna_change,
                        protein_change, var_label]
            maf_list.append(maf_line)

    return maf_list
Exemplo n.º 17
0
def summary_permutation(context_counts,
                        context_to_mut,
                        seq_context,
                        gene_seq,
                        score_dir,
                        num_permutations=10000,
                        min_frac=0.0,
                        min_recur=2,
                        drop_silent=False):
    """Performs null-permutations and summarizes the results as features over
    the gene.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    score_dir : str
        passed through unchanged to cutils.calc_summary_info
    num_permutations : int, default: 10000
        number of permutations to create for null
    min_frac : float, default: 0.0
        passed through to cutils.calc_summary_info as min_frac
    min_recur : int, default: 2
        passed through to cutils.calc_summary_info as min_recur
    drop_silent : bool, default=False
        Flag on whether to drop all silent mutations. Some data sources
        do not report silent mutations, and the simulations should match this.
        When set, the silent mutation count of every summary is zeroed.

    Returns
    -------
    summary_info_list : list of lists
        list of non-silent and silent mutation counts under the null along
        with information on recurrent missense counts and missense positional
        entropy. Every entry starts with the gene name, the 1-based
        simulation number and the CDS length of the gene.
    """
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # get random positions determined by sequence context.
    # Series.items() is used because Series.iteritems() no longer exists
    # in pandas >= 2.0; both iterate (context, count) pairs.
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    # np.hstack needs a real sequence of arrays; passing a generator is
    # deprecated since NumPy 1.16 and rejected by newer releases
    tmp_mut_pos = np.hstack([pos_array for _, pos_array in tmp_contxt_pos])

    # determine result of random positions
    gene_name = gene_seq.bed.gene_name
    gene_len = gene_seq.bed.cds_len
    summary_info_list = []
    for sim_num, row in enumerate(tmp_mut_pos, 1):
        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row,
                                          somatic_base,
                                          gene_seq)

        # Get all metrics summarizing each gene
        tmp_summary = cutils.calc_summary_info(tmp_mut_info['Reference AA'],
                                               tmp_mut_info['Somatic AA'],
                                               tmp_mut_info['Codon Pos'],
                                               gene_name,
                                               score_dir,
                                               min_frac=min_frac,
                                               min_recur=min_recur)

        # drop silent if needed
        if drop_silent:
            # silent mutation count is index 1
            tmp_summary[1] = 0

        summary_info_list.append([gene_name, sim_num, gene_len] + tmp_summary)
    return summary_info_list
Exemplo n.º 18
0
def effect_permutation(context_counts,
                       context_to_mut,
                       seq_context,
                       gene_seq,
                       num_permutations=10000,
                       pseudo_count=0):
    """Performs null-permutations for effect-based mutation statistics
    in a single gene.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        number of permutations to create for null
    pseudo_count : int, default: 0
        Pseudo-count for number of recurrent missense mutations for each
        permutation for the null distribution. Increasing pseudo_count
        makes the statistical test more stringent.

    Returns
    -------
    effect_entropy_list : list
        list of entropy of effect values under the null
    recur_list : list
        number of recurrent missense mutations
    inactivating_list : list
        number of inactivating mutations
    """
    # flatten the observed somatic bases, walking contexts in index order
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # get random positions determined by sequence context.
    # Series.items() is used because Series.iteritems() no longer exists
    # in pandas >= 2.0; both iterate (context, count) pairs.
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    # np.hstack needs a real sequence of arrays; passing a generator is
    # deprecated since NumPy 1.16 and rejected by newer releases
    tmp_mut_pos = np.hstack([pos_array for _, pos_array in tmp_contxt_pos])

    # calculate effect-based statistics as a result of random positions
    effect_entropy_list, recur_list, inactivating_list = [], [], []
    for row in tmp_mut_pos:
        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row,
                                          somatic_base,
                                          gene_seq)

        # calculate effect info for this simulation (is_obs=0: not observed)
        effect_info = cutils.calc_effect_info(tmp_mut_info['Codon Pos'],
                                              tmp_mut_info['Reference AA'],
                                              tmp_mut_info['Somatic AA'],
                                              pseudo_count=pseudo_count,
                                              is_obs=0)
        tmp_entropy, tmp_recur, tmp_inactivating = effect_info
        effect_entropy_list.append(tmp_entropy)
        recur_list.append(tmp_recur)
        inactivating_list.append(tmp_inactivating)

    return effect_entropy_list, recur_list, inactivating_list
Exemplo n.º 19
0
def protein_permutation(graph_score,
                        num_codons_obs,
                        context_counts,
                        context_to_mut,
                        seq_context,
                        gene_seq,
                        gene_graph,
                        num_permutations=10000,
                        stop_criteria=100,
                        pseudo_count=0):
    """Performs null-simulations for position-based mutation statistics
    in a single gene.

    The first ``stop_criteria - 1`` simulations serve as a burn-in from which
    the expected relative increase in graph coverage is estimated. Once that
    estimate exists, the observed statistic and every simulated statistic are
    normalized with it and compared.

    Parameters
    ----------
    graph_score : float
        clustering score for observed data
    num_codons_obs : int
        number of codons with missense mutation in observed data
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    gene_graph
        neighbor graph of the gene, passed to scores.compute_ng_stat
    num_permutations : int, default: 10000
        number of permutations to create for null
    stop_criteria : int
        stop after stop_criteria iterations are more significant
        then the observed statistic.
    pseudo_count : int, default: 0
        passed through to cutils.calc_pos_info

    Returns
    -------
    protein_pval : float
        p-value for clustering in neighbor graph constructure from protein
        structures
    obs_stat : float
        observed statistic after normalization (1.0 if num_codons_obs is 0)

    Raises
    ------
    ValueError
        if fewer than stop_criteria simulations were available, so the
        statistic could never be calibrated. Previously this surfaced as an
        UnboundLocalError on the return statement.
    """
    # get contexts and somatic base
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # get random positions determined by sequence context.
    # Series.items() is used because Series.iteritems() no longer exists
    # in pandas >= 2.0; both iterate (context, count) pairs.
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    # np.hstack needs a real sequence of arrays; passing a generator is
    # deprecated since NumPy 1.16 and rejected by newer releases
    tmp_mut_pos = np.hstack([pos_array for _, pos_array in tmp_contxt_pos])

    # calculate position-based statistics as a result of random positions
    burn_in = stop_criteria - 1  # simulations used only for calibration
    null_graph_entropy_ct = 0
    num_sim = 0
    obs_stat = None
    exp_rel_inc = None
    coverage_list = []
    num_mut_list = []
    graph_entropy_list = []
    for i, row in enumerate(tmp_mut_pos):
        # calculate the expected value of the relative increase in coverage
        if i == burn_in:
            rel_inc = [coverage_list[k] / float(num_mut_list[k])
                       for k in range(burn_in)
                       if coverage_list[k]]
            exp_rel_inc = np.mean(rel_inc)

            # calculate observed statistic
            if num_codons_obs:
                obs_stat = graph_score / np.log2(exp_rel_inc*num_codons_obs)
            else:
                obs_stat = 1.0

            # calculate statistics for the burn-in simulations
            sim_stat_list = [ent / np.log2(exp_rel_inc*num_mut_list[l])
                             for l, ent in enumerate(graph_entropy_list)]
            null_graph_entropy_ct = len([s for s in sim_stat_list
                                         if s-utils.epsilon <= obs_stat])

        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row,
                                          somatic_base,
                                          gene_seq)

        # calculate position info
        tmp_tuple = cutils.calc_pos_info(tmp_mut_info['Codon Pos'],
                                         tmp_mut_info['Reference AA'],
                                         tmp_mut_info['Somatic AA'],
                                         pseudo_count=pseudo_count,
                                         is_obs=0)
        _, _, _, tmp_pos_ct = tmp_tuple

        # number of mutated codons of THIS simulation; it used to be
        # refreshed only during burn-in, so later simulations were
        # normalized with a stale count from an earlier simulation
        tmp_num_mut_codons = len(tmp_pos_ct)

        # get entropy on graph-smoothed probability distribution
        tmp_graph_entropy, tmp_coverage = scores.compute_ng_stat(gene_graph, tmp_pos_ct)
        num_sim = i + 1

        if i < burn_in:
            # record the burn-in sample for the calibration step
            num_mut_list.append(tmp_num_mut_codons)
            coverage_list.append(tmp_coverage)
            graph_entropy_list.append(tmp_graph_entropy)
        else:
            # update empirical null distribution counts. This includes the
            # simulation at i == burn_in, which was previously computed but
            # never counted although it is part of the p-value denominator.
            if tmp_num_mut_codons:
                sim_stat = tmp_graph_entropy / np.log2(exp_rel_inc*tmp_num_mut_codons)
            else:
                sim_stat = 1.0

            # add count
            if sim_stat-utils.epsilon <= obs_stat:
                null_graph_entropy_ct += 1

        # stop iterations if reached sufficient precision
        if null_graph_entropy_ct >= stop_criteria:
            break

    if obs_stat is None:
        raise ValueError('protein_permutation requires at least stop_criteria '
                         '({0}) simulations to calibrate the statistic, but '
                         'only {1} were run'.format(stop_criteria, num_sim))

    # calculate p-value from empirical null-distribution
    protein_pval = float(null_graph_entropy_ct) / num_sim

    return protein_pval, obs_stat
Exemplo n.º 20
0
def hotmaps_permutation(obs_stat,
                        context_counts,
                        context_to_mut,
                        seq_context,
                        gene_seq,
                        window,
                        num_permutations=10000,
                        stop_criteria=100,
                        max_batch=25000,
                        null_save_path=None):
    """Performs null-permutations for position-based mutation statistics
    in a single gene.

    Parameters
    ----------
    obs_stat : dict
        maps each window size to a dictionary mapping codons to the
        observed sum of mutations in that window
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    window : iterable of int
        Window sizes to evaluate (number of codons to the left/right of a
        mutated position to consider). Iterated several times, so it must
        be a re-iterable container such as a list.
    num_permutations : int, default: 10000
        number of permutations to create for null
    stop_criteria : int
        stop after stop_criteria iterations are more significant
        then the observed statistic.
    max_batch : int
        maximum number of whole gene simulations to do at once.
        For large number of simulations holding a matrix of M x N,
        where M is the number of mutations and N is the number of simulations,
        can get quite large.
    null_save_path : str or None
        File path to save null distribution. If None, don't save it.
        The path is expanded with ``str.format(w)`` for every window size.

    Returns
    -------
    pvals : dict
        Maps each window size to a dictionary mapping the mutated codon
        position to the calculated p-value
    """
    # get contexts and somatic base
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # calculate the # of batches for simulations
    max_batch = min(num_permutations, max_batch)
    num_batches, remainder = divmod(num_permutations, max_batch)
    batch_sizes = [max_batch] * num_batches
    if remainder:
        batch_sizes += [remainder]

    # figure out which position has highest value
    max_key = {w: max(obs_stat[w], key=(lambda key: obs_stat[w][key]))
               for w in window}

    # setup null dist counts
    null_cts = {w: {k: 0 for k in obs_stat[w]}
                for w in window}

    # empirical null distribution (saved if file path provided)
    empirical_null = {w: {} for w in window}

    def reached_precision():
        # the most extreme observed position of every window has been
        # matched or exceeded by the null at least stop_criteria times
        return all(null_cts[w][max_key[w]] >= stop_criteria for w in window)

    num_sim = 0  # number of simulations
    for batch_size in batch_sizes:
        # stop iterations if reached sufficient precision
        if reached_precision():
            break

        # get random positions determined by sequence context.
        # Series.items() is used because Series.iteritems() no longer
        # exists in pandas >= 2.0; both iterate (context, count) pairs.
        tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                                batch_size)
        # np.hstack needs a real sequence of arrays; passing a generator is
        # deprecated since NumPy 1.16 and rejected by newer releases
        tmp_mut_pos = np.hstack([pos_array for _, pos_array in tmp_contxt_pos])

        # calculate position-based statistics as a result of random positions
        for row in tmp_mut_pos:
            # get info about mutations
            tmp_mut_info = mc.get_aa_mut_info(row,
                                              somatic_base,
                                              gene_seq)

            # calculate position info
            tmp_pos, tmp_sim = utils.calc_windowed_sum(tmp_mut_info['Codon Pos'],
                                                       tmp_mut_info['Reference AA'],
                                                       tmp_mut_info['Somatic AA'],
                                                       window)

            # update the counts when the empirical null passes the observed
            for tmp_w in tmp_sim:
                for tmp_key in tmp_sim[tmp_w]:
                    # get mutation count for simulation
                    val = tmp_sim[tmp_w][tmp_key]

                    # add to empirical null distribution
                    empirical_null[tmp_w].setdefault(val, 0)
                    empirical_null[tmp_w][val] += 1

                    # update counts used for p-value
                    for key in null_cts[tmp_w]:
                        if val >= obs_stat[tmp_w][key]:
                            null_cts[tmp_w][key] += 1

            # the denominator grows by the number of simulated positions,
            # not by one per simulated gene
            num_sim += len(tmp_pos)

            # stop iterations if reached sufficient precision; this only
            # leaves the current batch, the check at the top of the batch
            # loop then ends the simulation
            if reached_precision():
                break

    # calculate p-value from empirical null-distribution
    # NOTE(review): num_sim stays 0 (ZeroDivisionError below) when no
    # simulation reports any position -- confirm whether that can happen
    pvals = {w: {k: float(null_cts[w][k]) / (num_sim) for k in obs_stat[w]}
             for w in window}

    # save empirical distribution
    if null_save_path:
        for w in window:
            # cumulative tail of the null, from the largest count downwards
            output = [['mutation_count', 'p-value']]
            tmp_sum = 0
            for ct in sorted(empirical_null[w], reverse=True):
                tmp_sum += empirical_null[w][ct]
                output.append([ct, tmp_sum / float(num_sim)])
            # save output
            with open(null_save_path.format(w), 'w') as handle:
                mywriter = csv.writer(handle, delimiter='\t', lineterminator='\n')
                mywriter.writerows(output)

    return pvals
def singleprocess_permutation(info):
    """Run the summary permutation test for all genes of one chromosome.

    Parameters
    ----------
    info : tuple
        ``(bed_list, mut_df, opts)``. ``bed_list`` holds the BED entries of
        the genes to process (the chromosome name is read from the first
        entry), ``mut_df`` is the mutation data frame handed to
        ``mc.compute_mutation_context`` and ``opts`` is the options dict.
        Keys read here: 'num_permutations', 'input', 'context',
        'by_sample' and 'score_dir'.

    Returns
    -------
    result : list of lists
        One row per permutation with the simulated statistics summed over
        all genes. Columns 0-6 are taken from columns 3-9 of the rows
        returned by ``pm.summary_permutation``; when ``opts['score_dir']``
        is set, columns 7-8 are taken from columns 12-13.
    obs_result : list or pd.DataFrame
        Observed statistics. Without 'by_sample' this is the list
        [non-silent, silent, nonsense, lost stop, splice site, lost start,
        missense] (plus [MGA entropy, VEST] when 'score_dir' is set),
        summed over all genes. With 'by_sample' it is a data frame indexed
        by tumor sample.
    """
    bed_list, mut_df, opts = info
    current_chrom = bed_list[0].chrom
    logger.info('Working on chromosome: {0} . . .'.format(current_chrom))
    num_permutations = opts['num_permutations']
    by_sample = opts['by_sample']
    score_dir = opts['score_dir']

    num_counts = 7  # number of count statistics always reported
    offset = 3  # leading columns of each simulated row that are skipped

    gene_fa = pysam.Fastafile(opts['input'])
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # variables for recording the actual observed number of non-silent
        # vs. silent mutations
        if not by_sample:
            # order: non-silent, silent, nonsense, lost stop, splice site,
            # lost start, missense
            obs_counts = [0] * num_counts
            obs_vest = 0
            obs_mga_entropy = 0
        else:
            uniq_samp = mut_df['Tumor_Sample'].unique()
            # NOTE(review): `cols` is not defined in this function; it is
            # expected to exist at module level -- confirm.
            obs_df = pd.DataFrame(np.zeros((len(uniq_samp), len(cols))),
                                  index=uniq_samp, columns=cols)

        # go through each gene to perform the simulation
        num_result_cols = 9 if score_dir else num_counts
        result = [[0] * num_result_cols for _ in range(num_permutations)]
        for bed in bed_list:
            # compute context counts and somatic bases for each context
            gene_tuple = mc.compute_mutation_context(bed, gs, mut_df, opts)
            context_cts, context_to_mutations, mutations_df, gs, sc = gene_tuple

            if context_to_mutations:
                # get info about the observed mutations
                tmp_mut_info = mc.get_aa_mut_info(mutations_df['Coding Position'],
                                                  mutations_df['Tumor_Allele'].tolist(),
                                                  gs)
                if not by_sample:
                    # summarize observed mutations; the thresholds are
                    # fixed here rather than read from opts
                    tmp_result = cutils.calc_summary_info(tmp_mut_info['Reference AA'],
                                                          tmp_mut_info['Somatic AA'],
                                                          tmp_mut_info['Codon Pos'],
                                                          bed.gene_name,
                                                          score_dir,
                                                          min_frac=0.0,
                                                          min_recur=3)
                    for idx in range(num_counts):
                        obs_counts[idx] += tmp_result[idx]
                    if score_dir:
                        obs_vest += tmp_result[-2]
                        obs_mga_entropy += tmp_result[-3]
                else:
                    for tsamp in mutations_df['Tumor_Sample'].unique():
                        ixs = np.where(mutations_df['Tumor_Sample'] == tsamp)[0]
                        # set gives O(1) membership instead of scanning the
                        # index array for every mutation
                        ix_set = set(ixs.tolist())
                        ref_aa = [r for i, r in enumerate(tmp_mut_info['Reference AA'])
                                  if i in ix_set]
                        somatic_aa = [s for i, s in enumerate(tmp_mut_info['Somatic AA'])
                                      if i in ix_set]
                        codon_pos = [c for i, c in enumerate(tmp_mut_info['Codon Pos'])
                                     if i in ix_set]
                        # get summary info for this sample only
                        tmp_result = cutils.calc_summary_info(ref_aa,
                                                              somatic_aa,
                                                              codon_pos,
                                                              bed.gene_name,
                                                              score_dir,
                                                              min_frac=0.0,
                                                              min_recur=3)
                        if score_dir:
                            # drop the entries that are not tracked per sample
                            tmp_result.pop(-4)
                            tmp_result.pop(-4)
                            tmp_result.pop(-1)
                        # update df
                        obs_df.loc[tsamp, :] = obs_df.loc[tsamp, :] + np.array(tmp_result)

                # do permutations
                tmp_result = pm.summary_permutation(context_cts,
                                                    context_to_mutations,
                                                    sc,  # sequence context obj
                                                    gs,  # gene sequence obj
                                                    score_dir,
                                                    num_permutations)
            else:
                # gene without usable mutations contributes all-zero rows
                num_sim_cols = 14 if score_dir else 10
                tmp_result = [[0] * num_sim_cols for _ in range(num_permutations)]

            # increment the non-silent/silent counts for each permutation
            for j in range(num_permutations):
                sim_row = tmp_result[j]
                total_row = result[j]
                for idx in range(num_counts):
                    total_row[idx] += sim_row[idx + offset]
                if score_dir:
                    total_row[7] += sim_row[9 + offset]
                    total_row[8] += sim_row[10 + offset]
    finally:
        # always release the FASTA handle, even if a gene fails
        gene_fa.close()

    if not by_sample:
        obs_result = list(obs_counts)
        if score_dir:
            obs_result.extend([obs_mga_entropy, obs_vest])
    else:
        obs_result = obs_df
    logger.info('Finished working on chromosome: {0}.'.format(current_chrom))
    return result, obs_result
Exemplo n.º 22
0
def calc_hotmaps_p_value(mut_info,
                         unmapped_mut_info,
                         sc,
                         gs,
                         bed,
                         window_size,
                         num_permutations,
                         stop_thresh,
                         report_index=False):
    """Compute windowed hotspot p-values for the mutated codons of a gene.

    Parameters
    ----------
    mut_info : pd.DataFrame
        Mutations mapped to the transcript; the 'Coding Position' and
        'Tumor_Allele' columns are read. The frame is modified in place:
        'Coding Position' is cast to int, a 'Context' column is added and,
        when `report_index` is set, a 'Codon Pos' column as well.
    unmapped_mut_info : dict
        Info for mutations not mapped to the transcript. The keys
        'Context', 'Tumor_Allele', 'Codon Pos', 'Reference AA' and
        'Somatic AA' are read (presumably lists of equal length).
    sc : SequenceContext
        Provides `pos2context`; forwarded to the permutation routine.
    gs : GeneSequence
        Gene sequence object forwarded to the helpers.
    bed : BedLine
        Only used for the gene name.
    window_size
        Forwarded to `utils.calc_windowed_sum` and the permutation routine.
    num_permutations, stop_thresh
        Forwarded to `pm.hotmaps_permutation`.
    report_index : bool, default False
        If True, each output row also carries the first `mut_info` index
        label found for that codon position.

    Returns
    -------
    list of lists
        One row per key of the windowed sums: gene name, codon position
        (1-based), optionally the mutation index, position count, window
        sum and p-value. Empty when there are no mapped mutations or no
        counted positions.
    """
    # without mapped mutations there is nothing to report
    if len(mut_info) == 0:
        return []

    def _context_of(pos):
        return sc.pos2context[pos]

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(_context_of)

    # pool mapped and unmapped mutations, then group them by context
    cols = ['Context', 'Tumor_Allele']
    unmapped_mut_df = pd.DataFrame(unmapped_mut_info)
    tmp_df = pd.concat([mut_info[cols], unmapped_mut_df[cols]])
    context_cts = tmp_df['Context'].value_counts()
    context_to_mutations = {
        name: group['Tumor_Allele']
        for name, group in tmp_df.groupby('Context')
    }

    # amino acid consequences of the observed mutations
    aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                     mut_info['Tumor_Allele'].tolist(), gs)
    codon_pos = aa_mut_info['Codon Pos'] + unmapped_mut_info['Codon Pos']
    ref_aa = aa_mut_info['Reference AA'] + unmapped_mut_info['Reference AA']
    somatic_aa = aa_mut_info['Somatic AA'] + unmapped_mut_info['Somatic AA']
    pos_ct, window_sum_dict = utils.calc_windowed_sum(
        codon_pos, ref_aa, somatic_aa, window_size)

    # nothing was counted, so there is no statistic to test
    if not pos_ct:
        return []

    # map codon position -> index labels in the original data frame
    if report_index:
        mut_info['Codon Pos'] = aa_mut_info['Codon Pos']
        pos2ix = mut_info.groupby('Codon Pos').groups

    # simulate to obtain the p-values
    pval_dict = pm.hotmaps_permutation(
        window_sum_dict,
        context_cts,
        context_to_mutations,
        sc,  # sequence context obj
        gs,  # gene sequence obj
        window_size,
        num_permutations,
        stop_thresh)

    # NOTE: codon positions are 0-based internally; report them 1-based
    result = []
    for k in window_sum_dict:
        row = [bed.gene_name, k + 1]
        if report_index:
            row.append(pos2ix[k][0])
        row.extend([pos_ct[k], window_sum_dict[k], pval_dict[k]])
        result.append(row)
    return result
Exemplo n.º 23
0
def calc_protein_p_value(mut_info, unmapped_mut_info, sc, gs, bed, graph_dir,
                         num_permutations, stop_thresh, min_recurrent,
                         min_fraction):
    """Computes the p-value for clustering on a neighbor graph composed
    of codons connected with edges if they are spatially near in 3D protein
    structure.

    Parameters
    ----------
    mut_info : pd.DataFrame
        Mutations mapped to the transcript; the 'Coding Position' and
        'Tumor_Allele' columns are read. Modified in place ('Coding
        Position' cast to int, 'Context' column added).
    unmapped_mut_info : dict
        Info for mutations not mapped to the transcript. The keys
        'Context', 'Tumor_Allele', 'Codon Pos', 'Reference AA' and
        'Somatic AA' are read.
    sc : SequenceContext
        Provides `pos2context`; forwarded to the permutation routine.
    gs : GeneSequence
        Gene sequence object forwarded to the helpers.
    bed : BedLine
        Only used for the gene name.
    graph_dir : str
        Directory passed to `scores.read_neighbor_graph_pickle`; if falsy
        no graph is loaded.
    num_permutations, stop_thresh
        Forwarded to `pm.protein_permutation`.
    min_recurrent, min_fraction
        Forwarded to `cutils.calc_pos_info` as `min_recur` / `min_frac`.

    Returns
    -------
    list
        [gene name, number of recurrent positions, normalized graph score,
        p-value]. The score is 0.0 and the p-value 1.0 when there are no
        mutations or when scoring/permutation raises.
    """
    # no mutations: report the neutral default values
    if len(mut_info) == 0:
        return [bed.gene_name, 0, 0.0, 1.0]

    def _context_of(pos):
        return sc.pos2context[pos]

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(_context_of)

    # pool mapped and unmapped mutations, then group them by context
    cols = ['Context', 'Tumor_Allele']
    unmapped_mut_df = pd.DataFrame(unmapped_mut_info)
    tmp_df = pd.concat([mut_info[cols], unmapped_mut_df[cols]])
    context_cts = tmp_df['Context'].value_counts()
    context_to_mutations = {
        name: group['Tumor_Allele']
        for name, group in tmp_df.groupby('Context')
    }

    # load the neighbor graph of the gene if a directory was provided
    gene_graph = None
    if graph_dir:
        gene_graph = scores.read_neighbor_graph_pickle(
            bed.gene_name, graph_dir)
        if gene_graph is None:
            logger.warning(
                'Could not find neighbor graph for {0}, skipping . . .'.
                format(bed.gene_name))

    # positional info for the observed mutations
    aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                     mut_info['Tumor_Allele'].tolist(), gs)
    codon_pos = aa_mut_info['Codon Pos'] + unmapped_mut_info['Codon Pos']
    ref_aa = aa_mut_info['Reference AA'] + unmapped_mut_info['Reference AA']
    somatic_aa = aa_mut_info['Somatic AA'] + unmapped_mut_info['Somatic AA']
    num_recurrent, _, _, pos_ct = cutils.calc_pos_info(
        codon_pos,
        ref_aa,
        somatic_aa,
        min_frac=min_fraction,
        min_recur=min_recurrent)

    try:
        # graph statistic for the observed mutations
        graph_score, _ = scores.compute_ng_stat(gene_graph, pos_ct)

        # perform simulations to get p-value
        protein_p_value, norm_graph_score = pm.protein_permutation(
            graph_score,
            len(pos_ct),
            context_cts,
            context_to_mutations,
            sc,  # sequence context obj
            gs,  # gene sequence obj
            gene_graph,
            num_permutations,
            stop_thresh)
    except Exception:
        # any failure while scoring or simulating falls back to the
        # neutral result and is logged as a codon numbering problem
        norm_graph_score = 0.0
        protein_p_value = 1.0
        logger.warning('Codon numbering problem with ' + bed.gene_name)

    return [bed.gene_name, num_recurrent, norm_graph_score, protein_p_value]
Exemplo n.º 24
0
def summary_permutation(context_counts,
                        context_to_mut,
                        seq_context,
                        gene_seq,
                        score_dir,
                        num_permutations=10000,
                        min_frac=0.0,
                        min_recur=2,
                        drop_silent=False):
    """Performs null-permutations and summarizes the results as features over
    the gene.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    score_dir : str
        forwarded unchanged to ``cutils.calc_summary_info``
    num_permutations : int, default: 10000
        number of permutations to create for null
    min_frac : float, default: 0.0
        forwarded to ``cutils.calc_summary_info``
    min_recur : int, default: 2
        forwarded to ``cutils.calc_summary_info``
    drop_silent : bool, default=False
        Flag on whether to drop all silent mutations. Some data sources
        do not report silent mutations, and the simulations should match this.

    Returns
    -------
    summary_info_list : list of lists
        One row per permutation: gene name, 1-based permutation number and
        CDS length, followed by the values returned by
        ``cutils.calc_summary_info`` for that permutation.
    """
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # get random positions determined by sequence context
    # (Series.items() is used because Series.iteritems() no longer exists
    # in pandas >= 2.0)
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    tmp_mut_pos = np.hstack([pos_array for base, pos_array in tmp_contxt_pos])

    # determine result of random positions
    gene_name = gene_seq.bed.gene_name
    gene_len = gene_seq.bed.cds_len
    summary_info_list = []
    for i, row in enumerate(tmp_mut_pos):
        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row,
                                          somatic_base,
                                          gene_seq)

        # Get all metrics summarizing each gene
        tmp_summary = cutils.calc_summary_info(tmp_mut_info['Reference AA'],
                                               tmp_mut_info['Somatic AA'],
                                               tmp_mut_info['Codon Pos'],
                                               gene_name,
                                               score_dir,
                                               min_frac=min_frac,
                                               min_recur=min_recur)

        # drop silent if needed
        if drop_silent:
            # silent mutation count is index 1
            tmp_summary[1] = 0

        summary_info_list.append([gene_name, i+1, gene_len]+tmp_summary)
    return summary_info_list
Exemplo n.º 25
0
def calc_deleterious_p_value(mut_info,
                             unmapped_mut_info,
                             sc,
                             gs,
                             bed,
                             num_permutations,
                             stop_thresh,
                             del_threshold,
                             pseudo_count,
                             seed=None):
    """Calculates the p-value for the number of inactivating SNV mutations.

    The p-value is obtained from `pm.deleterious_permutation`, and only
    when the observed number of deleterious mutations reaches
    `del_threshold`.

    Parameters
    ----------
    mut_info : pd.DataFrame
        Mutations mappable to the provided reference tx; the 'Coding
        Position' and 'Tumor_Allele' columns are read. Modified in place
        ('Coding Position' cast to int, 'Context' column added).
    unmapped_mut_info : dict
        Info for mutations that are NOT mappable to the provided reference
        tx. The keys 'Context', 'Tumor_Allele', 'Codon Pos', 'Reference AA'
        and 'Somatic AA' are read.
    sc : SequenceContext
        object contains the nucleotide contexts for a gene such that new random
        positions can be obtained while respecting nucleotide context.
    gs : GeneSequence
        contains gene sequence
    bed : BedLine
        just used to return gene name
    num_permutations : int
        number of permutations to perform to estimate p-value. more permutations
        means more precision on the p-value.
    stop_thresh : int
        forwarded to `pm.deleterious_permutation` as its stop criterion
    del_threshold : int
        minimum observed number of deleterious mutations required before
        the permutation test is run
    pseudo_count : int
        forwarded to `pm.deleterious_permutation`
    seed : int (Default: None)
        currently not used by this function

    Returns
    -------
    list
        [gene name, number of deleterious mutations, p-value]; the p-value
        is None when there are no mutations or the threshold is not met.
    """
    # no mutations: nothing to test
    if len(mut_info) == 0:
        return [bed.gene_name, 0, None]

    def _context_of(pos):
        return sc.pos2context[pos]

    mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
    mut_info['Context'] = mut_info['Coding Position'].apply(_context_of)

    # pool mapped and unmapped mutations, then group them by context
    cols = ['Context', 'Tumor_Allele']
    unmapped_mut_df = pd.DataFrame(unmapped_mut_info)
    tmp_df = pd.concat([mut_info[cols], unmapped_mut_df[cols]])
    context_cts = tmp_df['Context'].value_counts()
    context_to_mutations = {
        name: group['Tumor_Allele']
        for name, group in tmp_df.groupby('Context')
    }

    # count deleterious mutations among the observed ones
    aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                     mut_info['Tumor_Allele'].tolist(), gs)
    ref_aa = aa_mut_info['Reference AA'] + unmapped_mut_info['Reference AA']
    somatic_aa = aa_mut_info['Somatic AA'] + unmapped_mut_info['Somatic AA']
    codon_pos = aa_mut_info['Codon Pos'] + unmapped_mut_info['Codon Pos']
    num_del = cutils.calc_deleterious_info(ref_aa, somatic_aa, codon_pos)

    # the permutation test is skipped unless the user-specified minimum
    # number of deleterious mutations is reached
    del_p_value = None
    if num_del >= del_threshold:
        del_p_value = pm.deleterious_permutation(
            num_del,
            context_cts,
            context_to_mutations,
            sc,  # sequence context obj
            gs,  # gene sequence obj
            num_permutations,
            stop_thresh,
            pseudo_count)

    return [bed.gene_name, num_del, del_p_value]
Exemplo n.º 26
0
def deleterious_permutation(obs_del,
                            context_counts,
                            context_to_mut,
                            seq_context,
                            gene_seq,
                            num_permutations=10000,
                            stop_criteria=100,
                            pseudo_count=0,
                            max_batch=25000):
    """Performs null-permutations for deleterious mutation statistics
    in a single gene.

    Parameters
    ----------
    obs_del : int
        observed number of deleterious mutations; a simulation counts
        towards the p-value when it has at least this many.
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        maximum number of permutations to create for null
    stop_criteria : int, default: 100
        simulations stop early once this many of them reached `obs_del`
    pseudo_count : int, default: 0
        accepted for interface compatibility; not used in this function.
    max_batch : int, default: 25000
        maximum number of permutations drawn per call to
        ``seq_context.random_pos``

    Returns
    -------
    del_pval : float
        fraction of performed simulations with at least `obs_del`
        deleterious mutations

    Raises
    ------
    ZeroDivisionError
        if `num_permutations` is 0 or no simulation was performed.
    """
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # calculate the # of batches for simulations
    max_batch = min(num_permutations, max_batch)
    num_batches = num_permutations // max_batch
    remainder = num_permutations % max_batch
    batch_sizes = [max_batch] * num_batches
    if remainder:
        batch_sizes += [remainder]

    num_sim = 0
    null_del_ct = 0
    for batch_size in batch_sizes:
        # stop iterations if reached sufficient precision
        if null_del_ct >= stop_criteria:
            break

        # get random positions determined by sequence context
        # (Series.items() is used because Series.iteritems() no longer
        # exists in pandas >= 2.0)
        tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                                batch_size)
        tmp_mut_pos = np.hstack([pos_array for base, pos_array in tmp_contxt_pos])

        # determine result of random positions
        for row in tmp_mut_pos:
            # get info about mutations
            tmp_mut_info = mc.get_aa_mut_info(row,
                                              somatic_base,
                                              gene_seq)

            # calc deleterious mutation info
            tmp_del_count = cutils.calc_deleterious_info(tmp_mut_info['Reference AA'],
                                                         tmp_mut_info['Somatic AA'],
                                                         tmp_mut_info['Codon Pos'])

            # count every evaluated simulation explicitly instead of
            # relying on the loop index after the loop has finished
            num_sim += 1

            # update empirical null distribution
            if tmp_del_count >= obs_del:
                null_del_ct += 1

            # stop if reach sufficient precision on p-value
            if null_del_ct >= stop_criteria:
                break

    del_pval = float(null_del_ct) / (num_sim)

    return del_pval
Exemplo n.º 27
0
def protein_permutation(graph_score,
                        num_codons_obs,
                        context_counts,
                        context_to_mut,
                        seq_context,
                        gene_seq,
                        gene_graph,
                        num_permutations=10000,
                        stop_criteria=100,
                        pseudo_count=0):
    """Performs null-simulations for position-based mutation statistics
    in a single gene.

    The first ``stop_criteria - 1`` simulations are only recorded. At
    iteration ``stop_criteria - 1`` they are used to estimate the expected
    relative increase in coverage, which normalizes both the observed and
    the simulated statistics. Later simulations are compared against the
    normalized observed statistic one by one.

    Parameters
    ----------
    graph_score : float
        clustering score for observed data
    num_codons_obs : int
        number of codons with missense mutation in observed data
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    gene_graph
        neighbor graph handed to ``scores.compute_ng_stat``
    num_permutations : int, default: 10000
        number of permutations to create for null
    stop_criteria : int
        stop after stop_criteria iterations are more significant
        than the observed statistic.
    pseudo_count : int, default: 0
        forwarded to ``cutils.calc_pos_info``

    Returns
    -------
    protein_pval : float
        p-value for clustering in the neighbor graph constructed from
        protein structures
    obs_stat : float
        normalized observed statistic. It is only assigned once iteration
        ``stop_criteria - 1`` is reached, so fewer simulated rows than that
        make the final ``return`` fail.
    """
    # get contexts and somatic base
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # get random positions determined by sequence context
    # (Series.items() is used because Series.iteritems() no longer exists
    # in pandas >= 2.0)
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    tmp_mut_pos = np.hstack([pos_array for base, pos_array in tmp_contxt_pos])

    # calculate position-based statistics as a result of random positions
    null_graph_entropy_ct = 0
    coverage_list = []
    num_mut_list = []
    graph_entropy_list = []
    for i, row in enumerate(tmp_mut_pos):
        # calculate the expected value of the relative increase in coverage
        if i == stop_criteria-1:
            rel_inc = [coverage_list[k] / float(num_mut_list[k])
                       for k in range(stop_criteria-1)
                       if coverage_list[k]]
            exp_rel_inc = np.mean(rel_inc)

            # calculate observed statistic
            if num_codons_obs:
                obs_stat = graph_score / np.log2(exp_rel_inc*num_codons_obs)
            else:
                obs_stat = 1.0

            # calculate statistics for simulated data
            sim_stat_list = [ent / np.log2(exp_rel_inc*num_mut_list[l])
                             for l, ent in enumerate(graph_entropy_list)]
            null_graph_entropy_ct = len([s for s in sim_stat_list
                                         if s-utils.epsilon <= obs_stat])

        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row,
                                          somatic_base,
                                          gene_seq)

        # calculate position info
        tmp_tuple = cutils.calc_pos_info(tmp_mut_info['Codon Pos'],
                                         tmp_mut_info['Reference AA'],
                                         tmp_mut_info['Somatic AA'],
                                         pseudo_count=pseudo_count,
                                         is_obs=0)
        _, _, _, tmp_pos_ct = tmp_tuple

        # record num of mut codons
        # NOTE(review): tmp_num_mut_codons is only refreshed during the
        # recording phase, so the comparison phase below reuses the value
        # of the last recorded simulation -- confirm this is intended.
        if i < stop_criteria-1:
            tmp_num_mut_codons = len(tmp_pos_ct)
            num_mut_list.append(tmp_num_mut_codons)

        # get entropy on graph-smoothed probability distribution
        tmp_graph_entropy, tmp_coverage = scores.compute_ng_stat(gene_graph, tmp_pos_ct)

        # record the "coverage" in the graph
        if i < stop_criteria-1:
            coverage_list.append(tmp_coverage)
            graph_entropy_list.append(tmp_graph_entropy)

        # update empirical null distribution counts
        # NOTE(review): the simulation at i == stop_criteria-1 is neither
        # recorded above nor counted here -- confirm this is intended.
        if i >= stop_criteria:
            if tmp_num_mut_codons:
                sim_stat = tmp_graph_entropy / np.log2(exp_rel_inc*tmp_num_mut_codons)
            else:
                sim_stat = 1.0

            # add count
            if sim_stat-utils.epsilon <= obs_stat:
                null_graph_entropy_ct += 1

        # stop iterations if reached sufficient precision
        if null_graph_entropy_ct >= stop_criteria:
            break

    # calculate p-value from empirical null-distribution
    protein_pval = float(null_graph_entropy_ct) / (i+1)

    return protein_pval, obs_stat
Exemplo n.º 28
0
def singleprocess_permutation(info):
    """Produce simulation or annotation output for all genes of one chromosome.

    Depending on the options, each gene with usable mutations contributes
    one of: a summary of its observed mutations, its observed mutations
    annotated in MAF form, simulated mutations in MAF form, or per-simulation
    summary features.

    Parameters
    ----------
    info : tuple
        ``(bed_list, mut_df, opts)``. ``bed_list`` holds the BED entries of
        the genes to process (the chromosome name is read from the first
        entry), ``mut_df`` is the mutation data frame handed to
        ``mc.compute_mutation_context`` and ``opts`` is the options dict.
        Keys read here: 'num_iterations', 'input', 'context', 'summary',
        'maf', 'score_dir', 'fraction' and 'recurrent'.

    Returns
    -------
    result : list
        Concatenation of the per-gene output rows.
    """
    bed_list, mut_df, opts = info
    current_chrom = bed_list[0].chrom
    logger.info('Working on chromosome: {0} . . .'.format(current_chrom))
    num_iterations = opts['num_iterations']
    gene_fa = pysam.Fastafile(opts['input'])

    result = []
    try:
        gs = GeneSequence(gene_fa, nuc_context=opts['context'])

        # go through each gene to perform simulation
        for bed in bed_list:
            # compute context counts and somatic bases for each context
            gene_tuple = mc.compute_mutation_context(bed, gs, mut_df, opts)
            context_cts, context_to_mutations, mutations_df, gs, sc = gene_tuple

            # genes without usable mutations contribute nothing
            if not context_to_mutations:
                continue

            if opts['summary'] and not num_iterations:
                # summarize the observed mutations (no simulation)
                tmp_mut_info = mc.get_aa_mut_info(
                    mutations_df['Coding Position'],
                    mutations_df['Tumor_Allele'].tolist(), gs)
                tmp_result = cutils.calc_summary_info(
                    tmp_mut_info['Reference AA'],
                    tmp_mut_info['Somatic AA'],
                    tmp_mut_info['Codon Pos'],
                    bed.gene_name,
                    opts['score_dir'],
                    min_frac=opts['fraction'],
                    min_recur=opts['recurrent'])
                # 'NA' fills the column that holds the simulation number
                # in simulated rows
                tmp_result = [[bed.gene_name, 'NA', bed.cds_len] + tmp_result]
            elif opts['maf'] and not num_iterations:
                # just record protein changes of the observed mutations
                tmp_result = anot.annotate_maf(
                    mutations_df['Coding Position'],
                    mutations_df['Tumor_Allele'].tolist(), gs)
                # add tumor sample / tumor type info to output
                tmp_result = [
                    line + [
                        mutations_df['Tumor_Sample'].iloc[i],
                        mutations_df['Tumor_Type'].iloc[i]
                    ] for i, line in enumerate(tmp_result)
                ]
            elif opts['maf']:
                # simulated mutations, output in MAF format
                tmp_result = pm.maf_permutation(context_cts,
                                                context_to_mutations, sc, gs,
                                                num_iterations)
            else:
                # summarized features for each simulation of the gene
                tmp_result = pm.summary_permutation(
                    context_cts,
                    context_to_mutations,
                    sc,  # sequence context obj
                    gs,  # gene sequence obj
                    opts['score_dir'],
                    num_iterations,
                    min_frac=opts['fraction'],
                    min_recur=opts['recurrent'])
            result += tmp_result
    finally:
        # always release the FASTA handle, even if a gene fails
        gene_fa.close()

    logger.info('Finished working on chromosome: {0}.'.format(current_chrom))
    return result
Exemplo n.º 29
0
def deleterious_permutation(obs_del,
                            context_counts,
                            context_to_mut,
                            seq_context,
                            gene_seq,
                            num_permutations=10000,
                            stop_criteria=100,
                            pseudo_count=0,
                            max_batch=25000):
    """Performs null-permutations for deleterious mutation statistics
    in a single gene and returns the resulting empirical p-value.

    Parameters
    ----------
    obs_del : number
        observed deleterious mutation statistic. Each simulated value
        returned by cutils.calc_deleterious_info is compared against it.
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        maximum number of permutations to create for null
    stop_criteria : int, default: 100
        stop simulating once this many permutations have a deleterious
        statistic greater than or equal to obs_del.
    pseudo_count : int, default: 0
        Accepted for interface compatibility; it is not referenced by
        this function.
    max_batch : int, default: 25000
        maximum number of permutations requested from
        seq_context.random_pos in a single call.

    Returns
    -------
    del_pval : float
        number of simulations with a statistic >= obs_del divided by the
        number of simulations actually performed.
    """
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # calculate the # of batches for simulations
    max_batch = min(num_permutations, max_batch)
    num_batches = num_permutations // max_batch
    remainder = num_permutations % max_batch
    batch_sizes = [max_batch] * num_batches
    if remainder:
        batch_sizes += [remainder]

    num_sim = 0  # number of simulations actually evaluated
    null_del_ct = 0  # simulations at least as extreme as the observation
    for batch_size in batch_sizes:
        # stop iterations if reached sufficient precision
        if null_del_ct >= stop_criteria:
            break

        # get random positions determined by sequence context
        # (.items() replaces Series.iteritems(), which newer pandas removed)
        tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                                batch_size)
        # np.hstack needs a real sequence; passing a generator is rejected
        # by current NumPy releases
        tmp_mut_pos = np.hstack([pos_array
                                 for base, pos_array in tmp_contxt_pos])

        # determine result of random positions
        for row in tmp_mut_pos:
            # count every evaluated simulation, including the one that
            # triggers the early stop below
            num_sim += 1

            # get info about mutations
            tmp_mut_info = mc.get_aa_mut_info(row,
                                              somatic_base,
                                              gene_seq)

            # calc deleterious mutation info
            tmp_del_count = cutils.calc_deleterious_info(tmp_mut_info['Reference AA'],
                                                         tmp_mut_info['Somatic AA'],
                                                         tmp_mut_info['Codon Pos'])

            # update empirical null distribution
            if tmp_del_count >= obs_del:
                null_del_ct += 1

            # stop if reach sufficient precision on p-value
            if null_del_ct >= stop_criteria:
                break

    del_pval = float(null_del_ct) / num_sim

    return del_pval
Exemplo n.º 30
0
def calc_protein_p_value(mut_info,
                         unmapped_mut_info,
                         sc,
                         gs,
                         bed,
                         graph_dir,
                         num_permutations,
                         stop_thresh,
                         min_recurrent,
                         min_fraction):
    """Computes the p-value for clustering on a neighbor graph composed
    of codons connected with edges if they are spatially near in 3D protein
    structure.

    Parameters
    ----------
    mut_info : pd.DataFrame
        mutations with at least 'Coding Position' and 'Tumor_Allele'
        columns. NOTE: the 'Coding Position' column is cast to int and a
        'Context' column is added in place on the passed object.
    unmapped_mut_info : dict
        mutation info convertible to a DataFrame with 'Context' and
        'Tumor_Allele' columns, and also holding 'Codon Pos',
        'Reference AA' and 'Somatic AA' entries that are joined with `+`
        to the mapped results (presumably lists -- confirm with caller).
    sc : SequenceContext
        object whose `pos2context` maps a coding position to its context
    gs : GeneSequence
        gene sequence object handed to mc.get_aa_mut_info and
        pm.protein_permutation
    bed : object
        gene annotation; only `bed.gene_name` is read here
    graph_dir : str
        directory with pickled neighbor graphs; if falsy no graph is read
    num_permutations : int
        number of permutations, forwarded to pm.protein_permutation
    stop_thresh : int
        early stopping threshold, forwarded to pm.protein_permutation
    min_recurrent : int
        forwarded to cutils.calc_pos_info as `min_recur`
    min_fraction : float
        forwarded to cutils.calc_pos_info as `min_frac`

    Returns
    -------
    result : list
        [gene name, number of recurrent mutations, normalized graph score,
        protein p-value]. The score/p-value default to 0.0/1.0 when there
        are no mutations or when the graph statistic could not be computed.
    """
    if len(mut_info) > 0:
        mut_info['Coding Position'] = mut_info['Coding Position'].astype(int)
        mut_info['Context'] = mut_info['Coding Position'].apply(lambda x: sc.pos2context[x])

        # group mutations by context
        cols = ['Context', 'Tumor_Allele']
        unmapped_mut_df = pd.DataFrame(unmapped_mut_info)
        tmp_df = pd.concat([mut_info[cols], unmapped_mut_df[cols]])
        context_cts = tmp_df['Context'].value_counts()
        context_to_mutations = {name: group['Tumor_Allele']
                                for name, group in tmp_df.groupby('Context')}

        # read the neighbor graph for the gene if a directory was provided
        if graph_dir:
            gene_graph = scores.read_neighbor_graph_pickle(bed.gene_name, graph_dir)
            if gene_graph is None:
                logger.warning('Could not find neighbor graph for {0}, skipping . . .'.format(bed.gene_name))
        else:
            gene_graph = None
        # NOTE(review): a missing graph is not skipped explicitly; the
        # statistic below is still attempted with gene_graph=None and any
        # failure falls back to the default score / p-value.

        # get recurrent info for actual mutations
        aa_mut_info = mc.get_aa_mut_info(mut_info['Coding Position'],
                                         mut_info['Tumor_Allele'].tolist(),
                                         gs)
        codon_pos = aa_mut_info['Codon Pos'] + unmapped_mut_info['Codon Pos']
        ref_aa = aa_mut_info['Reference AA'] + unmapped_mut_info['Reference AA']
        somatic_aa = aa_mut_info['Somatic AA'] + unmapped_mut_info['Somatic AA']
        num_recurrent, _, _, pos_ct = cutils.calc_pos_info(codon_pos,
                                                           ref_aa,
                                                           somatic_aa,
                                                           min_frac=min_fraction,
                                                           min_recur=min_recurrent)
        try:
            # get neighbor graph statistic for actual mutations
            graph_score, _ = scores.compute_ng_stat(gene_graph, pos_ct)

            # perform simulations to get p-value
            protein_p_value, norm_graph_score = pm.protein_permutation(
                graph_score, len(pos_ct), context_cts,
                context_to_mutations,
                sc,  # sequence context obj
                gs,  # gene sequence obj
                gene_graph, num_permutations, stop_thresh
            )
        except Exception as err:
            # best-effort: a failure for one gene must not abort the run,
            # so report it and fall back to a non-significant result
            norm_graph_score = 0.0
            protein_p_value = 1.0
            logger.warning('Codon numbering problem with '+bed.gene_name)
            # keep the real cause available instead of discarding it
            logger.debug('Neighbor graph statistic failed for {0}: {1!r}'.format(bed.gene_name, err))

    else:
        # no mutations: report a non-significant result
        norm_graph_score = 0.0
        protein_p_value = 1.0
        num_recurrent = 0

    result = [bed.gene_name, num_recurrent, norm_graph_score, protein_p_value]
    return result
Exemplo n.º 31
0
def maf_permutation(context_counts,
                    context_to_mut,
                    seq_context,
                    gene_seq,
                    num_permutations=10000,
                    drop_silent=False):
    """Performs null-permutations across all genes and records the results in
    a format like a MAF file. This could be useful for examining the null
    permutations because the alternative approaches always summarize the results.
    With the simulated null-permutations, novel metrics can be applied to create
    an empirical null-distribution.

    Parameters
    ----------
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    num_permutations : int, default: 10000
        number of permutations to create for null
    drop_silent : bool, default=False
        Flag on whether to drop all silent mutations. Some data sources
        do not report silent mutations, and the simulations should match this.

    Returns
    -------
    maf_list : list of lists
        list of null mutations with mutation info in a MAF like format. Each
        entry is [gene name, strand, chrom, start, end, reference base,
        somatic base, context, DNA change, protein change, variant class].
    """
    mycontexts = context_counts.index.tolist()
    somatic_base, base_context = zip(*[(base, one_context)
                                       for one_context in mycontexts
                                       for base in context_to_mut[one_context]])

    # get random positions determined by sequence context
    # (.items() replaces Series.iteritems(), which newer pandas removed)
    tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                            num_permutations)
    tmp_mut_pos = np.hstack([pos_array for base, pos_array in tmp_contxt_pos])

    # info about gene
    gene_name = gene_seq.bed.gene_name
    strand = gene_seq.bed.strand
    chrom = gene_seq.bed.chrom
    gene_seq.bed.init_genome_coordinates()  # map seq pos to genome

    # sequence position -> genome coordinate (+1 applied to the stored
    # value); built once since it does not depend on the permutation
    pos2genome = np.vectorize(lambda x: gene_seq.bed.seqpos2genome[x]+1)

    # determine result of random positions
    maf_list = []
    for row in tmp_mut_pos:
        # get genome coordinate
        genome_coord = pos2genome(row)

        # get info about mutations
        tmp_mut_info = mc.get_aa_mut_info(row,
                                          somatic_base,
                                          gene_seq)

        # get string describing variant
        var_class = cutils.get_variant_classification(tmp_mut_info['Reference AA'],
                                                      tmp_mut_info['Somatic AA'],
                                                      tmp_mut_info['Codon Pos'])

        # prepare output
        for k, mysomatic_base in enumerate(somatic_base):
            # decode once; skip silent mutations before doing any formatting
            variant = var_class[k].decode()
            if drop_silent and variant == 'Silent':
                continue

            # format DNA change
            ref_nuc = tmp_mut_info['Reference Nuc'][k]
            nuc_pos = row[k]
            dna_change = 'c.{0}{1}>{2}'.format(ref_nuc, nuc_pos, mysomatic_base)

            # format protein change
            ref_aa = tmp_mut_info['Reference AA'][k]
            somatic_aa = tmp_mut_info['Somatic AA'][k]
            codon_pos = tmp_mut_info['Codon Pos'][k]
            protein_change = 'p.{0}{1}{2}'.format(ref_aa, codon_pos, somatic_aa)

            # reverse complement if on negative strand (done after the DNA
            # change string is built, so that string keeps the coding bases)
            if strand == '-':
                ref_nuc = utils.rev_comp(ref_nuc)
                mysomatic_base = utils.rev_comp(mysomatic_base)

            # append results
            maf_line = [gene_name, strand, chrom, genome_coord[k], genome_coord[k],
                        ref_nuc, mysomatic_base, base_context[k], dna_change,
                        protein_change, variant]
            maf_list.append(maf_line)

    return maf_list
Exemplo n.º 32
0
def position_permutation(obs_stat,
                         context_counts,
                         context_to_mut,
                         seq_context,
                         gene_seq,
                         gene_vest=None,
                         num_permutations=10000,
                         stop_criteria=100,
                         pseudo_count=0,
                         max_batch=25000):
    """Performs null-permutations for position-based mutation statistics
    in a single gene and returns the empirical p-values.

    Parameters
    ----------
    obs_stat : tuple, (recur ct, entropy, delta entropy, mean vest)
        tuple containing the observed statistics. Only the entropy and
        the vest entries are compared against the null here.
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    gene_vest : object, default: None
        VEST scores for the gene, passed to scores.compute_vest_stat.
        When falsy the simulated VEST statistic is fixed at 0.0.
    num_permutations : int, default: 10000
        maximum number of permutations to create for null
    stop_criteria : int
        stop once both the entropy and the VEST null counts have reached
        stop_criteria simulations at least as extreme as the observed
        statistic.
    pseudo_count : int, default: 0
        Pseudo-count forwarded to cutils.calc_pos_info for each
        permutation of the null distribution.
    max_batch : int, default: 25000
        maximum number of permutations requested from
        seq_context.random_pos in a single call.

    Returns
    -------
    ent_pval : float
        fraction of simulations with entropy <= observed entropy
        (within utils.epsilon)
    vest_pval : float
        fraction of simulations with VEST statistic >= observed
        (within utils.epsilon)
    """
    # get contexts and somatic base
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # calculate the # of batches for simulations
    max_batch = min(num_permutations, max_batch)
    num_batches = num_permutations // max_batch
    remainder = num_permutations % max_batch
    batch_sizes = [max_batch] * num_batches
    if remainder:
        batch_sizes += [remainder]

    # only entropy and vest are tested; unpacking still enforces a 4-tuple
    _, obs_ent, _, obs_vest = obs_stat
    num_sim = 0  # number of simulations actually evaluated
    null_entropy_ct, null_vest_ct = 0, 0
    for batch_size in batch_sizes:
        # stop iterations if reached sufficient precision
        if null_vest_ct >= stop_criteria and null_entropy_ct >= stop_criteria:
            break

        # get random positions determined by sequence context
        # (.items() replaces Series.iteritems(), which newer pandas removed)
        tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                                batch_size)
        # np.hstack needs a real sequence; passing a generator is rejected
        # by current NumPy releases
        tmp_mut_pos = np.hstack([pos_array
                                 for base, pos_array in tmp_contxt_pos])

        # calculate position-based statistics as a result of random positions
        for row in tmp_mut_pos:
            # count every evaluated simulation, including the one that
            # triggers the early stop below
            num_sim += 1

            # get info about mutations
            tmp_mut_info = mc.get_aa_mut_info(row,
                                              somatic_base,
                                              gene_seq)

            # calculate position info (only the entropy is used)
            _, tmp_entropy, _, _ = cutils.calc_pos_info(tmp_mut_info['Codon Pos'],
                                                        tmp_mut_info['Reference AA'],
                                                        tmp_mut_info['Somatic AA'],
                                                        pseudo_count=pseudo_count,
                                                        is_obs=0)
            # get vest scores
            if gene_vest:
                tmp_vest = scores.compute_vest_stat(gene_vest,
                                                    tmp_mut_info['Reference AA'],
                                                    tmp_mut_info['Somatic AA'],
                                                    tmp_mut_info['Codon Pos'])
            else:
                tmp_vest = 0.0

            # update empirical null distribution counts; epsilon gives a
            # small tolerance so near-ties count as at least as extreme
            if tmp_entropy-utils.epsilon <= obs_ent:
                null_entropy_ct += 1
            if tmp_vest+utils.epsilon >= obs_vest:
                null_vest_ct += 1

            # stop iterations if reached sufficient precision
            if null_vest_ct >= stop_criteria and null_entropy_ct >= stop_criteria:
                break

    # calculate p-value from empirical null-distribution
    ent_pval = float(null_entropy_ct) / num_sim
    vest_pval = float(null_vest_ct) / num_sim

    return ent_pval, vest_pval
Exemplo n.º 33
0
def position_permutation(obs_stat,
                         context_counts,
                         context_to_mut,
                         seq_context,
                         gene_seq,
                         gene_vest=None,
                         num_permutations=10000,
                         stop_criteria=100,
                         pseudo_count=0,
                         max_batch=25000):
    """Performs null-permutations for position-based mutation statistics
    in a single gene and returns the empirical p-values.

    Parameters
    ----------
    obs_stat : tuple, (recur ct, entropy, delta entropy, mean vest)
        tuple containing the observed statistics. Only the entropy and
        the vest entries are compared against the null here.
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    gene_vest : object, default: None
        VEST scores for the gene, passed to scores.compute_vest_stat.
        When falsy the simulated VEST statistic is fixed at 0.0.
    num_permutations : int, default: 10000
        maximum number of permutations to create for null
    stop_criteria : int
        stop once both the entropy and the VEST null counts have reached
        stop_criteria simulations at least as extreme as the observed
        statistic.
    pseudo_count : int, default: 0
        Pseudo-count forwarded to cutils.calc_pos_info for each
        permutation of the null distribution.
    max_batch : int, default: 25000
        maximum number of permutations requested from
        seq_context.random_pos in a single call.

    Returns
    -------
    ent_pval : float
        fraction of simulations with entropy <= observed entropy
        (within utils.epsilon)
    vest_pval : float
        fraction of simulations with VEST statistic >= observed
        (within utils.epsilon)
    """
    # get contexts and somatic base
    mycontexts = context_counts.index.tolist()
    somatic_base = [base
                    for one_context in mycontexts
                    for base in context_to_mut[one_context]]

    # calculate the # of batches for simulations
    max_batch = min(num_permutations, max_batch)
    num_batches = num_permutations // max_batch
    remainder = num_permutations % max_batch
    batch_sizes = [max_batch] * num_batches
    if remainder:
        batch_sizes += [remainder]

    # only entropy and vest are tested; unpacking still enforces a 4-tuple
    _, obs_ent, _, obs_vest = obs_stat
    num_sim = 0  # number of simulations actually evaluated
    null_entropy_ct, null_vest_ct = 0, 0
    for batch_size in batch_sizes:
        # stop iterations if reached sufficient precision
        if null_vest_ct >= stop_criteria and null_entropy_ct >= stop_criteria:
            break

        # get random positions determined by sequence context
        # (.items() replaces Series.iteritems(), which newer pandas removed)
        tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                                batch_size)
        tmp_mut_pos = np.hstack([pos_array for base, pos_array in tmp_contxt_pos])

        # calculate position-based statistics as a result of random positions
        for row in tmp_mut_pos:
            # count every evaluated simulation, including the one that
            # triggers the early stop below
            num_sim += 1

            # get info about mutations
            tmp_mut_info = mc.get_aa_mut_info(row,
                                              somatic_base,
                                              gene_seq)

            # calculate position info (only the entropy is used)
            _, tmp_entropy, _, _ = cutils.calc_pos_info(tmp_mut_info['Codon Pos'],
                                                        tmp_mut_info['Reference AA'],
                                                        tmp_mut_info['Somatic AA'],
                                                        pseudo_count=pseudo_count,
                                                        is_obs=0)
            # get vest scores
            if gene_vest:
                tmp_vest = scores.compute_vest_stat(gene_vest,
                                                    tmp_mut_info['Reference AA'],
                                                    tmp_mut_info['Somatic AA'],
                                                    tmp_mut_info['Codon Pos'])
            else:
                tmp_vest = 0.0

            # update empirical null distribution counts; epsilon gives a
            # small tolerance so near-ties count as at least as extreme
            if tmp_entropy-utils.epsilon <= obs_ent:
                null_entropy_ct += 1
            if tmp_vest+utils.epsilon >= obs_vest:
                null_vest_ct += 1

            # stop iterations if reached sufficient precision
            if null_vest_ct >= stop_criteria and null_entropy_ct >= stop_criteria:
                break

    # calculate p-value from empirical null-distribution
    ent_pval = float(null_entropy_ct) / num_sim
    vest_pval = float(null_vest_ct) / num_sim

    return ent_pval, vest_pval
Exemplo n.º 34
0
def hotmaps_permutation(obs_stat,
                        context_counts,
                        context_to_mut,
                        seq_context,
                        gene_seq,
                        window,
                        num_permutations=10000,
                        stop_criteria=100,
                        max_batch=25000):
    """Performs null-permutations for windowed position-based mutation
    statistics in a single gene.

    Parameters
    ----------
    obs_stat : dict
        dictionary mapping codons to the sum of mutations in a window
    context_counts : pd.Series
        number of mutations for each context
    context_to_mut : dict
        dictionary mapping nucleotide context to a list of observed
        somatic base changes.
    seq_context : SequenceContext
        Sequence context for the entire gene sequence (regardless
        of where mutations occur). The nucleotide contexts are
        identified at positions along the gene.
    gene_seq : GeneSequence
        Sequence of gene of interest
    window : int
        Number of codons to the left/right of a mutate position to consider
        in the window
    num_permutations : int, default: 10000
        maximum number of permutations to create for null
    stop_criteria : int
        stop once the null count for the codon with the largest observed
        statistic has reached stop_criteria.
    max_batch : int
        maximum number of whole gene simulations to do at once.
        For large number of simulations holding a matrix of M x N,
        where M is the number of mutations and N is the number of simulations,
        can get quite large.

    Returns
    -------
    pvals : dict
        Maps mutated codon position to the calculated p-value. Empty when
        obs_stat is empty.
    """
    # nothing observed means nothing to test (max() below would raise)
    if len(obs_stat) == 0:
        return {}

    # get contexts and somatic base
    mycontexts = context_counts.index.tolist()
    somatic_base = [
        base for one_context in mycontexts
        for base in context_to_mut[one_context]
    ]

    # calculate the # of batches for simulations
    max_batch = min(num_permutations, max_batch)
    num_batches = num_permutations // max_batch
    remainder = num_permutations % max_batch
    batch_sizes = [max_batch] * num_batches
    if remainder:
        batch_sizes += [remainder]

    # figure out which position has highest value; its null count is the
    # slowest to grow, so it drives the early-stopping test
    max_key = max(obs_stat, key=(lambda key: obs_stat[key]))

    # setup null dist counts
    null_cts = {k: 0 for k in obs_stat}

    num_sim = 0  # number of simulated window values compared so far
    for batch_size in batch_sizes:
        # stop iterations if reached sufficient precision
        if null_cts[max_key] >= stop_criteria:
            break

        # get random positions determined by sequence context
        # (.items() replaces Series.iteritems(), which newer pandas removed)
        tmp_contxt_pos = seq_context.random_pos(context_counts.items(),
                                                batch_size)
        # np.hstack needs a real sequence; passing a generator is rejected
        # by current NumPy releases
        tmp_mut_pos = np.hstack([pos_array
                                 for base, pos_array in tmp_contxt_pos])

        # calculate position-based statistics as a result of random positions
        for row in tmp_mut_pos:
            # get info about mutations
            tmp_mut_info = mc.get_aa_mut_info(row, somatic_base, gene_seq)

            # calculate position info
            _, tmp_sim = utils.calc_windowed_sum(tmp_mut_info['Codon Pos'],
                                                 tmp_mut_info['Reference AA'],
                                                 tmp_mut_info['Somatic AA'],
                                                 window)

            # update the counts when the empirical null passes the observed:
            # every simulated window value is compared to every observed one
            for tmp_key in tmp_sim:
                val = tmp_sim[tmp_key]
                for key in null_cts:
                    if val >= obs_stat[key]:
                        null_cts[key] += 1

            # update the number of simulations; the denominator counts
            # simulated window values, not whole-gene permutations
            num_sim += len(tmp_sim)

            # stop iterations if reached sufficient precision
            if null_cts[max_key] >= stop_criteria:
                break

    # calculate p-value from empirical null-distribution
    pvals = {k: float(null_cts[k]) / num_sim for k in obs_stat}

    return pvals