    afs = afs[:, :2718]

    # Find consensus sequence
    consind = afs.argmax(axis=0)
    consensus = alpha[consind]

    # Some sequences are not viable and contain frameshifts that break the
    # translation, so we restrict the analysis to positions where gaps are a minority
    is_gap = consensus == '-'
    # If any position of a codon is a gap, mark the whole codon as gapped
    tmp = np.unique(is_gap.nonzero()[0] // 3)
    is_gap[np.concatenate([tmp * 3, tmp * 3 + 1, tmp * 3 + 2])] = True

    # Exclude stop codons
    is_stop = np.zeros_like(is_gap)
    tmp = (translate(consensus) == '*').nonzero()[0]
    is_stop[np.concatenate([tmp * 3, tmp * 3 + 1, tmp * 3 + 2])] = True

    ## Plot base prevalence
    #for i in xrange(4):
    #    plt.plot(np.arange(len(consensus)), afs[i],
    #             lw=1.5, alpha=0.5)
    #plt.xlim(0, 2600)
    #plt.ylim(-0.05, 1.25)
    #plt.xlabel('position in '+gene)
    #plt.ylabel('allele frequency')
    #plt.legend(alpha, loc=9)

    # Good codons have no gaps and no stop codons
    is_good = (~is_gap) & (~is_stop)
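
The masking above works at codon granularity: a single gap or stop anywhere in a codon disqualifies all three of its positions. Below is a minimal, self-contained sketch of the same idea on toy data; the toy_translate helper, its truncated codon table, and the example sequence are illustrative assumptions, not the project's own translate()/alpha machinery.

import numpy as np

# Truncated codon table: only stop codons matter for this sketch
STOPS = {'TAA', 'TAG', 'TGA'}

def toy_translate(seq):
    # Translate a 1D character array codon by codon; '*' for stops, 'X' otherwise
    codons = [''.join(seq[i:i + 3]) for i in range(0, len(seq) - len(seq) % 3, 3)]
    return np.array(['*' if c in STOPS else 'X' for c in codons])

consensus = np.array(list('ATG' 'A-G' 'TAA' 'GGT'))  # gap in codon 2, stop in codon 3

# Mark whole codons that contain a gap
is_gap = consensus == '-'
gap_codons = np.unique(is_gap.nonzero()[0] // 3)
is_gap[np.concatenate([gap_codons * 3, gap_codons * 3 + 1, gap_codons * 3 + 2])] = True

# Mark whole codons that translate to a stop
is_stop = np.zeros_like(is_gap)
stop_codons = (toy_translate(consensus) == '*').nonzero()[0]
is_stop[np.concatenate([stop_codons * 3, stop_codons * 3 + 1, stop_codons * 3 + 2])] = True

is_good = (~is_gap) & (~is_stop)
print(is_good.reshape(-1, 3))  # codons 2 and 3 come out fully masked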
Example #2
        afs = afs[:, :2718]

        # Find consensus sequence
        consind = afs.argmax(axis=0)
        consensus = alpha[consind]

        # Some sequences are not viable and contain frameshifts that break the
        # translation, so we restrict the analysis to positions where gaps are a minority
        is_gap = consensus == '-'
        # If any position of a codon is a gap, mark the whole codon as gapped
        tmp = np.unique(is_gap.nonzero()[0] // 3)
        is_gap[np.concatenate([tmp * 3, tmp * 3 + 1, tmp * 3 + 2])] = True

        # Exclude stop codons
        is_stop = np.zeros_like(is_gap)
        tmp = (translate(consensus) == '*').nonzero()[0]
        is_stop[np.concatenate([tmp * 3, tmp * 3 + 1, tmp * 3 + 2])] = True

        # Good codons have no gaps and no stop codons
        is_good = (~is_gap) & (~is_stop)

        # For each codon, calculate the entropy
        msa = msa[:, is_good]
        consaa = translate(consensus[is_good])
        entropy = np.zeros(len(consaa))
        from collections import Counter
        for i, aa in enumerate(consaa):
            tmp = msa[:, i * 3:(i + 1) * 3]
            count = Counter(map(''.join, tmp))
            abundances = []
            for (cod, abundance) in count.iteritems():
                abundances.append(abundance)
            # Shannon entropy of the codon distribution at this position
            abundances = np.array(abundances, float)
            abundances /= abundances.sum()
            entropy[i] = -np.sum(abundances * np.log(abundances))

                # Keep only mutations away from the consensus
                if len(alleles):
                    is_mut = np.array(
                        [alpha[al[0]] != consensus[al[1]] for al in alleles],
                        bool)
                    alleles = alleles[is_mut]

                # Collect alleles by class (one empty list per class)
                all_cla = {x: [] for x in classes}

                if not len(alleles):
                    continue

                ###############################################################
                # Filter only synonymous/nonsynonymous changes
                ###############################################################
                # First test
                is_syn = np.zeros(len(alleles), bool)
                for j, al in enumerate(alleles):
                    pos = al[1]
                    mut = alpha[al[0]]
                    codcons = consensus[pos - pos % 3:pos - pos % 3 + 3]
                    cod = codcons.copy()
                    cod[pos % 3] = mut
                    is_syn[j] = (translate(codcons) == translate(cod))

                alleles_syn = alleles[is_syn]
                alleles_nonsyn = alleles[~is_syn]
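                # For illustration (not part of the original excerpt): a change is
                # synonymous when the mutant codon encodes the same amino acid,
                # e.g. 'CTA' (Leu) -> 'CTG' (Leu) is synonymous, whereas
                # 'CTA' (Leu) -> 'CAA' (Gln) is nonsynonymous.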

                # Test more stringently (avoid double-hits in one single codon)
                if len(alleles_syn):
                    seqs = np.array(p.seqs_from_visit(p.visit[i]))
                    is_single = np.zeros(len(alleles_syn), bool)
                    for j, al in enumerate(alleles_syn):
                        pos = al[1]
                        mut = alpha[al[0]]
                        # Check whether sequences have double-hits
                        # If a double mutant is *ever* observed, discard the allele.
                        # Note: This is a very conservative measure and must
                        # be avoided when estimating densities.
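
The excerpt breaks off before the double-hit check itself. Going only by the comments above (an allele is kept as synonymous only if no sequence ever carries a second mutation in the same codon), a rough sketch of such a check might look like the following; the helper name is_single_hit and the assumed layout of seqs (one aligned sequence per row, single characters per column) are guesses about the surrounding code, not the original implementation.

import numpy as np

def is_single_hit(seqs, consensus, pos, mut):
    # Hypothetical helper: return False if any sequence carrying `mut` at `pos`
    # also differs from the consensus elsewhere within the same codon.
    start = pos - pos % 3
    cod_cons = consensus[start:start + 3]
    carriers = seqs[seqs[:, pos] == mut]
    for seq in carriers:
        cod = seq[start:start + 3]
        # More than one difference from the consensus codon means the mutation
        # is not observed alone in this sequence (a double hit)
        if (cod != cod_cons).sum() > 1:
            return False
    return True

As the comment in the excerpt warns, discarding an allele whenever any double mutant is seen is very conservative and, as noted there, should be avoided when estimating densities.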