def get_frame(geneseq, gene_HXB2, genename, VERBOSE=0):
    '''Get the frame of a gene by aligning its translations to the HXB2 protein

    Parameters:
       geneseq: nucleotide sequence of the gene, as a string or an iterable
                of characters (it is joined into a string)
       gene_HXB2: nucleotide sequence of the gene in HXB2, same format
       genename: name of the gene. For 'tat1' and 'rev1' the HXB2 sequence is
                 trimmed at its end to a multiple of three, for 'tat2' and
                 'rev2' at its start; any other name leaves it untouched
       VERBOSE: not used, kept for interface compatibility

    Returns:
       the offset into geneseq (0, 1 or 2) whose translation gets the highest
       local alignment score against the translated HXB2 gene (numpy.argmax,
       i.e. the first offset in case of ties)
    '''
    from seqanpy import align_local
    from Bio.Seq import translate
    from numpy import argmax

    geneseq = ''.join(geneseq)
    gene_HXB2 = ''.join(gene_HXB2)

    # Number of nucleotides exceeding a whole number of codons
    overhang = len(gene_HXB2) % 3
    if genename in ('tat1', 'rev1'):
        gene_HXB2 = gene_HXB2[:len(gene_HXB2) - overhang]
    elif genename in ('tat2', 'rev2'):
        gene_HXB2 = gene_HXB2[overhang:]

    prot_HXB2 = translate(gene_HXB2)

    scores = []
    # NOTE: range instead of xrange, so this runs on both Python 2 and 3
    for frame in range(3):
        # Shift by the candidate frame, then cut to a whole number of codons
        tmp = geneseq[frame:]
        tmp = tmp[:len(tmp) - (len(tmp) % 3)]
        # Only the score (first item) is needed, not the aligned strings
        scores.append(align_local(prot_HXB2, translate(tmp))[0])

    return argmax(scores)
def get_frame(geneseq, gene_HXB2, genename, VERBOSE=0):
    '''Find the reading frame of geneseq by protein alignment against HXB2

    Parameters:
       geneseq: gene nucleotides (string or iterable of characters)
       gene_HXB2: nucleotides of the same gene in HXB2 (same format)
       genename: gene name; 'tat1'/'rev1' trim the tail of the HXB2 sequence
                 to a multiple of three, 'tat2'/'rev2' trim its head instead
       VERBOSE: ignored by this function (kept so callers need not change)

    Returns:
       index 0, 1 or 2 of the start offset in geneseq with the best local
       alignment score between its translation and the HXB2 protein
    '''
    from seqanpy import align_local
    from Bio.Seq import translate
    from numpy import argmax

    geneseq = ''.join(geneseq)
    gene_HXB2 = ''.join(gene_HXB2)

    extra = len(gene_HXB2) % 3
    if genename in ('tat1', 'rev1'):
        # Drop the incomplete codon at the end
        gene_HXB2 = gene_HXB2[:len(gene_HXB2) - extra]
    elif genename in ('tat2', 'rev2'):
        # Drop the incomplete codon at the beginning
        gene_HXB2 = gene_HXB2[extra:]

    prot_HXB2 = translate(gene_HXB2)

    def frame_score(frame):
        '''Local alignment score of the translation starting at frame'''
        shifted = geneseq[frame:]
        whole_codons = shifted[:len(shifted) - (len(shifted) % 3)]
        return align_local(prot_HXB2, translate(whole_codons))[0]

    # NOTE: range instead of xrange, so this runs on both Python 2 and 3
    scores = [frame_score(frame) for frame in range(3)]

    return argmax(scores)
    def align_dna(seqstr, refstr, require_full_cover=True):
        '''Align seqstr to refstr and return both aligned strings

        Parameters:
           seqstr: sequence to align
           refstr: reference to align against
           require_full_cover: if True, use align_overlap and cut both aligned
              strings to the columns between the leading and the trailing gap
              run of the aligned reference. If False, use align_local and
              extend the result to the whole reference: the sequence is padded
              with 'N', the reference with its own flanking parts.

        Returns:
           the pair (aligned sequence, aligned reference)
        '''
        if not require_full_cover:
            _, ali_seq, ali_ref = align_local(seqstr, refstr)

            # Locate the aligned piece within the reference via its first and
            # last (up to) 50 ungapped characters
            # NOTE(review): a repeat of such a 50-mer elsewhere in refstr
            # would shift the coordinates -- confirm this cannot happen
            ref_core = ali_ref.replace('-', '')
            head = ref_core[:50]
            tail = ref_core[-50:]
            left = refstr.find(head)
            right = refstr.rfind(tail) + len(tail)

            padded_seq = ('N' * left) + ali_seq + ('N' * (len(refstr) - right))
            padded_ref = refstr[:left] + ali_ref + refstr[right:]
            return (padded_seq, padded_ref)

        _, ali_seq, ali_ref = align_overlap(seqstr, refstr)

        # Columns where the reference is not in a leading/trailing gap run
        first = len(ali_ref) - len(ali_ref.lstrip('-'))
        last = len(ali_ref.rstrip('-'))
        covered = slice(first, last)
        return (ali_seq[covered], ali_ref[covered])
    def align_dna(seqstr, refstr, require_full_cover=True):
        '''Pairwise-align a sequence to a reference, in reference coordinates

        With require_full_cover (default) the two strings are overlap-aligned
        and both are sliced to the span delimited by the gaps at the two ends
        of the aligned reference. Without it they are locally aligned; the
        aligned reference part is searched for in refstr by its first and last
        50 (or fewer) gap-free characters, then the sequence is flanked with
        'N' and the reference with the rest of refstr.

        Returns:
           tuple (aligned sequence, aligned reference)
        '''
        if not require_full_cover:
            (_, seq_ali, ref_ali) = align_local(seqstr, refstr)

            ungapped = ref_ali.replace('-', '')
            prefix, suffix = ungapped[:50], ungapped[-50:]
            # NOTE(review): assumes prefix/suffix are found at the aligned
            # location (no repeats in refstr) -- confirm
            begin = refstr.find(prefix)
            stop = refstr.rfind(suffix) + len(suffix)

            n_left = 'N' * begin
            n_right = 'N' * (len(refstr) - stop)
            return (n_left + seq_ali + n_right,
                    refstr[:begin] + ref_ali + refstr[stop:])

        (_, seq_ali, ref_ali) = align_overlap(seqstr, refstr)

        # Number of gap characters opening the aligned reference
        begin = len(ref_ali) - len(ref_ali.lstrip('-'))
        # Position right after the last non-gap character of the reference
        stop = len(ref_ali.rstrip('-'))
        return (seq_ali[begin:stop], ref_ali[begin:stop])
            # Progress output: the trailing comma of the print statement keeps
            # the output on the same line
            if VERBOSE >= 1:
                print 'PCR', PCR,
                if VERBOSE >= 2:
                    # End the 'PCR' line before the more detailed output below
                    print ''

            # NOTE(review): build_consensus is defined outside this excerpt;
            # its result is used below like a string (len, np.fromstring,
            # align_local) -- confirm
            cons = build_consensus(bamfilename, len_reference, VERBOSE=VERBOSE,
                                   block_len=block_len,
                                   reads_per_alignment=n_reads_per_ali,
                                   deltamax=deltamax)
            # Array of single characters ('S1'), compared with refm below
            consm = np.fromstring(cons, 'S1')

            # Report how much the consensus differs from the reference
            if VERBOSE >= 2:
                print 'Reference length:', len_reference, 'consensus:', len(cons),
                if len_reference != len(cons):
                    # Different lengths: align the two locally and count the
                    # alignment columns that differ
                    from seqanpy import align_local
                    (score, ali1, ali2) = align_local(''.join(refseq), cons)
                    alim1 = np.fromstring(ali1, 'S1')
                    alim2 = np.fromstring(ali2, 'S1')
                    n_diff = (alim1 != alim2).sum()
                    print 'overlap', len(alim1), 'n diffs.', n_diff
                else:
                    # Same length: compare position by position
                    # (refm is defined outside this excerpt)
                    n_diff = (refm != consm).sum()
                    print 'n diffs.', n_diff

            if save_to_file:
                if VERBOSE >= 2:
                    print 'Save to file'

                # Output filename for this fragment and PCR, from the sample
                fn_out = sample.get_consensus_filename(fragment, PCR=PCR)
                consrec = SeqRecord(Seq(cons, ambiguous_dna),
                                    id=samplename+'_consensus',
        muts = mutations.loc[mutations['protein'] == protname]

        # Get the structure
        struc = pa.get_structure(protname, fdn + fns[protname])
        # Get the right chain: the one whose residue sequence has the best
        # local alignment score against the protein sequence
        scores = []
        # Chains that were actually scored, in the same order as scores
        candidates = []
        for chain in struc.get_chains():
            # I is the compound (inhibitors) used for crystallization
            if chain.id == 'I':
                continue
            #print('{:}, chain {:}'.format(protname, chain.id))
            # Residue names are mapped via d3to1; unknown names become 'O'
            seql = [
                d3to1.get(r.get_resname(), 'O') for r in chain.get_residues()
            ]
            seq = ''.join(seql)
            s, a1, a2 = align_local(prot['seq_aa'], seq)
            scores.append(s)
            candidates.append(chain)
        # NOTE: the argmax must run over all scores (not the last score only)
        # and index the scored chains, since skipped chains have no score
        chain = candidates[np.argmax(scores)]
        seql = [d3to1.get(r.get_resname(), 'O') for r in chain.get_residues()]
        seq = ''.join(seql)

        # Flag all mutations
        for m, mut in muts.iterrows():
            mutations.at[m, 'PDB_fn'] = fns[protname]
            mutations.at[m, 'PDB_id'] = fns[protname].split(
                '_')[1].upper().split('.')[0]
            mutations.at[m, 'PDB_chain'] = chain.id

            s, a1, a2 = align_overlap(seq, mut['context_protein'])
            # The focal allele is always small and is the only such letter
            pos_in_context = 4
示例#7
0
    # Each example prints the value returned by the seqanpy call.
    # NOTE: print(output) with a single argument prints the same text as the
    # Python 2 statement "print output" and is also valid on Python 3.

    # Global pairwise alignment (with band=5)
    seq1 = 'AAAGGTCTA'
    seq2 = 'AAATCGA'
    output = sap.align_global(seq1, seq2, band=5)
    print(output)

    # Overlap pairwise alignment
    seq1 = 'AAAGGTCTA'
    seq2 = 'ATCT'
    output = sap.align_overlap(seq1, seq2)
    print(output)

    # Overlap pairwise alignment cutting flanks
    seq1 = 'AAAGGTCTA'
    seq2 = 'ATCT'
    output = sap.align_overlap(seq1, seq2, cut_flanks=True)
    print(output)

    # Ladder pairwise alignment
    seq1 = 'AAAGGTCTA'
    seq2 = 'TCTAGGGAAACCC'
    output = sap.align_ladder(seq1, seq2)
    print(output)

    # Local pairwise alignment
    seq1 = 'AAAGGTCTACCGTAGCCT'
    seq2 = 'AAGTCTAC'
    output = sap.align_local(seq1, seq2)
    print(output)
示例#8
0
                    print ''

            # NOTE(review): build_consensus is defined outside this excerpt;
            # its result is used below like a string (len, np.fromstring,
            # align_local) -- confirm
            cons = build_consensus(bamfilename,
                                   len_reference,
                                   VERBOSE=VERBOSE,
                                   block_len=block_len,
                                   reads_per_alignment=n_reads_per_ali,
                                   deltamax=deltamax)
            # Array of single characters ('S1'), compared with refm below
            consm = np.fromstring(cons, 'S1')

            # Report how much the consensus differs from the reference
            if VERBOSE >= 2:
                # Trailing comma: the report continues on the same line
                print 'Reference length:', len_reference, 'consensus:', len(
                    cons),
                if len_reference != len(cons):
                    # Different lengths: align the two locally and count the
                    # alignment columns that differ
                    from seqanpy import align_local
                    (score, ali1, ali2) = align_local(''.join(refseq), cons)
                    alim1 = np.fromstring(ali1, 'S1')
                    alim2 = np.fromstring(ali2, 'S1')
                    n_diff = (alim1 != alim2).sum()
                    print 'overlap', len(alim1), 'n diffs.', n_diff
                else:
                    # Same length: compare position by position
                    # (refm is defined outside this excerpt)
                    n_diff = (refm != consm).sum()
                    print 'n diffs.', n_diff

            if save_to_file:
                if VERBOSE >= 2:
                    print 'Save to file'

                # Output filename for this fragment and PCR, from the sample
                fn_out = sample.get_consensus_filename(fragment, PCR=PCR)
                consrec = SeqRecord(
                    Seq(cons, ambiguous_dna),