def get_frame(geneseq, gene_HXB2, genename, VERBOSE=0):
    '''Get the reading frame of a gene sequence by aligning the proteins.

    The HXB2 gene is translated once. The query sequence is then translated
    in each of the three forward frames (offsets 0, 1, 2) and every
    translation is locally aligned against the HXB2 protein. The frame with
    the highest alignment score is returned.

    Parameters:
        geneseq: nucleotide sequence of the query gene; a string or any
            iterable of single-character strings (it is joined with '').
        gene_HXB2: nucleotide sequence of the same gene in HXB2, same
            accepted types as geneseq.
        genename: name of the gene. For 'tat1' and 'rev1' the HXB2 sequence
            is trimmed at its end, for 'tat2' and 'rev2' at its start, so
            that its length is a multiple of three before translation.
        VERBOSE: not used by this function; kept so callers can pass it.

    Returns:
        The index (0, 1 or 2) of the best-scoring frame, as returned by
        numpy.argmax (the lowest index wins in case of ties).
    '''
    from seqanpy import align_local
    from Bio.Seq import translate
    from numpy import argmax

    geneseq = ''.join(geneseq)
    gene_HXB2 = ''.join(gene_HXB2)

    # Trim HXB2 to a whole number of codons: from the end for tat1/rev1,
    # from the start for tat2/rev2. Other genes are translated as they are.
    remainder = len(gene_HXB2) % 3
    if genename in ('tat1', 'rev1'):
        gene_HXB2 = gene_HXB2[:len(gene_HXB2) - remainder]
    elif genename in ('tat2', 'rev2'):
        gene_HXB2 = gene_HXB2[remainder:]

    prot_HXB2 = translate(gene_HXB2)

    scores = []
    # range (not xrange) so the function runs on both Python 2 and Python 3
    for frame in range(3):
        # Shift by the frame offset, then drop the trailing partial codon
        candidate = geneseq[frame:]
        candidate = candidate[:len(candidate) - (len(candidate) % 3)]
        # Only the score is needed; the aligned rows are discarded
        score, _, _ = align_local(prot_HXB2, translate(candidate))
        scores.append(score)

    return argmax(scores)
def align_dna(seqstr, refstr, require_full_cover=True):
    '''Align a DNA sequence to a reference and return both aligned rows.

    Parameters:
        seqstr: the sequence to align.
        refstr: the reference to align against.
        require_full_cover: if True, use an overlap alignment and cut both
            rows to the columns between the leading and trailing gap runs
            of the reference row. If False, use a local alignment and pad
            the result back out to the whole reference: the sequence row is
            padded with 'N', the reference row with the reference flanks
            that were left out of the local alignment.

    Returns:
        A tuple (aligned sequence row, aligned reference row).
    '''
    if not require_full_cover:
        _, ali_seq, ali_ref = align_local(seqstr, refstr)

        # Find where the aligned stretch sits in the reference, using its
        # first and last (up to) 50 ungapped characters as anchors
        ref_ungapped = ali_ref.replace('-', '')
        head = ref_ungapped[:50]
        tail = ref_ungapped[-50:]
        left = refstr.find(head)
        right = refstr.rfind(tail) + len(tail)

        padded_seq = ('N' * left) + ali_seq + ('N' * (len(refstr) - right))
        padded_ref = refstr[:left] + ali_ref + refstr[right:]
        return (padded_seq, padded_ref)

    _, ali_seq, ali_ref = align_overlap(seqstr, refstr)

    # Columns where the reference row is not a leading/trailing gap
    first = len(ali_ref) - len(ali_ref.lstrip('-'))
    last = len(ali_ref.rstrip('-'))
    return (ali_seq[first:last], ali_ref[first:last])
def align_dna(seqstr, refstr, require_full_cover=True):
    '''Pairwise-align seqstr to refstr and return the two aligned strings.

    Two modes are available:
      - require_full_cover=True: overlap alignment; both rows are then
        restricted to the columns lying between the leading and the
        trailing run of gaps in the reference row.
      - require_full_cover=False: local alignment; the sequence row is
        extended with 'N' and the reference row with the unaligned parts of
        refstr, on both sides.

    Returns:
        (aligned sequence, aligned reference) as a tuple of strings.
    '''
    if require_full_cover:
        result = align_overlap(seqstr, refstr)
        (score, row_seq, row_ref) = result

        n_leading_gaps = len(row_ref) - len(row_ref.lstrip('-'))
        window = slice(n_leading_gaps, len(row_ref.rstrip('-')))
        out_seq = row_seq[window]
        out_ref = row_ref[window]
    else:
        result = align_local(seqstr, refstr)
        (score, row_seq, row_ref) = result

        # Anchor the local hit in the reference via the first/last (at most)
        # 50 characters of the gap-free reference row
        gapless = row_ref.replace('-', '')
        anchor_end = gapless[-50:]
        pos_start = refstr.find(gapless[:50])
        pos_end = refstr.rfind(anchor_end) + len(anchor_end)

        n_missing_right = len(refstr) - pos_end
        out_seq = ''.join(['N' * pos_start, row_seq, 'N' * n_missing_right])
        out_ref = ''.join([refstr[:pos_start], row_ref, refstr[pos_end:]])

    return (out_seq, out_ref)
# NOTE(review): this excerpt was collapsed onto one line; the nesting below is
# reconstructed from the statement order and should be checked against the
# original file.

# Progress output. The trailing comma (Python 2 print statement) keeps the
# cursor on the same line; at VERBOSE >= 2 the line is ended right away.
if VERBOSE >= 1:
    print 'PCR', PCR,
    if VERBOSE >= 2:
        print ''

# Build the consensus (build_consensus is defined outside this excerpt;
# presumably it works from the reads in bamfilename -- confirm).
cons = build_consensus(bamfilename, len_reference, VERBOSE=VERBOSE,
                       block_len=block_len,
                       reads_per_alignment=n_reads_per_ali,
                       deltamax=deltamax)
# Same consensus as a numpy array of single characters ('S1'), used for the
# elementwise comparison below
consm = np.fromstring(cons, 'S1')

# Verbose report: compare the consensus with the reference
if VERBOSE >= 2:
    print 'Reference length:', len_reference, 'consensus:', len(cons),
    if len_reference != len(cons):
        # Lengths differ, so the strings cannot be compared position by
        # position: align them locally and count the alignment columns in
        # which the two rows differ
        from seqanpy import align_local
        (score, ali1, ali2) = align_local(''.join(refseq), cons)
        alim1 = np.fromstring(ali1, 'S1')
        alim2 = np.fromstring(ali2, 'S1')
        n_diff = (alim1 != alim2).sum()
        print 'overlap', len(alim1), 'n diffs.', n_diff
    else:
        # Same length: count differing positions directly (refm is defined
        # outside this excerpt)
        n_diff = (refm != consm).sum()
        print 'n diffs.', n_diff

# Optionally store the consensus as a sequence record
if save_to_file:
    if VERBOSE >= 2:
        print 'Save to file'
    fn_out = sample.get_consensus_filename(fragment, PCR=PCR)
    # The SeqRecord call continues beyond the end of this excerpt
    consrec = SeqRecord(Seq(cons, ambiguous_dna),
                        id=samplename+'_consensus',
# Restrict the mutation table to the rows of the current protein
muts = mutations.loc[mutations['protein'] == protname]

# Get the structure
struc = pa.get_structure(protname, fdn + fns[protname])

# Get the right chain: align every candidate chain to the protein sequence
# and keep the best-scoring one. The chains are stored alongside their
# scores so that the argmax index refers to the same filtered list (chains
# that are skipped below must not shift the index of the winner).
scores = []
chains = []
for chain in struc.get_chains():
    # I is the compound (inhibitors) used for crystallization
    if chain.id == 'I':
        continue

    # One-letter sequence of the chain; residues unknown to d3to1 become 'O'
    seql = [d3to1.get(r.get_resname(), 'O') for r in chain.get_residues()]
    seq = ''.join(seql)
    s, a1, a2 = align_local(prot['seq_aa'], seq)
    scores.append(s)
    chains.append(chain)

# argmax over the whole list of scores (not over the last score alone);
# np.argmax raises ValueError if no candidate chain was found
chain = chains[np.argmax(scores)]
seql = [d3to1.get(r.get_resname(), 'O') for r in chain.get_residues()]
seq = ''.join(seql)

# Flag all mutations
for m, mut in muts.iterrows():
    mutations.at[m, 'PDB_fn'] = fns[protname]
    # PDB id: second '_'-separated field of the file name, uppercased,
    # without anything after the first '.'
    mutations.at[m, 'PDB_id'] = fns[protname].split(
        '_')[1].upper().split('.')[0]
    mutations.at[m, 'PDB_chain'] = chain.id

    # Align the chain sequence with the protein context of the mutation
    s, a1, a2 = align_overlap(seq, mut['context_protein'])
    # The focal allele is always small and is the only such letter
    pos_in_context = 4
# Usage examples for the pairwise alignment functions of `sap`. Each section
# aligns two short sequences and prints whatever the function returns.
# print(output) with a single argument prints the same text on Python 2 and
# is also valid on Python 3.

# Global pairwise alignment
seq1 = 'AAAGGTCTA'
seq2 = 'AAATCGA'
output = sap.align_global(seq1, seq2, band=5)
print(output)

# Overlap pairwise alignment
seq1 = 'AAAGGTCTA'
seq2 = 'ATCT'
output = sap.align_overlap(seq1, seq2)
print(output)

# Overlap pairwise alignment cutting flanks
seq1 = 'AAAGGTCTA'
seq2 = 'ATCT'
output = sap.align_overlap(seq1, seq2, cut_flanks=True)
print(output)

# Ladder pairwise alignment
seq1 = 'AAAGGTCTA'
seq2 = 'TCTAGGGAAACCC'
output = sap.align_ladder(seq1, seq2)
print(output)

# Local pairwise alignment
seq1 = 'AAAGGTCTACCGTAGCCT'
seq2 = 'AAGTCTAC'
output = sap.align_local(seq1, seq2)
print(output)
# NOTE(review): this excerpt starts mid-block and was collapsed onto one
# line; the condition guarding the first print is not visible and the nesting
# below is reconstructed from the statement order.

# End the current output line
print ''

# Build the consensus (build_consensus is defined outside this excerpt;
# presumably it works from the reads in bamfilename -- confirm).
cons = build_consensus(bamfilename, len_reference, VERBOSE=VERBOSE,
                       block_len=block_len,
                       reads_per_alignment=n_reads_per_ali,
                       deltamax=deltamax)
# Same consensus as a numpy array of single characters ('S1'), used for the
# elementwise comparison below
consm = np.fromstring(cons, 'S1')

# Verbose report: compare the consensus with the reference. The trailing
# comma (Python 2 print statement) keeps the output on the same line.
if VERBOSE >= 2:
    print 'Reference length:', len_reference, 'consensus:', len(
        cons),
    if len_reference != len(cons):
        # Lengths differ, so the strings cannot be compared position by
        # position: align them locally and count the alignment columns in
        # which the two rows differ
        from seqanpy import align_local
        (score, ali1, ali2) = align_local(''.join(refseq), cons)
        alim1 = np.fromstring(ali1, 'S1')
        alim2 = np.fromstring(ali2, 'S1')
        n_diff = (alim1 != alim2).sum()
        print 'overlap', len(alim1), 'n diffs.', n_diff
    else:
        # Same length: count differing positions directly (refm is defined
        # outside this excerpt)
        n_diff = (refm != consm).sum()
        print 'n diffs.', n_diff

# Optionally store the consensus as a sequence record
if save_to_file:
    if VERBOSE >= 2:
        print 'Save to file'
    fn_out = sample.get_consensus_filename(fragment, PCR=PCR)
    # The SeqRecord call continues beyond the end of this excerpt
    consrec = SeqRecord(
        Seq(cons, ambiguous_dna),