def _force_alphabet(record_iterator, alphabet):
    """Yield records with their sequence alphabet replaced (PRIVATE).

    For each record drawn from ``record_iterator`` the base alphabet of
    ``record.seq.alphabet`` must be an instance of the class of the base
    alphabet of ``alphabet``.  A compatible record has ``seq.alphabet``
    overwritten with ``alphabet`` and is then yielded; the first
    incompatible record raises ValueError.
    """
    # The alphabet argument is assumed to have been validated by the caller.
    expected_cls = _get_base_alphabet(alphabet).__class__
    for rec in record_iterator:
        if not isinstance(_get_base_alphabet(rec.seq.alphabet), expected_cls):
            raise ValueError("Specified alphabet %r clashes with "
                             "that determined from the file, %r"
                             % (alphabet, rec.seq.alphabet))
        rec.seq.alphabet = alphabet
        yield rec
def _force_alphabet(record_iterator, alphabet):
    """Generate the given records after overriding their alphabet (PRIVATE).

    The alphabet already attached to each record must share a base class
    with ``alphabet`` (both are compared through ``_get_base_alphabet``);
    otherwise ValueError is raised when that record is reached.
    """
    # Pre-validation of the alphabet argument is the caller's job.
    target_class = _get_base_alphabet(alphabet).__class__
    for entry in record_iterator:
        compatible = isinstance(_get_base_alphabet(entry.seq.alphabet),
                                target_class)
        if not compatible:
            message = ("Specified alphabet %s clashes with "
                       "that determined from the file, %s"
                       % (repr(alphabet), repr(entry.seq.alphabet)))
            raise ValueError(message)
        entry.seq.alphabet = alphabet
        yield entry
def _force_alphabet(alignment_iterator, alphabet):
    """Yield alignments with the alphabet replaced throughout (PRIVATE).

    Each alignment's ``_alphabet`` is checked first, then every record's
    ``seq.alphabet`` in turn; all must share the base class of
    ``alphabet`` (compared through ``_get_base_alphabet``).  Records are
    updated one by one as they pass the check, then the alignment's own
    ``_alphabet`` is set and the alignment is yielded.  The first clash
    raises ValueError.
    """
    # The alphabet argument is assumed to have been validated by the caller.
    wanted_cls = _get_base_alphabet(alphabet).__class__
    clash_template = ("Specified alphabet %s clashes with "
                      "that determined from the file, %s")
    for alignment in alignment_iterator:
        if not isinstance(_get_base_alphabet(alignment._alphabet), wanted_cls):
            raise ValueError(clash_template
                             % (repr(alphabet), repr(alignment._alphabet)))
        for row in alignment:
            if not isinstance(_get_base_alphabet(row.seq.alphabet),
                              wanted_cls):
                raise ValueError(clash_template
                                 % (repr(alphabet), repr(row.seq.alphabet)))
            row.seq.alphabet = alphabet
        alignment._alphabet = alphabet
        yield alignment
print("Name: %s" % cur_record.name) print("Description %s" % cur_record.description) print("Annotations***") ann_keys = sorted(cur_record.annotations) for ann_key in ann_keys: if ann_key != 'references': print("Key: %s" % ann_key) print("Value: %s" % cur_record.annotations[ann_key]) else: print("References*") for reference in cur_record.annotations[ann_key]: print(str(reference)) print("Features") for feature in cur_record.features: print(feature) if isinstance(_get_base_alphabet(cur_record.seq.alphabet), ProteinAlphabet): assert feature.strand is None else: # Assuming no mixed strand examples... assert feature.strand is not None print("DB cross refs %s" % cur_record.dbxrefs) elif isinstance(parser, GenBank.RecordParser): print("***Record from %s with the RecordParser" % filename.split(os.path.sep)[-1]) print("sequence length: %i" % len(cur_record.sequence)) print("locus: %s" % cur_record.locus) print("definition: %s" % cur_record.definition) print("accession: %s" % cur_record.accession) for reference in cur_record.references: print("reference title: %s" % reference.title)
print("Name: %s" % cur_record.name) print("Description %s" % cur_record.description) print("Annotations***") ann_keys = sorted(cur_record.annotations) for ann_key in ann_keys: if ann_key != "references": print("Key: %s" % ann_key) print("Value: %s" % cur_record.annotations[ann_key]) else: print("References*") for reference in cur_record.annotations[ann_key]: print(str(reference)) print("Feaures") for feature in cur_record.features: print(feature) if isinstance(_get_base_alphabet(cur_record.seq.alphabet), ProteinAlphabet): assert feature.strand is None else: # Assuming no mixed strand examples... assert feature.strand is not None print("DB cross refs %s" % cur_record.dbxrefs) elif isinstance(parser, GenBank.RecordParser): print("***Record from %s with the RecordParser" % filename.split(os.path.sep)[-1]) print("sequence length: %i" % len(cur_record.sequence)) print("locus: %s" % cur_record.locus) print("definition: %s" % cur_record.definition) print("accession: %s" % cur_record.accession) for reference in cur_record.references: print("reference title: %s" % reference.title) for feature in cur_record.features:
def build(pro_align, nucl_seqs, corr_dict=None, gap_char='-', unknown='X',
          codon_table=default_codon_table, alphabet=None,
          complete_protein=False, anchor_len=10, max_score=10):
    """Build a codon alignment from protein alignment and corresponding nucleotides.

    Arguments:
     - pro_align  - a protein MultipleSeqAlignment object
     - nucl_seqs  - an object returned by SeqIO.parse or SeqIO.index
       or a collection of SeqRecord.
     - corr_dict  - a dict that maps protein id to nucleotide id
     - gap_char   - gap character, forwarded to the codon alphabet
       and to the protein/nucleotide correspondence check
     - unknown    - accepted for compatibility; not used by this function
     - codon_table - codon table used for the correspondence check and
       for building the codon records
     - alphabet   - alphabet for the returned codon alignment; derived
       from codon_table and gap_char when None
     - complete_protein - whether the sequence begins with a start
       codon
     - anchor_len - anchor length forwarded to the correspondence check
     - max_score  - forwarded to the codon record builder

    Return a CodonAlignment object

    Raises TypeError for a non-MultipleSeqAlignment or non-protein
    pro_align, a corr_dict that is not a dict, or an unsupported
    nucl_seqs container.  Raises ValueError when there are fewer
    nucleotide records than protein records, when a protein id has no
    nucleotide partner (including a protein id missing from corr_dict),
    or when a protein/nucleotide pair does not match.  Raises
    RuntimeError when corr_dict has fewer items than pro_align.

    >>> from Bio.Alphabet import IUPAC
    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> from Bio.Align import MultipleSeqAlignment
    >>> seq1 = SeqRecord(Seq('TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG',
    ...     alphabet=IUPAC.IUPACUnambiguousDNA()), id='pro1')
    >>> seq2 = SeqRecord(Seq('TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG',
    ...     alphabet=IUPAC.IUPACUnambiguousDNA()), id='pro2')
    >>> pro1 = SeqRecord(Seq('SGTARTKLLLLLAALCAAGGALE', alphabet=IUPAC.protein),id='pro1')
    >>> pro2 = SeqRecord(Seq('SGTSRTKRLLLLAALGAAGGALE', alphabet=IUPAC.protein),id='pro2')
    >>> aln = MultipleSeqAlignment([pro1, pro2])
    >>> codon_aln = build(aln, [seq1, seq2])
    >>> print(codon_aln)
    CodonAlphabet(Standard) CodonAlignment with 2 rows and 69 columns (23 codons)
    TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGT...GAG pro1
    TCAGGGACTTCGAGAACCAAGCG-CTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGT...GAG pro2

    """
    # TODO
    # add an option to allow the user to specify the returned object?

    from Bio.Alphabet import ProteinAlphabet
    from Bio.Align import MultipleSeqAlignment

    # check the type of object of pro_align
    if not isinstance(pro_align, MultipleSeqAlignment):
        raise TypeError("the first argument should be a MultipleSeqAlignment "
                        "object")
    # check the alphabet of pro_align
    for pro in pro_align:
        if not isinstance(_get_base_alphabet(pro.seq.alphabet),
                          ProteinAlphabet):
            raise TypeError("Alphabet Error!\nThe input alignment should be "
                            "a *PROTEIN* alignemnt, found %r"
                            % pro.seq.alphabet)
    if alphabet is None:
        alphabet = _get_codon_alphabet(codon_table, gap_char=gap_char)

    pro_num = len(pro_align)
    mapping_types = ("_IndexedSeqFileDict", "dict")
    sequence_types = ("list", "tuple")

    if corr_dict is None:
        # Without corr_dict the pairing is positional for list/tuple (or
        # SeqIO.parse() generators) and by record id for dict /
        # SeqIO.index() objects.
        if nucl_seqs.__class__.__name__ == "generator":
            # nucl_seqs will be a tuple if read by SeqIO.parse()
            nucl_seqs = tuple(nucl_seqs)
        nucl_num = len(nucl_seqs)
        if pro_num > nucl_num:
            raise ValueError("Higher Number of SeqRecords in Protein Alignment "
                             "({0}) than the Number of Nucleotide SeqRecords "
                             "({1}) are found!".format(pro_num, nucl_num))
        nucl_type = nucl_seqs.__class__.__name__
        if nucl_type in mapping_types:
            # keyword pairing: every protein id needs a nucleotide record
            missing = set(i.id for i in pro_align) - set(nucl_seqs.keys())
            if missing:
                # sorted() keeps the message deterministic
                raise ValueError("Protein Record {0} cannot find a nucleotide "
                                 "sequence match, please check the "
                                 "id".format(', '.join(sorted(missing))))
            pro_nucl_pair = [(pro_rec, nucl_seqs[pro_rec.id])
                             for pro_rec in pro_align]
        elif nucl_type in sequence_types:
            # consecutive pairing
            pro_nucl_pair = zip(pro_align, nucl_seqs)
        else:
            raise TypeError("Nucl Sequences Error, Unknown type to assign "
                            "correspondence method")
    else:
        if not isinstance(corr_dict, dict):
            raise TypeError("corr_dict should be a dict that corresponds "
                            "protein id to nucleotide id!")
        if len(corr_dict) < pro_num:
            raise RuntimeError("Number of items in corr_dict ({0}) is less "
                               "than number of protein records "
                               "({1})".format(len(corr_dict), pro_num))
        nucl_type = nucl_seqs.__class__.__name__
        if nucl_type == "generator":
            # read by SeqIO.parse()
            from Bio import SeqIO
            nucl_seqs = SeqIO.to_dict(nucl_seqs)
        elif nucl_type in sequence_types:
            nucl_seqs = dict((i.id, i) for i in nucl_seqs)
        elif nucl_type not in mapping_types:
            raise TypeError("Nucl Sequences Error, Unknown type of "
                            "Nucleotide Records!")
        # dict pairing
        pro_nucl_pair = []
        for pro_rec in pro_align:
            try:
                nucl_id = corr_dict[pro_rec.id]
            except KeyError:
                # Library code must not print and terminate the
                # interpreter; report the problem as an exception so the
                # caller can handle it.
                raise ValueError("Protein record (%s) is not in corr_dict!"
                                 % pro_rec.id)
            pro_nucl_pair.append((pro_rec, nucl_seqs[nucl_id]))

    codon_aln = []
    shift = False
    for pro_rec, nucl_rec in pro_nucl_pair:
        # Beware that the following span corresponds to an ungapped
        # nucleotide sequence.
        corr_span = _check_corr(pro_rec, nucl_rec, gap_char=gap_char,
                                codon_table=codon_table,
                                complete_protein=complete_protein,
                                anchor_len=anchor_len)
        if not corr_span:
            raise ValueError("Protein Record {0} and Nucleotide Record {1} do"
                             " not match!".format(pro_rec.id, nucl_rec.id))
        # NOTE(review): complete_protein is passed as a literal False
        # here rather than the caller's value - confirm this is intended.
        codon_rec = _get_codon_rec(pro_rec, nucl_rec, corr_span,
                                   alphabet=alphabet,
                                   complete_protein=False,
                                   codon_table=codon_table,
                                   max_score=max_score)
        codon_aln.append(codon_rec)
        # a mode of 2 in the correspondence result marks a frameshift
        if corr_span[1] == 2:
            shift = True
    if shift:
        return CodonAlignment(_align_shift_recs(codon_aln), alphabet=alphabet)
    return CodonAlignment(codon_aln, alphabet=alphabet)
def _check_corr(pro, nucl, gap_char='-', codon_table=default_codon_table,
                complete_protein=False, anchor_len=10):
    """Check if the nucleotide can be translated into the protein (PRIVATE).

    Expects two SeqRecord objects.

    Arguments:
     - pro  - protein SeqRecord; residues equal to gap_char are skipped
     - nucl - nucleotide SeqRecord; its sequence is upper-cased and
       ungapped (gap_char) before any matching
     - gap_char - gap character removed from both sequences
     - codon_table - passed to _get_aa_regex to obtain the per-residue
       regular expression fragments; its start_codons are used for the
       first anchor when complete_protein is true
     - complete_protein - switches the first anchor to the start codon
       pattern
     - anchor_len - number of residues per anchor in the fallback search

    Returns a tuple whose first item is the span of the regular
    expression match in the ungapped nucleotide string and whose second
    item is the mode:

     - ``(span, 0)`` - the pattern for the whole protein matched directly
     - ``(span, 1)`` - the pattern joined from the anchors matched
     - ``(span, 2, match)`` - matched only after anchor patterns were
       replaced by _get_shift_anchor_re results; note the extra third
       item (the match object) in this mode only

    Raises TypeError if either argument is not a SeqRecord or if the
    nucleotide alphabet is not a NucleotideAlphabet, and RuntimeError
    if no match is found in any mode.

    NOTE(review): the fallback path indexes ``anchors[-1]``/``anchors[-2]``
    and ``anchor_pos[0]``/``anchor_pos[-1]`` without checking length, so
    an ungapped protein giving fewer than two anchors with a short last
    one, or a search in which no anchor matches, raises IndexError.
    """
    import re
    from Bio.Alphabet import NucleotideAlphabet

    if not isinstance(pro, SeqRecord) or not isinstance(nucl, SeqRecord):
        raise TypeError("_check_corr accepts two SeqRecord object. Please "
                        "check your input.")

    # Unwrap nested objects until one without an ``alphabet`` attribute
    # is reached.
    def get_alpha(alpha):
        if hasattr(alpha, 'alphabet'):
            return get_alpha(alpha.alphabet)
        else:
            return alpha

    if not isinstance(_get_base_alphabet(get_alpha(nucl.seq.alphabet)),
                      NucleotideAlphabet):
        raise TypeError("Alphabet for nucl should be an instance of "
                        "NucleotideAlphabet, {0} "
                        "detected".format(str(nucl.seq.alphabet)))

    # Build one regular expression for the whole (ungapped) protein.
    aa2re = _get_aa_regex(codon_table)
    pro_re = ""
    for aa in pro.seq:
        if aa != gap_char:
            pro_re += aa2re[aa]

    nucl_seq = str(nucl.seq.upper().ungap(gap_char))
    match = re.search(pro_re, nucl_seq)
    if match:
        # mode = 0, direct match
        return (match.span(), 0)
    else:
        # Might caused by mismatches or frameshift, using anchors to
        # have a try
        # anchor_len = 10 # adjust this value to test performance
        pro_seq = str(pro.seq).replace(gap_char, "")
        anchors = [pro_seq[i:(i + anchor_len)] for i in
                   range(0, len(pro_seq), anchor_len)]
        # if the last anchor is less than the specified anchor
        # size, we combine the penultimate and the last anchor
        # together as the last one.
        # TODO: modify this to deal with short sequence with only
        # one anchor.
        if len(anchors[-1]) < anchor_len:
            anchors[-1] = anchors[-2] + anchors[-1]

        # From here on pro_re is a list with one pattern per anchor.
        pro_re = []
        anchor_distance = 0  # assigned but never read in this function
        anchor_pos = []
        for i, anchor in enumerate(anchors):
            this_anchor_len = len(anchor)
            qcodon = ""
            fncodon = ""
            # dirty code to deal with the last anchor
            # as the last anchor is combined in the steps
            # above, we need to get the true last anchor to
            # pro_re
            if this_anchor_len == anchor_len:
                for aa in anchor:
                    # NOTE(review): ``i`` is the anchor index, so this
                    # branch is taken for every residue of the first
                    # anchor, not only for its first residue - confirm.
                    if complete_protein and i == 0:
                        qcodon += _codons2re(codon_table.start_codons)
                        fncodon += aa2re['X']
                        continue
                    qcodon += aa2re[aa]
                    fncodon += aa2re['X']
                match = re.search(qcodon, nucl_seq)
            elif this_anchor_len > anchor_len:
                # Merged last anchor: only the residues after the first
                # anchor_len (the original short tail) are used.
                last_qcodon = ""
                last_fcodon = ""
                for j in range(anchor_len, len(anchor)):
                    last_qcodon += aa2re[anchor[j]]
                    last_fcodon += aa2re['X']
                match = re.search(last_qcodon, nucl_seq)
            # build full_pro_re from anchors
            if match:
                # record (start, end, anchor index) of each matched anchor
                anchor_pos.append((match.start(), match.end(), i))
                if this_anchor_len == anchor_len:
                    pro_re.append(qcodon)
                else:
                    pro_re.append(last_qcodon)
            else:
                # unmatched anchors fall back to the aa2re['X'] pattern
                if this_anchor_len == anchor_len:
                    pro_re.append(fncodon)
                else:
                    pro_re.append(last_fcodon)
        full_pro_re = "".join(pro_re)
        match = re.search(full_pro_re, nucl_seq)
        if match:
            # mode = 1, mismatch
            return (match.span(), 1)
        else:
            # check frames of anchors
            # ten frameshift events are allowed in a sequence
            first_anchor = True  # always True at the check below
            shift_id_pos = 0
            # check the first anchor
            if first_anchor and anchor_pos[0][2] != 0:
                # the trailing 0 is a sentinel: reaching it means no
                # candidate shift produced a usable pattern
                shift_val_lst = [1, 2, 3 * anchor_len - 2, 3 * anchor_len - 1, 0]
                sh_anc = anchors[0]
                for shift_val in shift_val_lst:
                    if shift_val == 0:
                        qcodon = None
                        break
                    if shift_val in (1, 2):
                        sh_nuc_len = anchor_len * 3 + shift_val
                    elif shift_val in (3 * anchor_len - 2, 3 * anchor_len - 1):
                        sh_nuc_len = anchor_len * 3 - (3 * anchor_len - shift_val)
                    if anchor_pos[0][0] >= sh_nuc_len:
                        sh_nuc = nucl_seq[anchor_pos[0][0] - sh_nuc_len:anchor_pos[0][0]]
                    else:
                        # this is unlikely to produce the correct output
                        sh_nuc = nucl_seq[:anchor_pos[0][0]]
                    qcodon, shift_id_pos = _get_shift_anchor_re(sh_anc, sh_nuc,
                                                                shift_val,
                                                                aa2re,
                                                                anchor_len,
                                                                shift_id_pos)
                    if qcodon is not None and qcodon != -1:
                        # pro_re[0] should be '.'*anchor_len, therefore I
                        # replace it.
                        pro_re[0] = qcodon
                        break
                if qcodon == -1:
                    warnings.warn("first frameshift detection failed for "
                                  "{0}".format(nucl.id), BiopythonWarning)
            # check anchors in the middle
            for i in range(len(anchor_pos) - 1):
                # distance between consecutive matched anchors, modulo
                # the nucleotide length of one anchor
                shift_val = (anchor_pos[i + 1][0] - anchor_pos[i][0]) % \
                            (3 * anchor_len)
                sh_anc = "".join(anchors[anchor_pos[i][2]:anchor_pos[i + 1][2]])
                sh_nuc = nucl_seq[anchor_pos[i][0]:anchor_pos[i + 1][0]]
                qcodon = None
                if shift_val != 0:
                    qcodon, shift_id_pos = _get_shift_anchor_re(sh_anc, sh_nuc,
                                                                shift_val,
                                                                aa2re,
                                                                anchor_len,
                                                                shift_id_pos)
                if qcodon is not None and qcodon != -1:
                    # replaces the patterns of all anchors in the range
                    # with one pattern, which shortens pro_re when the
                    # range covers more than one anchor
                    pro_re[anchor_pos[i][2]:anchor_pos[i + 1][2]] = [qcodon]
                    qcodon = None
                elif qcodon == -1:
                    warnings.warn("middle frameshift detection failed for "
                                  "{0}".format(nucl.id), BiopythonWarning)
            # check the last anchor
            if anchor_pos[-1][2] + 1 == len(anchors) - 1:
                sh_anc = anchors[-1]
                this_anchor_len = len(sh_anc)
                shift_val_lst = [1, 2, 3 * this_anchor_len - 2, 3 * this_anchor_len - 1, 0]
                for shift_val in shift_val_lst:
                    if shift_val == 0:
                        qcodon = None
                        break
                    if shift_val in (1, 2):
                        sh_nuc_len = this_anchor_len * 3 + shift_val
                    elif shift_val in \
                            (3 * this_anchor_len - 2, 3 * this_anchor_len - 1):
                        sh_nuc_len = this_anchor_len * 3 - (3 * this_anchor_len - shift_val)
                    if len(nucl_seq) - anchor_pos[-1][0] >= sh_nuc_len:
                        sh_nuc = nucl_seq[anchor_pos[-1][0]:anchor_pos[-1][0] + sh_nuc_len]
                    else:
                        # this is unlikely to produce the correct output
                        sh_nuc = nucl_seq[anchor_pos[-1][0]:]
                    qcodon, shift_id_pos = _get_shift_anchor_re(sh_anc, sh_nuc,
                                                                shift_val,
                                                                aa2re,
                                                                this_anchor_len,
                                                                shift_id_pos)
                    if qcodon is not None and qcodon != -1:
                        # drop the final pattern and overwrite the one
                        # before it with the combined pattern
                        pro_re.pop()
                        pro_re[-1] = qcodon
                        break
                if qcodon == -1:
                    warnings.warn("last frameshift detection failed for "
                                  "{0}".format(nucl.id), BiopythonWarning)
            # try global match
            full_pro_re = "".join(pro_re)
            match = re.search(full_pro_re, nucl_seq)
            if match:
                # mode = 2; the match object is returned as a third item
                return (match.span(), 2, match)
            else:
                raise RuntimeError("Protein SeqRecord ({0}) and Nucleotide "
                                   "SeqRecord ({1}) do not "
                                   "match!".format(pro.id, nucl.id))
def build(pro_align, nucl_seqs, corr_dict=None, gap_char='-', unknown='X',
          codon_table=default_codon_table, alphabet=None,
          complete_protein=False, anchor_len=10, max_score=10):
    """Build a codon alignment from protein alignment and corresponding nucleotides.

    Arguments:
     - pro_align  - a protein MultipleSeqAlignment object
     - nucl_seqs  - an object returned by SeqIO.parse or SeqIO.index
       or a collection of SeqRecord.
     - corr_dict  - a dict that maps protein id to nucleotide id
     - gap_char   - gap character, forwarded to the codon alphabet
       and to the protein/nucleotide correspondence check
     - unknown    - accepted for compatibility; not used by this function
     - codon_table - codon table used for the correspondence check and
       for building the codon records
     - alphabet   - alphabet for the returned codon alignment; derived
       from codon_table and gap_char when None
     - complete_protein - whether the sequence begins with a start
       codon
     - anchor_len - anchor length forwarded to the correspondence check
     - max_score  - forwarded to the codon record builder

    Return a CodonAlignment object

    Raises TypeError for a non-MultipleSeqAlignment or non-protein
    pro_align, a corr_dict that is not a dict, or an unsupported
    nucl_seqs container.  Raises ValueError when there are fewer
    nucleotide records than protein records, when a protein id has no
    nucleotide partner (including a protein id missing from corr_dict),
    or when a protein/nucleotide pair does not match.  Raises
    RuntimeError when corr_dict has fewer items than pro_align.

    >>> from Bio.Alphabet import IUPAC
    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> from Bio.Align import MultipleSeqAlignment
    >>> seq1 = SeqRecord(Seq('TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG',
    ...     alphabet=IUPAC.IUPACUnambiguousDNA()), id='pro1')
    >>> seq2 = SeqRecord(Seq('TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG',
    ...     alphabet=IUPAC.IUPACUnambiguousDNA()), id='pro2')
    >>> pro1 = SeqRecord(Seq('SGTARTKLLLLLAALCAAGGALE', alphabet=IUPAC.protein),id='pro1')
    >>> pro2 = SeqRecord(Seq('SGTSRTKRLLLLAALGAAGGALE', alphabet=IUPAC.protein),id='pro2')
    >>> aln = MultipleSeqAlignment([pro1, pro2])
    >>> codon_aln = build(aln, [seq1, seq2])
    >>> print(codon_aln)
    CodonAlphabet(Standard) CodonAlignment with 2 rows and 69 columns (23 codons)
    TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGT...GAG pro1
    TCAGGGACTTCGAGAACCAAGCG-CTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGT...GAG pro2

    """
    # TODO
    # add an option to allow the user to specify the returned object?

    from Bio.Alphabet import ProteinAlphabet
    from Bio.Align import MultipleSeqAlignment

    # check the type of object of pro_align
    if not isinstance(pro_align, MultipleSeqAlignment):
        raise TypeError("the first argument should be a MultipleSeqAlignment "
                        "object")
    # check the alphabet of pro_align
    for pro in pro_align:
        if not isinstance(_get_base_alphabet(pro.seq.alphabet),
                          ProteinAlphabet):
            raise TypeError("Alphabet Error!\nThe input alignment should be "
                            "a *PROTEIN* alignemnt, found %r"
                            % pro.seq.alphabet)
    if alphabet is None:
        alphabet = _get_codon_alphabet(codon_table, gap_char=gap_char)

    pro_num = len(pro_align)
    mapping_types = ("_IndexedSeqFileDict", "dict")
    sequence_types = ("list", "tuple")

    if corr_dict is None:
        # Without corr_dict the pairing is positional for list/tuple (or
        # SeqIO.parse() generators) and by record id for dict /
        # SeqIO.index() objects.
        if nucl_seqs.__class__.__name__ == "generator":
            # nucl_seqs will be a tuple if read by SeqIO.parse()
            nucl_seqs = tuple(nucl_seqs)
        nucl_num = len(nucl_seqs)
        if pro_num > nucl_num:
            raise ValueError(
                "Higher Number of SeqRecords in Protein Alignment "
                "({0}) than the Number of Nucleotide SeqRecords "
                "({1}) are found!".format(pro_num, nucl_num))
        nucl_type = nucl_seqs.__class__.__name__
        if nucl_type in mapping_types:
            # keyword pairing: every protein id needs a nucleotide record
            missing = set(i.id for i in pro_align) - set(nucl_seqs.keys())
            if missing:
                # sorted() keeps the message deterministic
                raise ValueError("Protein Record {0} cannot find a nucleotide "
                                 "sequence match, please check the "
                                 "id".format(', '.join(sorted(missing))))
            pro_nucl_pair = [(pro_rec, nucl_seqs[pro_rec.id])
                             for pro_rec in pro_align]
        elif nucl_type in sequence_types:
            # consecutive pairing
            pro_nucl_pair = zip(pro_align, nucl_seqs)
        else:
            raise TypeError("Nucl Sequences Error, Unknown type to assign "
                            "correspondence method")
    else:
        if not isinstance(corr_dict, dict):
            raise TypeError("corr_dict should be a dict that corresponds "
                            "protein id to nucleotide id!")
        if len(corr_dict) < pro_num:
            raise RuntimeError("Number of items in corr_dict ({0}) is less "
                               "than number of protein records "
                               "({1})".format(len(corr_dict), pro_num))
        nucl_type = nucl_seqs.__class__.__name__
        if nucl_type == "generator":
            # read by SeqIO.parse()
            from Bio import SeqIO
            nucl_seqs = SeqIO.to_dict(nucl_seqs)
        elif nucl_type in sequence_types:
            nucl_seqs = dict((i.id, i) for i in nucl_seqs)
        elif nucl_type not in mapping_types:
            raise TypeError("Nucl Sequences Error, Unknown type of "
                            "Nucleotide Records!")
        # dict pairing
        pro_nucl_pair = []
        for pro_rec in pro_align:
            try:
                nucl_id = corr_dict[pro_rec.id]
            except KeyError:
                # Library code must not print and terminate the
                # interpreter; report the problem as an exception so the
                # caller can handle it.
                raise ValueError("Protein record (%s) is not in corr_dict!"
                                 % pro_rec.id)
            pro_nucl_pair.append((pro_rec, nucl_seqs[nucl_id]))

    codon_aln = []
    shift = False
    for pro_rec, nucl_rec in pro_nucl_pair:
        # Beware that the following span corresponds to an ungapped
        # nucleotide sequence.
        corr_span = _check_corr(pro_rec, nucl_rec, gap_char=gap_char,
                                codon_table=codon_table,
                                complete_protein=complete_protein,
                                anchor_len=anchor_len)
        if not corr_span:
            raise ValueError("Protein Record {0} and Nucleotide Record {1} do"
                             " not match!".format(pro_rec.id, nucl_rec.id))
        # NOTE(review): complete_protein is passed as a literal False
        # here rather than the caller's value - confirm this is intended.
        codon_rec = _get_codon_rec(pro_rec, nucl_rec, corr_span,
                                   alphabet=alphabet,
                                   complete_protein=False,
                                   codon_table=codon_table,
                                   max_score=max_score)
        codon_aln.append(codon_rec)
        # a mode of 2 in the correspondence result marks a frameshift
        if corr_span[1] == 2:
            shift = True
    if shift:
        return CodonAlignment(_align_shift_recs(codon_aln), alphabet=alphabet)
    return CodonAlignment(codon_aln, alphabet=alphabet)
def _check_corr(pro, nucl, gap_char='-', codon_table=default_codon_table,
                complete_protein=False, anchor_len=10):
    """Check if the nucleotide can be translated into the protein (PRIVATE).

    Expects two SeqRecord objects.

    Arguments:
     - pro  - protein SeqRecord; residues equal to gap_char are skipped
     - nucl - nucleotide SeqRecord; its sequence is upper-cased and
       ungapped (gap_char) before any matching
     - gap_char - gap character removed from both sequences
     - codon_table - passed to _get_aa_regex to obtain the per-residue
       regular expression fragments; its start_codons are used for the
       first anchor when complete_protein is true
     - complete_protein - switches the first anchor to the start codon
       pattern
     - anchor_len - number of residues per anchor in the fallback search

    Returns a tuple whose first item is the span of the regular
    expression match in the ungapped nucleotide string and whose second
    item is the mode:

     - ``(span, 0)`` - the pattern for the whole protein matched directly
     - ``(span, 1)`` - the pattern joined from the anchors matched
     - ``(span, 2, match)`` - matched only after anchor patterns were
       replaced by _get_shift_anchor_re results; note the extra third
       item (the match object) in this mode only

    Raises TypeError if either argument is not a SeqRecord or if the
    nucleotide alphabet is not a NucleotideAlphabet, and RuntimeError
    if no match is found in any mode.

    NOTE(review): the fallback path indexes ``anchors[-1]``/``anchors[-2]``
    and ``anchor_pos[0]``/``anchor_pos[-1]`` without checking length, so
    an ungapped protein giving fewer than two anchors with a short last
    one, or a search in which no anchor matches, raises IndexError.
    """
    import re
    from Bio.Alphabet import NucleotideAlphabet

    if not isinstance(pro, SeqRecord) or not isinstance(nucl, SeqRecord):
        raise TypeError("_check_corr accepts two SeqRecord object. Please "
                        "check your input.")

    # Unwrap nested objects until one without an ``alphabet`` attribute
    # is reached.
    def get_alpha(alpha):
        if hasattr(alpha, 'alphabet'):
            return get_alpha(alpha.alphabet)
        else:
            return alpha

    if not isinstance(_get_base_alphabet(get_alpha(nucl.seq.alphabet)),
                      NucleotideAlphabet):
        raise TypeError("Alphabet for nucl should be an instance of "
                        "NucleotideAlphabet, {0} "
                        "detected".format(str(nucl.seq.alphabet)))

    # Build one regular expression for the whole (ungapped) protein.
    aa2re = _get_aa_regex(codon_table)
    pro_re = ""
    for aa in pro.seq:
        if aa != gap_char:
            pro_re += aa2re[aa]

    nucl_seq = str(nucl.seq.upper().ungap(gap_char))
    match = re.search(pro_re, nucl_seq)
    if match:
        # mode = 0, direct match
        return (match.span(), 0)
    else:
        # Might caused by mismatches or frameshift, using anchors to
        # have a try
        # anchor_len = 10 # adjust this value to test performance
        pro_seq = str(pro.seq).replace(gap_char, "")
        anchors = [
            pro_seq[i:(i + anchor_len)]
            for i in range(0, len(pro_seq), anchor_len)
        ]
        # if the last anchor is less than the specified anchor
        # size, we combine the penultimate and the last anchor
        # together as the last one.
        # TODO: modify this to deal with short sequence with only
        # one anchor.
        if len(anchors[-1]) < anchor_len:
            anchors[-1] = anchors[-2] + anchors[-1]

        # From here on pro_re is a list with one pattern per anchor.
        pro_re = []
        anchor_distance = 0  # assigned but never read in this function
        anchor_pos = []
        for i, anchor in enumerate(anchors):
            this_anchor_len = len(anchor)
            qcodon = ""
            fncodon = ""
            # dirty code to deal with the last anchor
            # as the last anchor is combined in the steps
            # above, we need to get the true last anchor to
            # pro_re
            if this_anchor_len == anchor_len:
                for aa in anchor:
                    # NOTE(review): ``i`` is the anchor index, so this
                    # branch is taken for every residue of the first
                    # anchor, not only for its first residue - confirm.
                    if complete_protein and i == 0:
                        qcodon += _codons2re(codon_table.start_codons)
                        fncodon += aa2re['X']
                        continue
                    qcodon += aa2re[aa]
                    fncodon += aa2re['X']
                match = re.search(qcodon, nucl_seq)
            elif this_anchor_len > anchor_len:
                # Merged last anchor: only the residues after the first
                # anchor_len (the original short tail) are used.
                last_qcodon = ""
                last_fcodon = ""
                for j in range(anchor_len, len(anchor)):
                    last_qcodon += aa2re[anchor[j]]
                    last_fcodon += aa2re['X']
                match = re.search(last_qcodon, nucl_seq)
            # build full_pro_re from anchors
            if match:
                # record (start, end, anchor index) of each matched anchor
                anchor_pos.append((match.start(), match.end(), i))
                if this_anchor_len == anchor_len:
                    pro_re.append(qcodon)
                else:
                    pro_re.append(last_qcodon)
            else:
                # unmatched anchors fall back to the aa2re['X'] pattern
                if this_anchor_len == anchor_len:
                    pro_re.append(fncodon)
                else:
                    pro_re.append(last_fcodon)
        full_pro_re = "".join(pro_re)
        match = re.search(full_pro_re, nucl_seq)
        if match:
            # mode = 1, mismatch
            return (match.span(), 1)
        else:
            # check frames of anchors
            # ten frameshift events are allowed in a sequence
            first_anchor = True  # always True at the check below
            shift_id_pos = 0
            # check the first anchor
            if first_anchor and anchor_pos[0][2] != 0:
                # the trailing 0 is a sentinel: reaching it means no
                # candidate shift produced a usable pattern
                shift_val_lst = [
                    1, 2, 3 * anchor_len - 2, 3 * anchor_len - 1, 0
                ]
                sh_anc = anchors[0]
                for shift_val in shift_val_lst:
                    if shift_val == 0:
                        qcodon = None
                        break
                    if shift_val in (1, 2):
                        sh_nuc_len = anchor_len * 3 + shift_val
                    elif shift_val in (3 * anchor_len - 2, 3 * anchor_len - 1):
                        sh_nuc_len = anchor_len * 3 - (3 * anchor_len - shift_val)
                    if anchor_pos[0][0] >= sh_nuc_len:
                        sh_nuc = nucl_seq[anchor_pos[0][0] - sh_nuc_len:anchor_pos[0][0]]
                    else:
                        # this is unlikely to produce the correct output
                        sh_nuc = nucl_seq[:anchor_pos[0][0]]
                    qcodon, shift_id_pos = _get_shift_anchor_re(
                        sh_anc, sh_nuc, shift_val, aa2re, anchor_len,
                        shift_id_pos)
                    if qcodon is not None and qcodon != -1:
                        # pro_re[0] should be '.'*anchor_len, therefore I
                        # replace it.
                        pro_re[0] = qcodon
                        break
                if qcodon == -1:
                    warnings.warn(
                        "first frameshift detection failed for "
                        "{0}".format(nucl.id), BiopythonWarning)
            # check anchors in the middle
            for i in range(len(anchor_pos) - 1):
                # distance between consecutive matched anchors, modulo
                # the nucleotide length of one anchor
                shift_val = (anchor_pos[i + 1][0] - anchor_pos[i][0]) % \
                            (3 * anchor_len)
                sh_anc = "".join(anchors[anchor_pos[i][2]:anchor_pos[i + 1][2]])
                sh_nuc = nucl_seq[anchor_pos[i][0]:anchor_pos[i + 1][0]]
                qcodon = None
                if shift_val != 0:
                    qcodon, shift_id_pos = _get_shift_anchor_re(
                        sh_anc, sh_nuc, shift_val, aa2re, anchor_len,
                        shift_id_pos)
                if qcodon is not None and qcodon != -1:
                    # replaces the patterns of all anchors in the range
                    # with one pattern, which shortens pro_re when the
                    # range covers more than one anchor
                    pro_re[anchor_pos[i][2]:anchor_pos[i + 1][2]] = [qcodon]
                    qcodon = None
                elif qcodon == -1:
                    warnings.warn(
                        "middle frameshift detection failed for "
                        "{0}".format(nucl.id), BiopythonWarning)
            # check the last anchor
            if anchor_pos[-1][2] + 1 == len(anchors) - 1:
                sh_anc = anchors[-1]
                this_anchor_len = len(sh_anc)
                shift_val_lst = [
                    1, 2, 3 * this_anchor_len - 2, 3 * this_anchor_len - 1, 0
                ]
                for shift_val in shift_val_lst:
                    if shift_val == 0:
                        qcodon = None
                        break
                    if shift_val in (1, 2):
                        sh_nuc_len = this_anchor_len * 3 + shift_val
                    elif shift_val in \
                            (3 * this_anchor_len - 2, 3 * this_anchor_len - 1):
                        sh_nuc_len = this_anchor_len * 3 - (
                            3 * this_anchor_len - shift_val)
                    if len(nucl_seq) - anchor_pos[-1][0] >= sh_nuc_len:
                        sh_nuc = nucl_seq[anchor_pos[-1][0]:anchor_pos[-1][0] + sh_nuc_len]
                    else:
                        # this is unlikely to produce the correct output
                        sh_nuc = nucl_seq[anchor_pos[-1][0]:]
                    qcodon, shift_id_pos = _get_shift_anchor_re(
                        sh_anc, sh_nuc, shift_val, aa2re, this_anchor_len,
                        shift_id_pos)
                    if qcodon is not None and qcodon != -1:
                        # drop the final pattern and overwrite the one
                        # before it with the combined pattern
                        pro_re.pop()
                        pro_re[-1] = qcodon
                        break
                if qcodon == -1:
                    warnings.warn(
                        "last frameshift detection failed for "
                        "{0}".format(nucl.id), BiopythonWarning)
            # try global match
            full_pro_re = "".join(pro_re)
            match = re.search(full_pro_re, nucl_seq)
            if match:
                # mode = 2; the match object is returned as a third item
                return (match.span(), 2, match)
            else:
                raise RuntimeError("Protein SeqRecord ({0}) and Nucleotide "
                                   "SeqRecord ({1}) do not "
                                   "match!".format(pro.id, nucl.id))
def build(
    pro_align,
    nucl_seqs,
    corr_dict=None,
    gap_char="-",
    unknown="X",
    codon_table=default_codon_table,
    alphabet=None,
    complete_protein=False,
    anchor_len=10,
    max_score=10,
):
    """Build a codon alignment from protein alignment and corresponding nucleotides.

    Arguments:
     - pro_align  - a protein MultipleSeqAlignment object
     - nucl_seqs - an object returned by SeqIO.parse or SeqIO.index
       or a collection of SeqRecord.
     - alphabet   - alphabet for the returned codon alignment; derived
       from codon_table and gap_char when None
     - corr_dict  - a dict that maps protein id to nucleotide id
     - complete_protein - whether the sequence begins with a start
       codon
     - gap_char   - gap character, forwarded to the codon alphabet
       and to the protein/nucleotide correspondence check
     - unknown    - accepted for compatibility; not used by this function
     - codon_table - codon table used for the correspondence check and
       for building the codon records
     - anchor_len - anchor length forwarded to the correspondence check
     - max_score  - forwarded to the codon record builder

    Return a CodonAlignment object.

    Raises TypeError for a non-MultipleSeqAlignment or non-protein
    pro_align, a corr_dict that is not a dict, or an unsupported
    nucl_seqs container.  Raises ValueError when there are fewer
    nucleotide records than protein records, when a protein id has no
    nucleotide partner (including a protein id missing from corr_dict),
    or when a protein/nucleotide pair does not match.  Raises
    RuntimeError when corr_dict has fewer items than pro_align.

    The example below answers this Biostars question: https://www.biostars.org/p/89741/

    >>> from Bio.Alphabet import generic_dna, generic_protein
    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> from Bio.Align import MultipleSeqAlignment
    >>> from Bio.codonalign import build
    >>> seq1 = SeqRecord(Seq('ATGTCTCGT', alphabet=generic_dna), id='pro1')
    >>> seq2 = SeqRecord(Seq('ATGCGT', alphabet=generic_dna), id='pro2')
    >>> pro1 = SeqRecord(Seq('MSR', alphabet=generic_protein), id='pro1')
    >>> pro2 = SeqRecord(Seq('M-R', alphabet=generic_protein), id='pro2')
    >>> aln = MultipleSeqAlignment([pro1, pro2])
    >>> codon_aln = build(aln, [seq1, seq2])
    >>> print(codon_aln)
    CodonAlphabet(Standard) CodonAlignment with 2 rows and 9 columns (3 codons)
    ATGTCTCGT pro1
    ATG---CGT pro2

    """
    # TODO
    # add an option to allow the user to specify the returned object?

    from Bio.Alphabet import ProteinAlphabet
    from Bio.Align import MultipleSeqAlignment

    # check the type of object of pro_align
    if not isinstance(pro_align, MultipleSeqAlignment):
        raise TypeError(
            "the first argument should be a MultipleSeqAlignment object")
    # check the alphabet of pro_align
    for pro in pro_align:
        if not isinstance(_get_base_alphabet(pro.seq.alphabet),
                          ProteinAlphabet):
            raise TypeError("Alphabet Error!\nThe input alignment should be "
                            "a *PROTEIN* alignemnt, found %r"
                            % pro.seq.alphabet)
    if alphabet is None:
        alphabet = _get_codon_alphabet(codon_table, gap_char=gap_char)

    pro_num = len(pro_align)
    mapping_types = ("_IndexedSeqFileDict", "dict")
    sequence_types = ("list", "tuple")

    if corr_dict is None:
        # Without corr_dict the pairing is positional for list/tuple (or
        # SeqIO.parse() generators) and by record id for dict /
        # SeqIO.index() objects.
        if nucl_seqs.__class__.__name__ == "generator":
            # nucl_seqs will be a tuple if read by SeqIO.parse()
            nucl_seqs = tuple(nucl_seqs)
        nucl_num = len(nucl_seqs)
        if pro_num > nucl_num:
            raise ValueError(
                f"Higher Number of SeqRecords in Protein Alignment ({pro_num}) "
                f"than the Number of Nucleotide SeqRecords ({nucl_num}) are found!"
            )
        nucl_type = nucl_seqs.__class__.__name__
        if nucl_type in mapping_types:
            # keyword pairing: every protein id needs a nucleotide record
            missing = {i.id for i in pro_align} - set(nucl_seqs.keys())
            if missing:
                # sorted() keeps the message deterministic
                missing_ids = ", ".join(sorted(missing))
                raise ValueError(f"Protein Record {missing_ids} cannot find a "
                                 "nucleotide sequence match, please check the id")
            pro_nucl_pair = [(pro_rec, nucl_seqs[pro_rec.id])
                             for pro_rec in pro_align]
        elif nucl_type in sequence_types:
            # consecutive pairing
            pro_nucl_pair = zip(pro_align, nucl_seqs)
        else:
            raise TypeError(
                "Nucl Sequences Error, Unknown type to assign correspondence method"
            )
    else:
        if not isinstance(corr_dict, dict):
            raise TypeError("corr_dict should be a dict that corresponds "
                            "protein id to nucleotide id!")
        if len(corr_dict) < pro_num:
            raise RuntimeError(
                f"Number of items in corr_dict ({len(corr_dict)}) "
                f"is less than number of protein records ({pro_num})")
        nucl_type = nucl_seqs.__class__.__name__
        if nucl_type == "generator":
            # read by SeqIO.parse()
            from Bio import SeqIO

            nucl_seqs = SeqIO.to_dict(nucl_seqs)
        elif nucl_type in sequence_types:
            nucl_seqs = {i.id: i for i in nucl_seqs}
        elif nucl_type not in mapping_types:
            raise TypeError(
                "Nucl Sequences Error, Unknown type of Nucleotide Records!"
            )
        # dict pairing
        pro_nucl_pair = []
        for pro_rec in pro_align:
            try:
                nucl_id = corr_dict[pro_rec.id]
            except KeyError:
                # Library code must not print and terminate the
                # interpreter; report the problem as an exception so the
                # caller can handle it.
                raise ValueError(
                    "Protein record (%s) is not in corr_dict!" % pro_rec.id
                ) from None
            pro_nucl_pair.append((pro_rec, nucl_seqs[nucl_id]))

    codon_aln = []
    shift = False
    for pro_rec, nucl_rec in pro_nucl_pair:
        # Beware that the following span corresponds to an ungapped
        # nucleotide sequence.
        corr_span = _check_corr(
            pro_rec,
            nucl_rec,
            gap_char=gap_char,
            codon_table=codon_table,
            complete_protein=complete_protein,
            anchor_len=anchor_len,
        )
        if not corr_span:
            raise ValueError(f"Protein Record {pro_rec.id} and "
                             f"Nucleotide Record {nucl_rec.id} do not match!")
        # NOTE(review): complete_protein is passed as a literal False
        # here rather than the caller's value - confirm this is intended.
        codon_rec = _get_codon_rec(
            pro_rec,
            nucl_rec,
            corr_span,
            alphabet=alphabet,
            complete_protein=False,
            codon_table=codon_table,
            max_score=max_score,
        )
        codon_aln.append(codon_rec)
        # a mode of 2 in the correspondence result marks a frameshift
        if corr_span[1] == 2:
            shift = True
    if shift:
        return CodonAlignment(_align_shift_recs(codon_aln), alphabet=alphabet)
    return CodonAlignment(codon_aln, alphabet=alphabet)