def check_matching(self) -> tuple:
    """Pair every terminal of ``self.tree`` with its aligned sequence.

    Tips are visited in the order returned by ``self.tree.get_terminals()``.
    For each tip the record stored under the tip's name in
    ``self.aligns_as_seqs`` is wrapped in a ``TipSeqLinker`` together with
    the path from the tree root to that tip.

    Returns:
        A 4-tuple ``(init_clstr, dichord_list, converted_names, sites)``:

        * ``init_clstr`` -- ``defaultdict(list)`` mapping each tip to a
          one-element list holding its linker.
        * ``dichord_list`` -- all linkers, in tip order.
        * ``converted_names`` -- two-way mapping between the original tip
          name and the generated ``'seq<n>'`` identifier.
        * ``sites`` -- ``tuple(range(L))`` where ``L`` is the alignment
          length of the matched records (empty for a tree without tips).

    Raises:
        TipNotMatchedError: if a ``KeyError`` is raised while looking up or
            linking the sequence of a tip.
    """
    init_clstr = defaultdict(list)
    dichord_list = []
    converted_names = {}
    matched_records = []
    for n, tip in enumerate(self.tree.get_terminals()):
        tip_name = tip.name
        try:
            seq_record = self.aligns_as_seqs[tip_name]
            dichord = TipSeqLinker(
                seq_record, (self.tree.root, *self.tree.get_path(tip))
            )
        except KeyError as err:
            # Keep the original KeyError as the cause for easier debugging.
            raise TipNotMatchedError(tip) from err
        init_clstr[tip].append(dichord)
        dichord_list.append(dichord)
        new_seq_id = 'seq{}'.format(n)
        converted_names[tip_name] = new_seq_id
        converted_names[new_seq_id] = tip_name
        matched_records.append(seq_record)
    # Building the alignment once also covers the tip-less tree: an empty
    # alignment has length 0, whereas the incremental version left the
    # alignment as None and failed on ``get_alignment_length``.
    related_aligns = MultipleSeqAlignment(matched_records)
    return (
        init_clstr,
        dichord_list,
        converted_names,
        tuple(range(related_aligns.get_alignment_length()))
    )
def test_proteins(self):
    """Check consensus, PSSM and information content of a 3-row protein alignment."""
    # Alphabet passed to every Seq below: generic protein wrapped with a
    # "-" gap character and a "*" stop symbol.
    alpha = HasStopCodon(Gapped(generic_protein, "-"), "*")
    a = MultipleSeqAlignment([
        SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-", alpha), id="ID001"),
        SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*", alpha), id="ID002"),
        SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*", alpha), id="ID003")
    ])
    self.assertEqual(32, a.get_alignment_length())
    s = SummaryInfo(a)
    # Both consensus flavours are requested with "X" as the ambiguity symbol.
    c = s.dumb_consensus(ambiguous="X")
    self.assertEqual(str(c), "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")
    c = s.gap_consensus(ambiguous="X")
    self.assertEqual(str(c), "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")
    # The PSSM is laid out along the gap consensus; "-" and "*" are not counted.
    m = s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
    # NOTE(review): the expected text below is kept exactly as it appears in
    # this file; confirm its whitespace against the real str(m) layout.
    self.assertEqual(
        str(m),
        """ A D E F G H I K L M N P Q R S W Y M 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 H 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 X 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 F 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 X 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 L 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 K 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 R 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 P 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 E 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 X 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 N 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 W 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 """)
    # Information content is computed with the same ignored characters.
    ic = s.information_content(chars_to_ignore=['-', '*'])
    self.assertAlmostEqual(ic, 133.061475107, places=6)
def removecolumnfrommask(seqfile, filetype, mask):
    """Write a copy of an alignment with the masked columns removed.

    Args:
        seqfile: path of the alignment; the output is written next to it as
            ``<text before the first '.'>_masked.fas``.
        filetype: alignment format name handed to ``AlignIO.read``.
        mask: path of a text file holding one 0-based column index per line.
            Blank lines are ignored.

    Records whose id contains ``'SWARM'`` are written with the id truncated
    at the first ``'_'``; all other ids are written unchanged.  Progress
    counts are printed to stdout.
    """
    alignment = AlignIO.read(seqfile, filetype)
    trimAlign = MultipleSeqAlignment([])
    numCol = alignment.get_alignment_length()

    # ``with`` closes the mask file, which the earlier version left open.
    with open(mask, 'r') as mask_file:
        coltoremove = [int(line) for line in mask_file if line.strip()]
    print(len(coltoremove))

    # Set membership keeps the column scan linear; the list is retained
    # only so that the printed count still includes duplicates.
    masked = set(coltoremove)
    colToKeep = [i for i in range(numCol) if i not in masked]
    print(len(colToKeep))
    print('if okay remove+keep (', int(len(coltoremove) + len(colToKeep)),
          ') match ', int(numCol))

    with open(seqfile.split('.')[0] + '_masked.fas', 'w+') as outFile:
        for record in alignment:
            newseq = "".join(record[j] for j in colToKeep)
            trimAlign.append(SeqRecord(Seq(newseq), id=record.id))
            if 'SWARM' in record.id:
                header = record.id.split('_')[0]
            else:
                header = record.id
            outFile.write('>' + header + '\n' + newseq + '\n')
    print("Total number of columns remaining: %i"
          % trimAlign.get_alignment_length())
def test_basic_alignment(self):
    """Exercise length, row access, column access and row slicing on three rows."""
    letters = "AbcDefGhiJklMnoPqrStuVwxYz"
    rows = [
        ("mixed", letters),
        ("lower", letters.lower()),
        ("upper", letters.upper()),
    ]
    alignment = MultipleSeqAlignment([])
    for row_id, text in rows:
        alignment.append(SeqRecord(Seq(text), id=row_id))

    self.assertEqual(alignment.get_alignment_length(), 26)
    self.assertEqual(len(alignment), 3)
    for index, (_, text) in enumerate(rows):
        self.assertEqual(str(alignment[index].seq), text)
    for index, (row_id, _) in enumerate(rows):
        self.assertEqual(alignment[index].id, row_id)

    # Every column is the mixed/lower/upper triple of one letter.
    for col, letter in enumerate(letters):
        expected_column = letter + letter.lower() + letter.upper()
        self.assertEqual(alignment[:, col], expected_column)

    # Check row extractions:
    self.assertEqual(alignment[0].id, "mixed")
    self.assertEqual(alignment[-1].id, "upper")

    # Check sub-alignment extraction by row slicing:
    self.assertIsInstance(alignment[::-1], MultipleSeqAlignment)
    self.assertEqual(alignment[::-1][0].id, "upper")
    self.assertEqual(alignment[::-1][2].id, "mixed")
def test_proteins(self):
    """Check consensus, PSSM and information content of a 3-row protein alignment."""
    a = MultipleSeqAlignment([
        SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-"), id="ID001"),
        SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*"), id="ID002"),
        SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*"), id="ID003")
    ])
    self.assertEqual(32, a.get_alignment_length())
    s = SummaryInfo(a)
    # Both consensus flavours are requested with "X" as the ambiguity symbol.
    c = s.dumb_consensus(ambiguous="X")
    self.assertEqual(str(c), "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")
    c = s.gap_consensus(ambiguous="X")
    self.assertEqual(str(c), "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")
    # The PSSM is laid out along the gap consensus; "-" and "*" are not counted.
    m = s.pos_specific_score_matrix(chars_to_ignore=["-", "*"], axis_seq=c)
    # NOTE(review): the expected text below is kept exactly as it appears in
    # this file; confirm its whitespace against the real str(m) layout.
    self.assertEqual(
        str(m),
        """ A D E F G H I K L M N P Q R S W Y M 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 H 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 X 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 F 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 X 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 L 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 K 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 I 0.0 0.0 0.0 0.0 0.0 0.0
3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 R 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 P 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 E 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 X 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 N 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 W 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 """)
    # Expected-frequency table: the same value, 1 / number of letters, for
    # every letter in IUPACData.protein_letters.
    letters = IUPACData.protein_letters
    base_freq = 1.0 / len(letters)
    e_freq_table = {letter: base_freq for letter in letters}
    ic = s.information_content(e_freq_table=e_freq_table,
                               chars_to_ignore=["-", "*"])
    self.assertAlmostEqual(ic, 133.061475107, places=6)
def __init__(self, alignment: MultipleSeqAlignment):
    """Set up the search state and one leaf clade per aligned sequence.

    alignment: MultipleSeqAlignment containing the alignment
    """
    self.tree = None  # Best tree
    self.trees = []  # Complete trees
    self.threshold = np.inf  # The score of the currently best tree

    # One leaf per record; each site starts as a singleton set holding the
    # character observed at that site.
    self.leaves = [
        ParsimonyClade(
            None,
            record.id,
            sets=[{symbol} for symbol in record.seq],
            score=0,
        )
        for record in alignment
    ]

    self.size = len(self.leaves)  # Number of alignments
    self.length = alignment.get_alignment_length()  # Length of each sequence
def __init__(self, alignment: MultipleSeqAlignment, seed: int = 0):
    """Seed the RNG and prepare either a trivial tree or the encoded input.

    alignment: MultipleSeqAlignment containing the alignment
    seed: value handed to ``np.random.seed``
    """
    self.seed = seed
    np.random.seed(seed)

    self.size = len(alignment)  # Number of alignments
    self.length = alignment.get_alignment_length()  # Length of each sequence
    self.nr_of_bases = len(Base) + 1
    self.alignment = alignment
    self.threshold = 10e-5
    self.E = 10e-5

    # If the number of sequences isn't enough
    if self.size <= 0:
        raise ValueError("There aren't enough taxa.")

    # One or two taxa: the tree is a single inner clade holding them.
    if self.size <= 2:
        inner_clade = SOTAClade(None, None)
        for row in range(self.size):
            inner_clade.clades.append(SOTAClade(None, alignment[row].id))
        self.tree = self.create_tree(inner_clade)
        return

    # Sequences to classify. Dimension: number of taxa,
    # number of different bases + 1, length of sequences
    self.S = np.zeros((self.size, self.nr_of_bases, self.length))
    self.names = []  # Names of the taxa belonging to the sequences.

    # Characters that are not members of Base go to the last row.
    fallback_row = self.nr_of_bases - 1
    for i, record in enumerate(self.alignment):
        self.names.append(record.id)
        for j in range(self.length):
            symbol = record.seq[j]
            try:
                row = Base[symbol].value
            except KeyError:
                row = fallback_row
            self.S[i][row, j] = 1
def test_proteins(self):
    """Check consensus, PSSM and information content of a 3-row protein alignment."""
    # Alphabet passed to every Seq below: generic protein wrapped with a
    # "-" gap character and a "*" stop symbol.
    alpha = HasStopCodon(Gapped(generic_protein, "-"), "*")
    a = MultipleSeqAlignment([
        SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-", alpha), id="ID001"),
        SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*", alpha), id="ID002"),
        SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*", alpha), id="ID003")])
    self.assertEqual(32, a.get_alignment_length())
    s = SummaryInfo(a)
    # Both consensus flavours are requested with "X" as the ambiguity symbol.
    c = s.dumb_consensus(ambiguous="X")
    self.assertEqual(str(c), "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")
    c = s.gap_consensus(ambiguous="X")
    self.assertEqual(str(c), "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")
    # The PSSM is laid out along the gap consensus; "-" and "*" are not counted.
    m = s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
    # NOTE(review): the expected text below is kept exactly as it appears in
    # this file; confirm its whitespace against the real str(m) layout.
    self.assertEqual(str(m), """ A D E F G H I K L M N P Q R S W Y M 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 H 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 X 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 F 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 X 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 L 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 K 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 R 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 P 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 E 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 X 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 N 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 W 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 """)
    # Information content is computed with the same ignored characters.
    ic = s.information_content(chars_to_ignore=['-', '*'])
    self.assertAlmostEqual(ic, 133.061475107, places=6)
def split_alignment(clc, alignment, genelimit):
    """Cut an alignment into per-gene column blocks.

    ``alignment`` may be an alignment object or a dict whose values are
    turned into one via ``MSA``.  ``genelimit`` is an iterable of
    ``(gene, start, end)`` triples; each gene receives the columns
    ``start:end``.  ``clc`` is accepted but not used here.

    Returns a dict mapping gene name to its sub-alignment.  Raises
    ``ValueError`` when the block widths do not add up to the full
    alignment length.
    """
    if isinstance(alignment, dict):
        alignment = MSA(alignment.values())

    unassigned = alignment.get_alignment_length()
    per_gene = {}
    for gene, start, end in genelimit:
        block = alignment[:, start:end]
        per_gene[gene] = block
        unassigned -= block.get_alignment_length()

    if unassigned:
        raise ValueError("Could not split alignment, wrong gene delimiter")
    return per_gene
def maskalignment(arg, percent, percentmissing, filetype):
    """Drop alignment columns that contain too many gaps and report on them.

    Args:
        arg: path of the alignment; all outputs use the text before its
            first ``'.'`` as prefix.
        percent: percentage of rows used to derive the cutoff
            ``x = percent * rows / 100``; a column is dropped when its
            number of ``'-'`` exceeds ``rows - x``.
        percentmissing: value embedded in the mask and FASTA output names.
        filetype: alignment format name handed to ``AlignIO.read``.

    Writes three files: ``<prefix>_mask_<percentmissing>.txt`` (indices of
    dropped columns), ``<prefix>_masked_<percentmissing>.fas`` (trimmed
    sequences) and ``<prefix>_missingcharacter.txt`` (per-column gap
    report).  Summary lines are printed to stdout.
    """
    prefix = arg.split('.')[0]
    alignment = AlignIO.read(arg, filetype)
    trimAlign = MultipleSeqAlignment([])
    numRows = len(alignment)
    x = float(percent) * float(numRows) / 100.0
    numGap = numRows - float(x)
    numCol = alignment.get_alignment_length()
    print("Total number of rows: %i" % numRows)
    print("Number of gapped sequences allowed at a given site: %i" % numGap)
    print("Total number of columns: %i" % numCol)

    colToKeep = []
    # The context manager closes all three outputs, even on error; before,
    # none of them was ever closed.
    with open(prefix + '_mask_' + str(percentmissing) + '.txt',
              'w+') as maskedcolumn, \
            open(prefix + '_masked_' + str(percentmissing) + '.fas',
                 'w+') as outFile, \
            open(prefix + '_missingcharacter.txt', 'w+') as checkgap:
        checkgap.write("Total number of rows: \t" + str(numRows) +
                       '\nNumber of gapped sequences allowed at a given site: \t'
                       + str(numGap) + '\n Total number of columns: \t' +
                       str(numCol) + '\n\n cutoff : \t' + str(x) + '\n\n\n')
        checkgap.write("Position \t Missing Characters \t Characters \n")

        for i in range(numCol):
            # Count the gaps of the column once and reuse the value.
            gaps = alignment[:, i].count('-')
            chapre = int(numRows) - int(gaps)
            checkgap.write(str(i) + '\t' + str(gaps) + '\t' + str(chapre)
                           + '\n')
            if gaps > numGap:
                print("get rid of column %i" % i)
                maskedcolumn.write(str(i) + '\n')
            else:
                colToKeep.append(i)

        for record in alignment:
            newseq = "".join(record[i] for i in colToKeep)
            trimAlign.append(SeqRecord(Seq(newseq), id=record.id))
            outFile.write('>' + record.id + '\n' + newseq + '\n')

    print("Total number of columns remaining: %i"
          % trimAlign.get_alignment_length())
def count_mismatches(path, align):
    """
    Count overlapping and mismatching columns among the sequences of a path.

    Only records of ``align`` whose ``name`` equals the first item of some
    element of ``path`` are considered.  Returns ``(overlapping, mismatching)``:
    the number of columns with more than one non-gap character, and the
    number of columns with more than one distinct non-gap character.
    """
    wanted_names = [step[0] for step in path]
    members = [record for record in align if record.name in wanted_names]
    subset = MultipleSeqAlignment(members)

    overlapping = 0
    mismatching = 0
    for position in range(subset.get_alignment_length()):
        residues = [char for char in subset[:, position] if char != "-"]
        if len(residues) > 1:
            overlapping += 1
        if len(set(residues)) > 1:
            mismatching += 1
    return overlapping, mismatching
def remove_invariant_sites(input_align):
    """
    Build a copy of an alignment that keeps only its variable columns.

    :param input_align: alignment to filter
    :return: alignment with the same ids and descriptions, holding every
        column for which ``is_invariant`` is false
    """
    # Start from zero-length records that carry over id, description and
    # the alphabet of the source sequences.
    skeleton = []
    for source in input_align:
        skeleton.append(
            SeqRecord(Seq('', source.seq.alphabet),
                      id=source.id,
                      description=source.description))
    kept = MultipleSeqAlignment(skeleton)

    total_columns = input_align.get_alignment_length()
    print("input alignment has %i columns" % total_columns)
    for col in range(total_columns):
        if is_invariant(input_align[:, col]):
            continue
        # Append the variable column as a one-column sub-alignment.
        kept = kept + input_align[:, col:col + 1]
    print("edited alignment has %i columns" % kept.get_alignment_length())
    return kept
def get_validation_label(domain_sid1: str, domain_sid2: str, aligns,
                         pssm_dir: str):
    """Turn a stored pairwise alignment into a 0/1 residue-pairing matrix.

    The two PSSMs are read from ``<pssm_dir>/<sid[2:4]>/<sid>.mtx`` and the
    aligned records from ``aligns`` under the keys ``'<sid1>&<sid2>'`` and
    ``'<sid2>&<sid1>'``.  The result has shape
    ``(len(pssm1.pssm), len(pssm2.pssm))`` with a 1 at ``[x, y]`` for every
    alignment column where neither row has a gap.
    """
    first_pssm = parse_pssm(f'{pssm_dir}/{domain_sid1[2:4]}/{domain_sid1}.mtx')
    second_pssm = parse_pssm(f'{pssm_dir}/{domain_sid2[2:4]}/{domain_sid2}.mtx')
    pair = MultipleSeqAlignment([
        aligns[f'{domain_sid1}&{domain_sid2}'],
        aligns[f'{domain_sid2}&{domain_sid1}'],
    ])
    # Each ungapped row must be as long as its PSSM.
    assert len(first_pssm.pssm) == len(pair[0].seq.ungap('-'))
    assert len(second_pssm.pssm) == len(pair[1].seq.ungap('-'))

    labels = np.zeros((len(first_pssm.pssm), len(second_pssm.pssm)),
                      dtype=np.int8)
    row = col = 0
    for column in range(pair.get_alignment_length()):
        top, bottom = pair[0][column], pair[1][column]
        if top == "-":
            col += 1
            continue
        if bottom == "-":
            row += 1
            continue
        labels[row, col] = 1
        row += 1
        col += 1
    return labels
def make_consensus(path, align):
    """
    Concatenate the sequences of a path into one consensus sequence.

    ``path`` is a list of record names of the form ``'<prefix>|<part>'``;
    ``align`` is the alignment holding those records and, possibly, others.

    Each column is resolved as follows:

    * no non-gap character in the path rows -> ``'-'``;
    * a single distinct character -> the first non-gap character, with
      ``'N'`` turned into ``'-'``;
    * several distinct characters ->
        1. if exactly one of them is among the characters that make up more
           than 25% of the non-gap characters of the rows outside the path,
           that character is used;
        2. otherwise the IUPAC ambiguity code of the (upper-cased) set of
           characters is used;
        3. otherwise (the characters are not plain A/C/G/T) ``'N'``.

    Returns a tuple ``(new_name, consensus_sequence)`` where ``new_name`` is
    the prefix of the first path element followed by the second ``'|'``
    field of every path element.
    """
    # Keyed by the exact set of bases: the previous lookup accepted the
    # first code whose bases were a subset of the observed ones, so that
    # e.g. {C, G, T} produced "Y" instead of "B".
    iupac_code = {
        frozenset("AG"): "R",
        frozenset("CT"): "Y",
        frozenset("GC"): "S",
        frozenset("AT"): "W",
        frozenset("GT"): "K",
        frozenset("AC"): "M",
        frozenset("CGT"): "B",
        frozenset("AGT"): "D",
        frozenset("ACT"): "H",
        frozenset("ACG"): "V",
        frozenset("ATCG"): "N",
    }

    # Alignment of sequences in the path
    path_aln = MultipleSeqAlignment([rec for rec in align if rec.name in path])
    # Alignment of sequences NOT in the path
    no_path_aln = MultipleSeqAlignment(
        [rec for rec in align if rec.name not in path])

    consensus = []
    for idx in range(path_aln.get_alignment_length()):
        path_bases = [x for x in path_aln[:, idx] if x != "-"]
        ambiguity = set(path_bases)

        if len(ambiguity) <= 1:
            # Nothing to resolve: copy the character (or a gap).
            if path_bases:
                consensus.append(path_bases[0].replace("N", "-"))
            else:
                consensus.append("-")
            continue

        no_path_col = [x for x in no_path_aln[:, idx] if x != "-"]
        counts = collections.Counter(no_path_col)
        # Without any character outside the path there is no dominant
        # base, so the column falls through to the IUPAC code instead of
        # being skipped.
        major_bases = {
            base for base, seen in counts.items()
            if seen / len(no_path_col) > 0.25
        }
        shared_major_base = major_bases.intersection(ambiguity)

        if len(shared_major_base) == 1:
            # Case 1: exactly one variant is among the dominant bases.
            consensus.append(next(iter(shared_major_base)))
            continue

        # Case 2: none or several variants are dominant -> ambiguity code.
        # Case 3: the alleles are not plain bases -> "N".
        ambiguity_up = frozenset(x.upper() for x in ambiguity)
        consensus.append(iupac_code.get(ambiguity_up, "N"))

    consensus_sequence = "".join(consensus)
    new_name = path[0].split("|")[0] + "|" + "|".join(
        [x.split("|")[1] for x in path])
    return (new_name, consensus_sequence)
# In[12]: xcount = [str(p.seq).count('X') for p in pralign] xlist = list(set(xcount)) xfreq = [xcount.count(x) for x in xlist] # In[32]: # Exclude sequences trimalign2 = [trimalign[i,:] for i in range(len(pralign)) if xcount[i] <= xthresh] trimalign2 = MultipleSeqAlignment(records=trimalign2,alphabet=Gapped(IUPACAmbiguousDNA(),"-")) #trimalign2 = trimalign # In[46]: # Count gaps, then go in reverse until the threshold is first crossed ta2l = trimalign2.get_alignment_length() ta2n = len(trimalign2) gapcount = [trimalign2[:,i].count("-") for i in range(ta2l)] for i in reversed(range(ta2l)): if ta2n-gapcount[i] >= round(ta2n*gapthresh): break trimalign3 = trimalign2[:,:(i+1)] # In[47]: AlignIO.write(trimalign3,ofn,"fasta")
Given a multiple sequence alignment, calculate the identity for each pair of sequences. """ import argparse from Bio import AlignIO from Bio.Align import MultipleSeqAlignment if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("alignment", help="Multiple sequence alignment input in FASTA format") args = parser.parse_args() # Load the multiple sequence alignment and sort records by name in ascending # order. original_alignment = AlignIO.read(args.alignment, "fasta") alignment = MultipleSeqAlignment(sorted([record for record in original_alignment], key=lambda record: record.name)) for sequence_j in alignment: for sequence_k in alignment: if sequence_j != sequence_k: total = 0 matches = 0 for i in xrange(alignment.get_alignment_length()): total += 1 if sequence_j[i].upper() == sequence_k[i].upper(): matches += 1 print "\t".join((sequence_j.name, sequence_k.name, str(matches / float(total))))
import os # biopython from Bio import Alphabet from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio.Alphabet import IUPAC from Bio.Align import AlignInfo from Bio import AlignIO from Bio.SubsMat import FreqTable from Bio.Align import MultipleSeqAlignment # Very simple tests on an empty alignment alignment = MultipleSeqAlignment([], Alphabet.generic_alphabet) assert alignment.get_alignment_length() == 0 assert len(alignment) == 0 del alignment # Basic tests on simple three string alignment alignment = MultipleSeqAlignment([], Alphabet.generic_alphabet) letters = "AbcDefGhiJklMnoPqrStuVwxYz" alignment.append(SeqRecord(Seq(letters), id="mixed")) alignment.append(SeqRecord(Seq(letters.lower()), id="lower")) alignment.append(SeqRecord(Seq(letters.upper()), id="upper")) assert alignment.get_alignment_length() == 26 assert len(alignment) == 3 assert str(alignment[0].seq) == letters assert str(alignment[1].seq) == letters.lower() assert str(alignment[2].seq) == letters.upper() assert alignment[0].id == "mixed"
# Load the multiple sequence alignment and sort records by name in ascending # order. original_alignment = AlignIO.read(args.alignment, "fasta") alignment = MultipleSeqAlignment( sorted([record for record in original_alignment], key=lambda record: record.name)) index_by_column_type = {} current_index = 0 with open(args.classified_alignment_positions, "w") as fh: writer = csv.writer(fh, delimiter="\t", lineterminator="\n") writer.writerow(("position", "column_type", "bases")) for i in xrange(alignment.get_alignment_length()): # First enumerate bases in the given column to determine the column's # "type" (e.g., all bases are the same, all bases are different, etc.). enumerated_bases = enumerate_bases( [alignment[j][i] for j in xrange(len(alignment))]) # Then enumerate this type of column in the context of the alignment # such that each column type gets its own integer that summarizes that # alignment position. if enumerated_bases not in index_by_column_type: index_by_column_type[enumerated_bases] = current_index current_index += 1 if types is None or enumerated_bases in types: writer.writerow((i, index_by_column_type[enumerated_bases], enumerated_bases))
class sequence_set(object):
    """Subsample a set of sequences, align them and export variability statistics.

    Typical use is ``parse`` -> ``parse_date`` -> ``subsample`` -> ``align``
    (or ``codon_align``) -> ``translate`` -> ``export_diversity``.

    Attributes set by the methods below:
        raw_seqs           -- dict of all loaded records, keyed by the fixed
                              description
        seqs               -- dict id -> record chosen by ``subsample``
        aln                -- alignment produced by ``align``/``codon_align``
        sequence_lookup    -- dict id -> aligned record
        reference_aligned  -- aligned reference record (``None`` when no
                              reference was given)
    """

    def __init__(self, fname, reference=None, **kwarks):
        """Load records from a FASTA file and pick a scratch directory.

        fname     -- path of the FASTA file.  If it is not an existing file
                     nothing is loaded and ``raw_seqs`` is left unset.
        reference -- either a record, or the name of a loaded record; a name
                     that is not found is stored as given.
        kwarks    -- optional ``run_dir``: scratch directory name.  When
                     absent a name is built from the current UTC time and a
                     random number.
        """
        super(sequence_set, self).__init__()
        self.nthreads = 2
        if os.path.isfile(fname):
            with myopen(fname) as seq_file:
                self.raw_seqs = {fix_names(x.description): x
                                 for x in SeqIO.parse(seq_file, 'fasta')}
            for x in self.raw_seqs.values():
                x.id = fix_names(x.id)
                x.name = fix_names(x.id)
                x.description = fix_names(x.description)
        if 'run_dir' not in kwarks:
            import random
            self.run_dir = '_'.join(['temp',
                                     time.strftime('%Y%m%d-%H%M%S', time.gmtime()),
                                     str(random.randint(0, 1000000))])
        else:
            self.run_dir = kwarks['run_dir']
        if reference is not None:
            if type(reference) is str and fix_names(reference) in self.raw_seqs:
                self.reference = self.raw_seqs[fix_names(reference)]
            else:
                self.reference = reference
        else:
            self.reference = None

    def parse(self, fields, sep='|', strip='_'):
        '''
        split the sequence description and add annotations to sequences

        fields -- dict mapping the position of a description field to the
                  attribute name it is stored under
        sep    -- field separator of the description
        strip  -- characters removed from both ends of every field

        Fields equal to "" or "-" are stored as the empty string.
        '''
        for seq in self.raw_seqs.values():
            if not hasattr(seq, "attributes"):
                seq.attributes = {}
            words = [x.strip(strip)
                     for x in seq.description.replace(">", "").split(sep)]
            for ii, val in enumerate(words):
                if ii in fields:
                    seq.attributes[fields[ii]] = val if val not in ["", "-"] else ""

    def ungap(self):
        '''
        remove previously existing gaps and make sure all sequences are upper case
        '''
        for seq in self.raw_seqs.values():
            seq.seq = seq.seq.ungap('-').upper()

    def parse_date(self, fmts, prune=True):
        '''
        convert the 'date' attribute of every sequence into a date object

        fmts  -- formats tried in order; each is either a strptime format
                 string or a callable taking the raw date string
        prune -- drop sequences whose date is missing or still a string
        '''
        # next(iter(...)) works on dict views; indexing values() does not.
        first = next(iter(self.raw_seqs.values()), None)
        if first is None or not hasattr(first, "attributes"):
            print("parse meta info first")
            return
        from datetime import datetime
        for seq in self.raw_seqs.values():
            if 'date' in seq.attributes and seq.attributes['date'] != '':
                for fmt in fmts:
                    try:
                        if callable(fmt):
                            tmp = fmt(seq.attributes['date'])
                        else:
                            tmp = datetime.strptime(seq.attributes['date'], fmt).date()
                        seq.attributes['raw_date'] = seq.attributes['date']
                        seq.attributes['num_date'] = num_date(tmp)
                        seq.attributes['date'] = tmp
                        break
                    except Exception:
                        # Best effort: a format that does not fit is skipped.
                        # Exception (not a bare except) lets KeyboardInterrupt
                        # and SystemExit through.
                        continue
        if prune:
            self.raw_seqs = {k: v for k, v in self.raw_seqs.items()
                             if 'date' in v.attributes
                             and type(v.attributes['date']) is not str}

    def filter(self, func):
        '''keep only the sequences for which func(seq) is true'''
        self.raw_seqs = {key: seq for key, seq in self.raw_seqs.items() if func(seq)}

    def clock_filter(self, root_seq=None, n_iqd=3, max_gaps=1.0, plot=False):
        '''
        remove sequences from the set that evolve much faster or slower
        compared to the majority. Regions with predominantly gaps can be
        removed since this can skew the evolutionary rates.

        root_seq -- sequence distances are measured to: None (use the
                    consensus), the id of an aligned sequence, or an array
        n_iqd    -- sequences whose regression residual is n_iqd
                    interquartile distances or more are removed
        max_gaps -- positions with a gap frequency of max_gaps or more are
                    ignored (only applied when max_gaps < 1.0)
        plot     -- scatter plot of date vs distance with outliers in red
        '''
        from Bio.Align import MultipleSeqAlignment
        if root_seq is None:  # use consensus
            af = calc_af(self.aln, nuc_alpha)
            # NOTE(review): np.fromstring is deprecated in recent numpy;
            # left untouched because the type of nuc_alpha is not visible here.
            root_seq = np.fromstring(nuc_alpha, 'S1')[af.argmax(axis=0)]
        if type(root_seq) == str and root_seq in self.sequence_lookup:
            root_seq = np.array(self.sequence_lookup[root_seq])
        if max_gaps < 1.0:
            af = calc_af(self.aln, nuc_alpha)
            good_pos = af[nuc_alpha.index('-')] < max_gaps
        else:
            good_pos = np.ones(self.aln.get_alignment_length(), dtype=bool)
        date_vs_distance = {}
        for seq in self.aln:
            seq_array = np.array(seq)
            informative = (seq_array != '-') & (root_seq != '-') & good_pos
            date_vs_distance[seq.id] = (seq.attributes['num_date'],
                                        np.mean((seq_array != root_seq)[informative]))
        # list(...) is needed to turn a dict view into a 2-D array.
        date_vs_distance_array = np.array(list(date_vs_distance.values()))
        from scipy.stats import linregress, scoreatpercentile
        slope, intercept, rval, pval, stderr = linregress(date_vs_distance_array[:, 0],
                                                          date_vs_distance_array[:, 1])
        print("distance vs time regression:", slope)
        residuals = (intercept + slope*date_vs_distance_array[:, 0]) - date_vs_distance_array[:, 1]
        IQD = scoreatpercentile(residuals, 75) - scoreatpercentile(residuals, 25)
        if plot:
            import matplotlib.pyplot as plt
            plt.ion()
            plt.scatter(date_vs_distance_array[:, 0], date_vs_distance_array[:, 1], c='g')
            bad_points = abs(intercept + slope*date_vs_distance_array[:, 0]
                             - date_vs_distance_array[:, 1]) > n_iqd*IQD
            plt.scatter(date_vs_distance_array[bad_points, 0],
                        date_vs_distance_array[bad_points, 1], c='r')

        print("before clock filter:", len(self.aln))
        self.aln = MultipleSeqAlignment(
            [seq for seq in self.aln
             if abs(intercept + slope*date_vs_distance[seq.id][0]
                    - date_vs_distance[seq.id][1]) < n_iqd*IQD])
        print("after clock filter:", len(self.aln))

    def subsample(self, category=None, priority=None, threshold=None, repeated=False):
        '''
        produce a useful set of sequences from the raw input.
        arguments:
        category  -- callable that assigns each sequence to a category for subsampling
        priority  -- callable that assigns each sequence a priority to be included in
                     the final sample. this is applied independently in each category
        threshold -- callable that determines the number of sequences from each category
                     that is included in the final set. takes arguments, cat and seq
                     alternatively can be an int
        repeated  -- subsample the result of a previous call instead of the raw input

        The reference, when one is set, is always part of the result.
        '''
        if category is None:
            category = lambda x: (x.attributes['date'].year, x.attributes['date'].month)
        if priority is None:
            priority = lambda x: np.random.random()
        if threshold is None:
            threshold = lambda x: 5
        elif type(threshold) is int:
            print("using threshold:", threshold)
            tmp = threshold
            threshold = lambda x: tmp

        self.sequence_categories = defaultdict(list)
        if repeated:
            seqs_to_subsample = self.seqs.values()
        else:
            seqs_to_subsample = self.raw_seqs.values()

        for seq in seqs_to_subsample:
            seq._priority = priority(seq)
            self.sequence_categories[category(seq)].append(seq)

        self.seqs = {}
        for cat, seqs in self.sequence_categories.items():
            seqs.sort(key=lambda x: x._priority, reverse=True)
            self.seqs.update({seq.id: seq for seq in seqs[:threshold((cat, seqs))]})

        # The constructor allows reference=None; do not dereference it then.
        if self.reference is not None and self.reference.id not in self.seqs:
            self.seqs[self.reference.id] = self.reference

    def align(self):
        '''
        align the subsampled sequences with mafft inside the scratch directory
        and copy the sequence attributes onto the aligned records
        '''
        from Bio import AlignIO
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        SeqIO.write(self.seqs.values(), "temp_in.fasta", "fasta")
        os.system("mafft --anysymbol --thread " + str(self.nthreads)
                  + " temp_in.fasta > temp_out.fasta")

        self.aln = AlignIO.read('temp_out.fasta', 'fasta')
        self.sequence_lookup = {seq.id: seq for seq in self.aln}
        self.reference_aligned = (self.sequence_lookup[self.reference.id]
                                  if self.reference is not None else None)
        # add attributes to alignment
        for seqid, seq in self.seqs.items():
            self.sequence_lookup[seqid].attributes = seq.attributes
        os.chdir('..')
        remove_dir(self.run_dir)

    def codon_align(self, alignment_tool="mafft", prune=True):
        '''
        takes a nucleotide alignment, translates it, aligns the amino acids, pads the gaps
        note that this suppresses any compensated frameshift mutations

        Parameters:
        - alignment_tool: ['mafft', 'muscle'] the commandline tool to use
        - prune: discard sequences with a stop codon before the last position
        '''
        from Bio import AlignIO, SeqIO
        from Bio.SeqRecord import SeqRecord
        # Reject unknown tools before touching the file system: returning
        # after os.chdir used to leave the process inside the scratch dir.
        if alignment_tool not in ('muscle', 'mafft'):
            print('Alignment tool not supported:', alignment_tool)
            return

        make_dir(self.run_dir)
        os.chdir(self.run_dir)

        # translate
        aa_seqs = {}
        for seq in self.seqs.values():
            tempseq = seq.seq.translate()
            # use only sequences that translate without trouble
            if '*' not in str(tempseq)[:-1] or prune == False:
                aa_seqs[seq.id] = SeqRecord(tempseq, id=seq.id)
            else:
                print(seq.id, "has premature stops, discarding")

        tmpfname = 'temp_in.fasta'
        SeqIO.write(aa_seqs.values(), tmpfname, 'fasta')

        if alignment_tool == 'muscle':
            from Bio.Align.Applications import MuscleCommandline
            cline = MuscleCommandline(input=tmpfname, out=tmpfname[:-5]+'aligned.fasta')
            cline()
            aln_aa = AlignIO.read(tmpfname[:-5]+'aligned.fasta', "fasta")
        else:
            from Bio.Align.Applications import MafftCommandline
            try:
                from StringIO import StringIO
            except ImportError:  # the module moved to io in Python 3
                from io import StringIO
            mafft_cline = MafftCommandline(input=tmpfname)
            stdout, stderr = mafft_cline()
            aln_aa = AlignIO.read(StringIO(stdout), "fasta")

        # generate nucleotide alignment
        self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
        self.sequence_lookup = {seq.id: seq for seq in self.aln}
        self.reference_aligned = (self.sequence_lookup[self.reference.id]
                                  if self.reference is not None else None)
        # add attributes to alignment
        for seq in self.seqs.values():
            if seq.id in self.sequence_lookup:
                self.sequence_lookup[seq.id].attributes = seq.attributes
        os.chdir('..')
        remove_dir(self.run_dir)

    def strip_non_reference(self):
        '''remove every alignment column in which the aligned reference has a gap'''
        ungapped = np.array(self.reference_aligned) != '-'
        from Bio.Seq import Seq
        for seq in self.aln:
            seq.seq = Seq("".join(np.array(seq)[ungapped]))

    def diversity_statistics(self):
        '''
        calculate alignment entropy of nucleotide and optionally protein alignments
        '''
        if not hasattr(self, "aln"):
            print("calculate alignment first")
            return
        self.af = {'nuc': calc_af(self.aln, nuc_alpha)}
        # The last two rows of the frequency table are left out before
        # normalising each column.
        tmp_af = self.af['nuc'][:-2]/self.af['nuc'][:-2].sum(axis=0)
        self.entropy = {'nuc': -(tmp_af*np.log(tmp_af+TINY)).sum(axis=0)}

        if hasattr(self, "translations"):
            for prot, aln in self.translations.items():
                self.af[prot] = calc_af(aln, aa_alpha)
                tmp_af = self.af[prot][:-2]/self.af[prot][:-2].sum(axis=0)
                self.entropy[prot] = -(tmp_af*np.log(tmp_af+TINY)).sum(axis=0)

    def translate(self, proteins=None):
        '''
        translate the alignment for every annotated protein

        proteins -- dict name -> feature location; when None a single 'cds'
                    spanning the whole alignment is used
        '''
        from Bio.SeqFeature import FeatureLocation
        from Bio.Seq import Seq
        from Bio.Align import MultipleSeqAlignment
        if not hasattr(self, "proteins"):
            # generate dictionaries to hold annotation and translation
            self.translations = {}
            self.proteins = {}

        if proteins is None:
            # add a default translation of the entire sequence unless otherwise specified
            self.proteins.update({'cds': FeatureLocation(
                start=0, end=self.aln.get_alignment_length(), strand=1)})
        else:
            self.proteins.update(proteins)

        for prot in self.proteins:
            aa_seqs = []
            for seq in self.aln:
                # Extract outside the try block: a failure here used to make
                # the fallback reuse the record of the previous sequence (or
                # hit an unbound name on the first one).
                tmpseq = self.proteins[prot].extract(seq)
                tmpseq.attributes = seq.attributes
                nucleotides = str(tmpseq.seq)
                try:
                    # soon not needed as future biopython version will translate --- into -
                    tmpseq.seq = Seq(str(Seq(nucleotides.replace('---', 'NNN'))
                                         .translate()).replace('X', '-'))
                except Exception:
                    # Fallback: everything that is not A, C, G or T becomes N.
                    cleaned = "".join([x if x in 'ACGT' else 'N' for x in nucleotides])
                    tmpseq.seq = Seq(str(Seq(cleaned).translate()).replace('X', '-'))
                    print("Trouble translating", seq.id)
                aa_seqs.append(tmpseq)
            self.translations[prot] = MultipleSeqAlignment(aa_seqs)

    def export_diversity(self, fname='entropy.json'):
        '''write the per-position entropies to a json file (computing them if needed)'''
        if not hasattr(self, "entropy"):
            self.diversity_statistics()
        entropy_json = {}
        for feat in self.entropy:
            S = [max(0, round(x, 4)) for x in self.entropy[feat]]
            n = len(S)
            if feat == 'nuc':
                # list(...) keeps the payload a plain list on Python 3 too.
                entropy_json[feat] = {'pos': list(range(0, n)),
                                      'codon': [x//3 for x in range(0, n)],
                                      'val': S}
            else:
                entropy_json[feat] = {
                    'pos': [x for x in self.proteins[feat]][::3],
                    'codon': [(x-self.proteins[feat].start)//3
                              for x in self.proteins[feat]][::3],
                    'val': S}
        write_json(entropy_json, fname, indent=None)
def test_empty_alignment(self):
    """Check that an alignment built from no records reports zero size.

    Both the column count (``get_alignment_length``) and the row count
    (``len``) are expected to be 0.
    """
    empty = MultipleSeqAlignment([])
    n_columns = empty.get_alignment_length()
    self.assertEqual(n_columns, 0)
    n_rows = len(empty)
    self.assertEqual(n_rows, 0)
site.count('N') + site.count('n') + site.count('-')) # count gaps and missing data pcGap_s = nGap_s / nSeq * 100 if pcGap_s > maxpcGap_s: delsites.append(i) # if proportion of seqs in the column with missing data is above threshold, delete column aln_a = np.delete(aln_a, delsites, 1) c = 0 for current_seq in aln: filt1_aln.add_sequence(current_seq.id, ''.join(map(str, list(aln_a[c, ])))) c += 1 length = filt1_aln.get_alignment_length( ) # if length of alignment after column-wise filter is above threshold, continue if length >= minLen: filt2_aln = MultipleSeqAlignment([]) for current_seq in filt1_aln: # for each sequence seq = str(current_seq.seq) nGap = float(seq.count('N') + seq.count('n') + seq.count('-')) pcGap = nGap / length * 100 if pcGap < maxpcGap: # if proportion of missing data in seq is below threshold, print to filtered alignment filt2_aln.add_sequence(current_seq.id, str(current_seq.seq)) filt3_aln = MultipleSeqAlignment([]) for current_seq in filt2_aln: # for each sequence seq = str(current_seq.seq) nHet = float( seq.count('W') + seq.count('w') + seq.count('S') + seq.count('s') + seq.count('M') + seq.count('m') +
given_alpha).next() assert False, "Forcing wrong alphabet, %s, should fail (%s)" \ % (repr(given_alpha), t_filename) except ValueError: pass del good, bad, given_alpha, base_alpha if t_alignment: print "Testing reading %s format file %s as an alignment" \ % (t_format, t_filename) alignment = MultipleSeqAlignment(SeqIO.parse( \ handle=open(t_filename,mode), format=t_format)) assert len(alignment) == t_count alignment_len = alignment.get_alignment_length() #Check the record order agrees, and double check the #sequence lengths all agree too. for i in range(t_count): assert compare_record(records[i], alignment[i]) assert len(records[i].seq) == alignment_len print alignment_summary(alignment) #Some alignment file formats have magic characters which mean #use the letter in this position in the first sequence. #They should all have been converted by the parser, but if #not reversing the record order might expose an error. Maybe. records.reverse() check_simple_write_read(records)
class DisplayedAlignment(object):
    """Display and edit an alignment while keeping all previous versions.

    The wrapped object is indexed like a ``Bio.Align.MultipleSeqAlignment``
    (``alignment[rows, columns]``, ``len()``, ``get_alignment_length()``).
    Every editing method appends a snapshot to ``alignmentHistory`` so that
    ``UndoChanges`` can step back one edit at a time.

    Output is written with single-argument ``print(...)`` calls, which behave
    the same as a Python 2 ``print`` statement and as the Python 3 function.
    """

    def __init__(self, alignment):
        """Wrap ``alignment`` and seed the history with a full slice of it."""
        self.displayedColumn = 0
        self.alignment = alignment
        self.alignmentHistory = [alignment[:, :]]
        self.changed = False
        self.translated = False
        self.translationTable = 1

    def _ParseRange(self, field, start, stop, label):
        """Parse one ``start:stop`` / single-index field of a range string.

        ``start`` and ``stop`` are the defaults used for omitted bounds and
        ``label`` ('taxon' or 'column') is inserted in the alert text.
        Returns ``(start, stop)``, or ``None`` after alerting when a bound is
        not an integer.
        """
        if ':' in field:
            bounds = field.split(':')
            if bounds[0]:  # a start index is specified
                try:
                    start = int(bounds[0].strip())
                except ValueError:
                    self.AlertMessage(
                        'Invalid index format. (%s start index not an integer)'
                        % label, 'high')
                    return None
            if bounds[1]:  # a stop index is specified
                try:
                    stop = int(bounds[1].strip())
                except ValueError:
                    self.AlertMessage(
                        'Invalid index format. (%s stop index not an integer)'
                        % label, 'high')
                    return None
        elif field:  # a single index is specified
            try:
                start = stop = int(field.strip())
            except ValueError:
                self.AlertMessage(
                    'Invalid index format. (%s start or stop index not an integer)'
                    % label, 'high')
                return None
        return (start, stop)

    def ParseIndex(self, text):
        """Parse a ``rows,columns`` range string.

        Expects the text to be in the format used to specify a range from a
        Bio.Align.MultipleSeqAlignment, e.g. ``'0:3,10:20'``, ``',5'`` or
        ``'2,'``.  Omitted bounds default to the full extent of the alignment.

        Returns ``(taxonStart, taxonStop, columnStart, columnStop)`` with
        inclusive stops, or ``(-1, -1, -1, -1)`` after alerting the user when
        the text is malformed or describes an empty/negative range.
        """
        invalid = (-1, -1, -1, -1)
        if ',' not in text:
            self.AlertMessage(
                'Invalid index format. (taxa or columns missing)', 'high')
            return invalid
        indices = text.strip().split(',')
        if len(indices) > 2:
            self.AlertMessage('Invalid index format. (too many fields)',
                              'high')
            return invalid

        taxa = self._ParseRange(indices[0], 0, len(self.alignment) - 1,
                                'taxon')
        if taxa is None:
            return invalid
        columns = self._ParseRange(
            indices[1], 0, self.alignment.get_alignment_length() - 1,
            'column')
        if columns is None:
            return invalid

        taxonStart, taxonStop = taxa
        columnStart, columnStop = columns
        if (0 <= taxonStart <= taxonStop) and (0 <= columnStart <= columnStop):
            return (taxonStart, taxonStop, columnStart, columnStop)
        self.AlertMessage('Invalid index range. (start > stop or index < 0)',
                          'high')
        return invalid

    def ColorizeDNA(self, text):
        """Wrap A/G/T/C in an ANSI colour escape; return other text as is."""
        if text == 'A':
            escape = '\033[92m'  # Green
        elif text == 'G':
            escape = '\033[93m'  # Yellow
        elif text == 'T':
            escape = '\033[91m'  # Red
        elif text == 'C':
            escape = '\033[96m'  # Blue
        else:
            return text
        return escape + text + '\033[0m'

    def ColorizeAA(self, text):
        """Wrap an amino-acid letter in an ANSI colour escape by group.

        Letters outside the four groups below are returned unchanged.
        """
        if text in ['A', 'F', 'H', 'I', 'K', 'L', 'M', 'P', 'R', 'V', 'W']:
            escape = '\033[91m'  # Red
        elif text in ['C', 'G', 'N', 'Q', 'S', 'T', 'Y', 'B', 'Z']:
            escape = '\033[96m'  # Blue
        elif text in ['D', 'E']:
            escape = '\033[92m'  # Green
        elif text in ['X', '*']:
            escape = '\033[93m'  # Yellow
        else:
            return text
        return escape + text + '\033[0m'

    def AlertMessage(self, text, severity='low'):
        """Print ``text`` with a tag and colour matching the severity.

        ``severity`` is 'low', 'medium' or 'high'; anything other than
        'high'/'medium' is shown as 'low'.
        """
        if severity == 'high':
            escape = '\033[91m'  # Red
            tag = '!!!'
        elif severity == 'medium':
            escape = '\033[93m'  # Yellow
            tag = '***'
        else:
            escape = '\033[92m'  # Green
            tag = ' '
        print('%s %s %s' % (escape + tag, text, tag + '\033[0m'))

    def Show(self, column=0):
        """Display 100 columns of the alignment beginning at ``column``.

        Negative columns are clamped to 0.  In translated mode 300 nucleotide
        columns are shown as 100 codons: '---' becomes '-', a codon with
        some gaps becomes '?', anything else is translated with
        ``self.translationTable``.  The shown column is remembered in
        ``self.displayedColumn``.
        """
        if column < 0:
            column = 0
        marker = '| : ' * 10
        spacer = ' ' * 15
        markerRow = spacer + marker
        alignmentLength = self.alignment.get_alignment_length()

        if not self.translated:
            width = 100
            firstIndex = column
        else:
            width = 300
            firstIndex = column // 3  # index row counts codons, not bases
        indexRow = spacer + ''.join(
            str(index).ljust(10)
            for index in range(firstIndex, firstIndex + 100, 10))
        moreColumns = column + width < alignmentLength

        print(indexRow)
        print(markerRow)
        window = self.alignment[:, column:column + width]
        for row, sequence in enumerate(window):
            if not self.translated:
                dnaSequence = ''.join(
                    self.ColorizeDNA(nucleotide)
                    for nucleotide in str(sequence.seq))
                line = ('%2d) %10s' % (row, sequence.id)) + ' ' + dnaSequence
            else:
                proteinSequence = ''
                for codonPosition in range(0, len(sequence), 3):
                    codon = sequence.seq[codonPosition:codonPosition + 3]
                    if str(codon) == '---':
                        proteinSequence += '-'
                    elif '-' in codon:
                        proteinSequence += '?'
                    else:
                        proteinSequence += self.ColorizeAA(
                            str(codon.translate(table=self.translationTable)))
                line = '%2d) %10s %s' % (row, sequence.id, proteinSequence)
            if moreColumns:
                line += ' ...'  # more of the alignment lies to the right
            print(line)
        print(markerRow)
        print(indexRow)
        self.displayedColumn = column

    def BackupAlignment(self):
        """Append a full slice of the current alignment to the history."""
        self.alignmentHistory.append(self.alignment[:, :])

    def UndoChanges(self):
        """Revert to the previous state in the alignment change history.

        Does not affect which column index is displayed or whether the
        sequence is displayed as translated or not, since those are not
        changes to the alignment.
        """
        if len(self.alignmentHistory) > 1:
            self.alignmentHistory.pop()
            self.alignment = self.alignmentHistory[-1][:, :]
            self.Show(self.displayedColumn)
        else:
            self.AlertMessage('Nothing to undo.', 'low')

    def DeleteRange(self, rangeText, silent=False):
        """Remove a row and column range from the alignment.

        A range covering every column deletes the selected taxa outright.
        Otherwise the columns are cut from the selected taxa and, when only
        some of the taxa are selected, those taxa are padded on the right
        with gaps.  ``silent=True`` suppresses the redisplay.
        """
        startTaxon, stopTaxon, startColumn, stopColumn = self.ParseIndex(
            rangeText)
        if self.translated:
            # Codon indices -> nucleotide columns.
            startColumn = startColumn * 3
            stopColumn = (stopColumn * 3) + 2
        if startTaxon >= 0:  # Make sure we had a valid range
            changeLength = 0
            deleteTaxon = (startColumn == 0
                           and stopColumn == len(self.alignment[0]) - 1)
            if startTaxon > 0 or stopTaxon < len(self.alignment) - 1:
                changeLength = (stopColumn - startColumn) + 1
            newSequences = []
            for taxon, Sequence in enumerate(self.alignment):
                if startTaxon <= taxon <= stopTaxon:
                    if not deleteTaxon:
                        if startColumn > 0:
                            Sequence.seq = (Sequence.seq[:startColumn] +
                                            Sequence.seq[stopColumn + 1:])
                        else:
                            Sequence.seq = Sequence.seq[stopColumn + 1:]
                        if changeLength:
                            Sequence.seq = Sequence.seq + Seq(
                                '-' * changeLength)
                        newSequences.append(Sequence)
                else:
                    newSequences.append(Sequence)
            self.alignment = MultipleSeqAlignment(newSequences)
            if not silent:
                self.Show(self.displayedColumn)
            self.BackupAlignment()

    def ModifyRange(self, rangeText, nucleotide='-'):
        """Change a row and column range to a specified nucleotide.

        Has no effect when the alignment is translated, since the
        corresponding change to the underlying nucleotide alignment would be
        ambiguous at best.
        """
        nucleotide = nucleotide.upper()
        if self.translated:
            self.AlertMessage("Can't modify protein sequences.", 'medium')
        elif nucleotide not in [
                'A', 'G', 'C', 'T', 'R', 'K', 'S', 'W', 'M', 'Y', 'D', 'V',
                'B', 'H', 'N', '-'
        ]:
            self.AlertMessage(
                'Invalid nucleotide. (only AGTC- and IUB nucleotide codes are permitted)',
                'high')
        else:
            startTaxon, stopTaxon, startColumn, stopColumn = self.ParseIndex(
                rangeText)
            if startTaxon >= 0:  # Make sure we have a valid range
                newSequences = []
                modificationLength = (stopColumn - startColumn) + 1
                for taxon, Sequence in enumerate(self.alignment):
                    if startTaxon <= taxon <= stopTaxon:
                        replacement = Seq(nucleotide * modificationLength)
                        if startColumn > 0:
                            Sequence.seq = (Sequence.seq[:startColumn] +
                                            replacement +
                                            Sequence.seq[stopColumn + 1:])
                        else:
                            Sequence.seq = (replacement +
                                            Sequence.seq[stopColumn + 1:])
                    newSequences.append(Sequence)
                self.alignment = MultipleSeqAlignment(newSequences)
                self.Show(self.displayedColumn)
                self.BackupAlignment()

    def InsertRange(self, rangeText):
        """Insert a gap-filled row and column range into the alignment.

        Selected taxa get the gaps at the start column; all other taxa are
        padded on the right by the same amount.  In translated mode the
        range is given in codons, so gaps are inserted three at a time.
        """
        startTaxon, stopTaxon, startColumn, stopColumn = self.ParseIndex(
            rangeText)
        if self.translated:
            startColumn = startColumn * 3
            stopColumn = (stopColumn * 3) + 2
        if startTaxon >= 0:  # Make sure we had a valid range
            changeLength = (stopColumn - startColumn) + 1
            newSequences = []
            for taxon, Sequence in enumerate(self.alignment):
                if startTaxon <= taxon <= stopTaxon:
                    if startColumn > 0:
                        Sequence.seq = (Sequence.seq[:startColumn] +
                                        Seq('-' * changeLength) +
                                        Sequence.seq[startColumn:])
                    else:
                        Sequence.seq = Seq(
                            '-' * changeLength) + Sequence.seq[:]
                else:
                    Sequence.seq = Sequence.seq + Seq('-' * changeLength)
                newSequences.append(Sequence)
            self.alignment = MultipleSeqAlignment(newSequences)
            self.Show(self.displayedColumn)
            self.BackupAlignment()

    def Jump(self, column):
        """Move the display to ``column`` (a codon index when translated)."""
        if self.translated:
            column = column * 3
        self.Show(column)

    def ScrollRight(self, offset=100):
        """Scroll the display ``offset`` columns (or codons) to the right."""
        if self.translated:
            offset = offset * 3
        self.Show(self.displayedColumn + offset)

    def ScrollLeft(self, offset=100):
        """Scroll the display ``offset`` columns (or codons) to the left."""
        if self.translated:
            offset = offset * 3
        self.Show(self.displayedColumn - offset)

    def Reverse(self):
        """Reverse the column order. Has no effect on translated sequences."""
        if not self.translated:
            self.alignment = self.alignment[:, ::-1]
            self.Show(self.displayedColumn)
            self.BackupAlignment()
        else:
            self.AlertMessage("Can't reverse protein sequences.", 'medium')

    def Complement(self):
        """Complement every sequence. Has no effect on translated sequences."""
        if not self.translated:
            for record in self.alignment:
                record.seq = record.seq.complement()
            self.Show(self.displayedColumn)
            # The original referenced the method without calling it, so the
            # complemented state never reached the undo history.
            self.BackupAlignment()
        else:
            self.AlertMessage("Can't complement protein sequences.", 'medium')

    def ReverseComplement(self):
        """Reverse-complement every sequence.

        Has no effect on translated sequences.
        """
        if not self.translated:
            for record in self.alignment:
                record.seq = record.seq.reverse_complement()
            self.Show(self.displayedColumn)
            self.BackupAlignment()
        else:
            self.AlertMessage("Can't reverse-complement protein sequences.",
                              'medium')

    def Translate(self, translationTable=11):
        """Switch to displaying and manipulating the alignment as protein.

        Calling it again with the same table toggles back to DNA via
        ``BackTranslate``; calling it with a different table re-displays the
        translation with the new table.
        """
        if (not self.translated
                or self.translationTable != translationTable):
            self.translated = True
            self.translationTable = translationTable
            # Snap the display to a codon boundary.
            self.displayedColumn = self.displayedColumn - (
                self.displayedColumn % 3)
            self.Show(self.displayedColumn)
        else:
            self.BackTranslate()

    def BackTranslate(self):
        """Revert to displaying and manipulating the alignment as DNA.

        Has no effect (other than an alert) if it is already DNA.
        """
        if self.translated:
            self.translated = False
            self.Show(self.displayedColumn)
        else:
            self.AlertMessage(
                "Can't back-translate. Alignment contains DNA sequences",
                'medium')

    def Save(self, fileName='alignment.phy', alignmentFormat='phylip'):
        """Write the alignment to disk with ``AlignIO.write``."""
        AlignIO.write(self.alignment, fileName, alignmentFormat)
        self.AlertMessage(
            'Saved alignment to ' + fileName + ' in ' + alignmentFormat +
            ' format.', 'low')

    def CleanUp(self):
        """Condense the alignment by removing columns that are gaps in all taxa.

        NOTE(review): columns are removed through ``DeleteRange``, which
        rescales column indices by 3 in translated mode and records one
        history entry per removed column - confirm that is intended.
        """
        blankColumnPattern = re.compile('^-*$')
        blankColumns = []
        # Scan every column; the original stopped one short and never
        # examined the final column.
        for columnIndex in range(self.alignment.get_alignment_length()):
            columnValues = self.alignment[:, columnIndex]
            if blankColumnPattern.search(columnValues):
                blankColumns.append(str(columnIndex))
        # Delete right-to-left so earlier indices stay valid.
        for column in blankColumns[::-1]:
            self.DeleteRange(',' + str(column), True)
        self.Show(self.displayedColumn)
        self.BackupAlignment()
# standard library import os # biopython from Bio import Alphabet from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio.Alphabet import IUPAC from Bio.Align import AlignInfo from Bio import AlignIO from Bio.SubsMat import FreqTable from Bio.Align import MultipleSeqAlignment #Very simple tests on an empty alignment alignment = MultipleSeqAlignment([], Alphabet.generic_alphabet) assert alignment.get_alignment_length() == 0 assert len(alignment) == 0 del alignment #Basic tests on simple three string alignment alignment = MultipleSeqAlignment([], Alphabet.generic_alphabet) letters = "AbcDefGhiJklMnoPqrStuVwxYz" alignment.append(SeqRecord(Seq(letters), id="mixed")) alignment.append(SeqRecord(Seq(letters.lower()), id="lower")) alignment.append(SeqRecord(Seq(letters.upper()), id="upper")) assert alignment.get_alignment_length() == 26 assert len(alignment) == 3 assert str(alignment[0].seq) == letters assert str(alignment[1].seq) == letters.lower() assert str(alignment[2].seq) == letters.upper() assert alignment[0].id == "mixed"
class sequence_set(object):
    """A named set of DNA sequences plus a reference, with alignment helpers.

    Sequences are held as ``SeqRecord`` objects in ``self.seqs`` (keyed by
    name); each carries an ``attributes`` dict.  After ``align`` (or a
    successful ``try_restore_align_from_disk``) the alignment is available as
    ``self.aln`` and the remaining methods operate on it.
    """

    def __init__(self, logger, sequences, reference, dateFormat):
        """Build the records from parsed JSON.

        ``sequences`` maps name -> {"seq": ..., "attributes": {...}} where
        attributes contain "raw_date"; ``reference`` provides "included",
        "strain", "seq" and optionally "genes" (name -> start/end/strand).
        """
        super(sequence_set, self).__init__()
        self.log = logger

        # load sequences from the (parsed) JSON - don't forget to sort out dates
        self.seqs = {}
        for name, data in sequences.items():
            record = SeqRecord(Seq(data["seq"], generic_dna),
                               id=name, name=name, description=name)
            record.attributes = data["attributes"]
            # tidy up dates
            date_struc = parse_date(record.attributes["raw_date"], dateFormat)
            record.attributes["num_date"] = date_struc[1]
            record.attributes["date"] = date_struc[2]
            self.seqs[name] = record

        # if the reference is to be analysed it'll already be in the (filtered & subsampled)
        # sequences, so no need to add it here, and no need to care about attributes etc
        # we do, however, need it for alignment
        self.reference_in_dataset = reference["included"]
        name = reference["strain"]
        self.reference_seq = SeqRecord(Seq(reference["seq"], generic_dna),
                                       id=name, name=name, description=name)
        if "genes" in reference and len(reference["genes"]):
            self.proteins = {}
            for k, v in reference["genes"].items():
                start, end, strand = v["start"], v["end"], v["strand"]
                feature = FeatureLocation(start=start, end=end, strand=strand)

                # Translate sequences to identify any proteins ending with a stop codon.
                translation = Seq.translate(Seq(feature.extract(str(self.reference_seq.seq))))
                if translation.endswith("*"):
                    # Truncate the last codon of the protein to omit the stop codon.
                    # On the minus strand the feature is reverse-complemented
                    # when extracted, so its last codon sits at the *start*
                    # coordinate rather than at the end.
                    if strand == -1:
                        feature = FeatureLocation(start=start + 3, end=end, strand=strand)
                    else:
                        feature = FeatureLocation(start=start, end=end - 3, strand=strand)

                self.proteins[k] = feature
        else:
            self.proteins = None

        # other things:
        self.run_dir = '_'.join(['temp', time.strftime('%Y%m%d-%H%M%S', time.gmtime()),
                                 str(random.randint(0, 1000000))])
        self.nthreads = 2  # should load from config file

    def convert_trait_to_numerical_date(self, trait, dateFormat):
        """Replace ``attributes[trait]`` of every sequence by its parsed numeric date.

        Sequences for which a ``KeyError`` is raised (e.g. the trait is
        missing) are logged and left untouched.
        """
        for seq in self.seqs.values():
            try:
                date_struc = parse_date(seq.attributes[trait], dateFormat)
                seq.attributes[trait] = date_struc[1]
            except KeyError:
                self.log.warn("Attribute {} not found for sequence {}. Ignoring".format(trait, seq.name))

    def codon_align(self):
        """Placeholder: logs a fatal message, nothing is aligned."""
        self.log.fatal("Codon align not yet implemented")

    def align(self, verbose, debug=False):
        '''
        align sequences using mafft

        The reference is added to the input when it is not part of the
        dataset.  With ``verbose == 0`` mafft's stderr is redirected to a
        file in the run directory.  The run directory is removed afterwards
        unless ``debug`` is true.

        side-effects:
            self.aln {MultipleSeqAlignment} as read from the mafft output
                (the reference is not removed here)
            self.reference_aln {SeqRecord} the aligned reference
            self.sequence_lookup {dict} map linking seq.id to the alignment
        '''
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        try:
            if self.reference_in_dataset:
                out_seqs = self.seqs.values()
            else:
                self.log.notify("Adding reference for alignment step")
                out_seqs = list(self.seqs.values()) + [self.reference_seq]

            SeqIO.write(out_seqs, "temp_in.fasta", "fasta")
            self.log.notify("Running alignment")
            command = "mafft --anysymbol --thread " + str(self.nthreads) + " temp_in.fasta 1> temp_out.fasta"
            if verbose == 0:
                command += " 2>mafft_stderr"
            os.system(command)

            self.aln = AlignIO.read('temp_out.fasta', 'fasta')
        finally:
            # Always step back out, so a failed run does not leave the
            # process stranded inside the temporary directory.
            os.chdir("..")
        if not debug:
            remove_dir(self.run_dir)

        self.set_reference_alignment()  # make reference_aln object while reference still in alignment
        self.set_sequence_lookup()
        self.add_attributes_to_aln()

    def remove_reference_from_alignment(self):
        """Rebuild ``self.aln`` without the reference; exactly one row must go."""
        count = len(self.aln)
        self.aln = MultipleSeqAlignment([s for s in self.aln if s.name != self.reference_seq.name])
        assert(count == (len(self.aln) + 1))

    def set_reference_alignment(self):
        """Store the aligned reference row in ``self.reference_aln``.

        Raises ``IndexError`` when no row carries the reference name.
        """
        self.reference_aln = [x for x in list(self.aln) if x.name == self.reference_seq.name][0]

    def set_sequence_lookup(self):
        """Index the rows of ``self.aln`` by their id."""
        self.sequence_lookup = {seq.id: seq for seq in self.aln}

    def add_attributes_to_aln(self):
        """Copy the ``attributes`` of every input sequence onto its aligned row."""
        for seqid, seq in self.seqs.items():
            self.sequence_lookup[seqid].attributes = seq.attributes

    def try_restore_align_from_disk(self, fname):
        """Try to load a previously saved FASTA alignment from ``fname``.

        Returns silently, leaving no ``self.aln`` behind, when the file
        cannot be read or its sequence ids differ from ``self.seqs``.  On
        success the lookup table and attributes are set up as after
        ``align``.
        """
        try:
            self.aln = AlignIO.read(fname, "fasta")
        except IOError:
            return
        except Exception as e:
            self.log.notify("Error restoring from alignment... re-doing")
            print(e)
            return

        try:
            self.set_reference_alignment()
            reference_found = True
        except IndexError:
            self.log.notify("Reference not found in alignment... ok on reload")
            reference_found = False

        # Only strip the reference when it is actually present; otherwise the
        # row-count assertion in remove_reference_from_alignment would fail.
        if reference_found and not self.reference_in_dataset:
            self.remove_reference_from_alignment()

        if len({x.id for x in self.aln} ^ set(self.seqs.keys())) != 0:
            self.log.notify("Alignment on disk had different sequences... re-doing")
            del self.aln
            if hasattr(self, "reference_aln"):  # unset if the reference was missing
                del self.reference_aln
            return

        # at this stage we are happy with the alignment
        self.set_sequence_lookup()
        self.add_attributes_to_aln()
        self.log.notify("Alignment restored from disk")

    def strip_non_reference(self):
        '''
        remove insertions relative to the reference from the alignment
        '''
        ungapped = np.array(self.reference_aln) != '-'
        for seq in self.aln:
            seq.seq = Seq("".join(np.array(seq)[ungapped]))

    def make_gaps_ambiguous(self):
        '''
        replace all gaps by 'N' in all sequences in the alignment. TreeTime will treat them
        as fully ambiguous and replace then with the most likely state
        '''
        for seq in self.aln:
            seq_array = np.array(seq)
            gaps = seq_array == '-'
            seq_array[gaps] = 'N'
            seq.seq = Seq("".join(seq_array))

    def make_terminal_gaps_ambiguous(self):
        '''
        replace all gaps at the end of sequences by 'N' in the alignment. TreeTime will treat them
        as fully ambiguous and replace then with the most likely state
        '''
        for seq in self.aln:
            str_seq = str(seq.seq)
            lgaps = len(str_seq) - len(str_seq.lstrip('-'))
            # Strip the right side from what is left after the leading gaps,
            # so an all-gap sequence is not counted twice (which would
            # double its length).
            core = str_seq[lgaps:].rstrip('-')
            rgaps = len(str_seq) - lgaps - len(core)
            seq.seq = Seq('N' * lgaps + core + 'N' * rgaps)

    def translate(self):
        '''
        make alignments of translations.

        Fills self.translations with one MultipleSeqAlignment per entry of
        self.proteins; when no proteins are annotated the whole alignment is
        translated as a single feature named 'cds'.
        '''
        self.translations = {}
        # __init__ stores None when the reference has no gene annotation.
        if getattr(self, "proteins", None) is None:
            self.proteins = {}

        # add a default translation of the entire sequence unless otherwise specified
        if len(self.proteins) == 0:
            self.proteins.update({'cds': FeatureLocation(start=0,
                end=self.aln.get_alignment_length(), strand=1)})

        # loop over all proteins and create one MSA for each
        for prot in self.proteins:
            aa_seqs = []
            for seq in self.aln:
                tmpseq = self.proteins[prot].extract(seq)
                translated_seq, translation_exception = safe_translate(str(tmpseq.seq), report_exceptions=True)
                if translation_exception:
                    self.log.notify("Trouble translating because of invalid codons %s" % seq.id)

                tmpseq.seq = Seq(translated_seq)

                # copy attributes
                tmpseq.attributes = seq.attributes
                aa_seqs.append(tmpseq)
            self.translations[prot] = MultipleSeqAlignment(aa_seqs)

    def clock_filter(self, root_seq=None, n_iqd=3, max_gaps=1.0, plot=False):
        '''
        remove sequences form the set that are that evolve much faster or slower
        compared the majority. Regions with predominantly gaps can be removed since
        this can skew the evolutionary rates.

        root_seq: None (use the consensus), the id of an aligned sequence, or
            an array of characters to measure distances against.
        n_iqd: residuals beyond n_iqd inter-quartile distances are dropped.
        max_gaps: columns whose gap frequency is >= max_gaps are ignored.
        plot: scatter-plot date vs distance, outliers in red.
        '''
        if root_seq is None:  # use consensus
            af = calc_af(self.aln, nuc_alpha)
            # Index a character array (not np.fromstring bytes) so the result
            # compares element-wise against np.array(seq) on Python 3 too.
            root_seq = np.array(list(nuc_alpha))[af.argmax(axis=0)]
        if isinstance(root_seq, str) and root_seq in self.sequence_lookup:
            root_seq = np.array(self.sequence_lookup[root_seq])
        if max_gaps < 1.0:
            af = calc_af(self.aln, nuc_alpha)
            good_pos = af[nuc_alpha.index('-')] < max_gaps
        else:
            good_pos = np.ones(self.aln.get_alignment_length(), dtype=bool)
        date_vs_distance = {}
        # self.reference_aln = None already set at alignment step
        for seq in self.aln:
            seq_array = np.array(seq)
            informative = (seq_array != '-') & (root_seq != '-') & good_pos
            date_vs_distance[seq.id] = (seq.attributes['num_date'],
                                        np.mean((seq_array != root_seq)[informative]))
        # list(...) is required: np.array(dict_values) is a 0-d object array.
        date_vs_distance_array = np.array(list(date_vs_distance.values()))
        from scipy.stats import linregress, scoreatpercentile
        slope, intercept, rval, pval, stderr = linregress(date_vs_distance_array[:, 0], date_vs_distance_array[:, 1])
        print("distance vs time regression:", slope)
        residuals = (intercept + slope * date_vs_distance_array[:, 0]) - date_vs_distance_array[:, 1]
        IQD = scoreatpercentile(residuals, 75) - scoreatpercentile(residuals, 25)
        if plot:
            import matplotlib.pyplot as plt
            plt.ion()
            plt.scatter(date_vs_distance_array[:, 0], date_vs_distance_array[:, 1], c='g')
            bad_points = abs(intercept + slope * date_vs_distance_array[:, 0] - date_vs_distance_array[:, 1]) > n_iqd * IQD
            plt.scatter(date_vs_distance_array[bad_points, 0], date_vs_distance_array[bad_points, 1], c='r')

        print("before clock filter:", len(self.aln))
        tmp = {seq.id: seq for seq in self.aln
               if abs(intercept + slope * date_vs_distance[seq.id][0] - date_vs_distance[seq.id][1]) < n_iqd * IQD}
        # The class stores the reference as reference_seq and
        # reference_in_dataset; there is no self.reference attribute.
        if self.reference_seq.id not in tmp and self.reference_in_dataset:
            self.log.notify('adding reference again after clock filter')
            tmp[self.reference_seq.id] = self.reference_aln
        self.aln = MultipleSeqAlignment(list(tmp.values()))
        print("after clock filter:", len(self.aln))

    def diversity_statistics(self):
        '''
        calculate alignment entropy of nucleotide and optionally protein alignments

        Results go to self.af (frequencies from calc_af) and self.entropy,
        both keyed by 'nuc' and by protein name.  The last two rows of each
        frequency table are excluded before normalising.
        '''
        if not hasattr(self, "aln"):
            self.log.fatal("Diversity statistics calculated before alignment generated.")
            return
        self.af = {'nuc': calc_af(self.aln, nuc_alpha)}
        tmp_af = self.af['nuc'][:-2] / self.af['nuc'][:-2].sum(axis=0)
        self.entropy = {'nuc': -(tmp_af * np.log(tmp_af + TINY)).sum(axis=0)}

        if hasattr(self, "translations"):
            for prot, aln in self.translations.items():
                self.af[prot] = calc_af(aln, aa_alpha)
                tmp_af = self.af[prot][:-2] / self.af[prot][:-2].sum(axis=0)
                self.entropy[prot] = -(tmp_af * np.log(tmp_af + TINY)).sum(axis=0)

    def export_diversity(self, fname='entropy.json', indent=None):
        '''
        write the alignment entropy of each alignment (nucleotide and translations) to file
        '''
        if not hasattr(self, "entropy"):
            self.diversity_statistics()
        entropy_json = {}
        for feat in self.entropy:
            S = [max(0, round(x, 4)) for x in self.entropy[feat]]
            n = len(S)
            if feat == 'nuc':
                entropy_json[feat] = {'pos': list(range(0, n)),
                                      'codon': [x // 3 for x in range(0, n)],
                                      'val': S}
            else:
                # every third nucleotide position of the feature = one codon
                entropy_json[feat] = {'pos': [x for x in self.proteins[feat]][::3],
                                      'codon': [(x - self.proteins[feat].start) // 3 for x in self.proteins[feat]][::3],
                                      'val': S}
        write_json(entropy_json, fname, indent=indent)
def __next__(self):
    """Parse the next alignment from the handle.

    Reads one GCG MSF alignment from ``self.handle`` and returns it as a
    ``MultipleSeqAlignment`` whose records carry the per-sequence weight in
    ``annotations["weight"]``.  If the header of a following alignment is
    encountered it is stashed in ``self._header`` for the next call.

    Raises ``StopIteration`` at end of file and ``ValueError`` for
    malformed input (``NotImplementedError`` for names containing spaces).
    """
    handle = self.handle

    if self._header is None:
        line = handle.readline()
    else:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        self._header = None

    if not line:
        raise StopIteration

    # Whitelisted headers we know about.
    known_headers = [
        "!!NA_MULTIPLE_ALIGNMENT", "!!AA_MULTIPLE_ALIGNMENT", "PileUp"
    ]
    # Examples in "Molecular Biology Software Training Manual GCG version 10"
    # by BBSRC Biosciences IT Services (BITS), Harpenden, UK, Copyright 1996-2001
    # would often start as follows:
    #
    # !!AA_MUTIPLE_ALIGNMENT 1.0
    # PileUp of: @/usr/users2/culhane/...
    #
    # etc with other seemingly free format text before getting to the
    # MSF/Type/Check line and the following Name: lines block and // line.
    #
    # MUSCLE just has a line "PileUp", while other sources just use the line
    # "!!AA_MULTIPLE_ALIGNMENT" (amino acid) or "!!NA_MULTIPLE_ALIGNMENT"
    # (nucleotide).
    header_words = line.strip().split()
    if not header_words:
        # A whitespace-only line would otherwise fail with an IndexError.
        raise ValueError(
            "Blank line where a GCG MSF header was expected: %s" %
            ", ".join(known_headers))
    if header_words[0] not in known_headers:
        raise ValueError(
            "%s is not a known GCG MSF header: %s" %
            (header_words[0], ", ".join(known_headers)))

    while line and " MSF: " not in line:
        line = handle.readline()

    if not line:
        raise ValueError(
            "Reached end of file without MSF/Type/Check header line")

    # Quoting from "Molecular Biology Software Training Manual GCG version 10"
    # by BBSRC Biosciences IT Services (BITS), Harpenden, UK. Copyright 1996-2001.
    # Page 31:
    #
    # "Header information is before a .. (double dot) in a GCG format file.
    # The file will also have a checksum specific for that file."
    #
    # This was followed by a single non-aligned sequence, but this convention
    # appears to also be used in the GCG MSF files. Quoting other examples in
    # this reference, page 31:
    #
    # localpileup_17.msf MSF: 195 Type: P January 6, 2000 15:41 Check: 4365 ..
    #
    # Except from page 148:
    #
    # localpileup_106.msf MSF: 457 Type: P November 28, 2000 16:09 Check: 2396 ..
    #
    # Quoting output from MUSCLE v3.8, have two leading spaces and a zero checksum:
    #
    # MSF: 689 Type: N Check: 0000 ..
    #
    # By observation, the MSF value is the column count, type is N (nucleotide)
    # or P (protein / amino acid).
    #
    # In a possible bug, EMBOSS v6.6.0.0 uses CompCheck: rather than Check: as shown,
    #
    # $ seqret -sequence Tests/Fasta/f002 -auto -stdout -osformat msf
    # !!NA_MULTIPLE_ALIGNMENT 1.0
    #
    # stdout MSF: 633 Type: N 01/08/19 CompCheck: 8543 ..
    #
    # Name: G26680 Len: 633 Check: 4334 Weight: 1.00
    # Name: G26685 Len: 633 Check: 3818 Weight: 1.00
    # Name: G29385 Len: 633 Check: 391 Weight: 1.00
    #
    # //
    #
    parts = line.strip("\n").split()
    offset = parts.index("MSF:")
    if (parts[offset + 2] != "Type:"
            or parts[-3] not in ("Check:", "CompCheck:")
            or parts[-1] != ".."):
        raise ValueError(
            "GCG MSF header line should be "
            "'<optional text> MSF: <int> Type: <letter> <optional date> Check: <int> ..', "
            " not: %r" % line)
    try:
        aln_length = int(parts[offset + 1])
    except ValueError:
        aln_length = -1
    if aln_length < 0:
        # The field is "MSF:"; the original message misspelled it as "MDF:".
        raise ValueError(
            "GCG MSF header line should have MSF: <int> for column count, not %r"
            % parts[offset + 1])
    seq_type = parts[offset + 3]
    if seq_type not in ["P", "N"]:
        raise ValueError(
            "GCG MSF header line should have 'Type: P' (protein) "
            "or 'Type: N' (nucleotide), not 'Type: %s'" % seq_type)

    # There should be a blank line after that header line, then the Name: lines
    #
    # In a possible bug, T-COFFEE v12.00 adds 'oo' after the names, as shown here,
    #
    # PileUp
    #
    #
    #
    # MSF: 628 Type: P Check: 147 ..
    #
    # Name: AK1H_ECOLI/1-378 oo Len: 628 Check: 3643 Weight: 1.000
    # Name: AKH_HAEIN/1-382 oo Len: 628 Check: 6504 Weight: 1.000
    #
    # //
    ids = []
    lengths = []
    checks = []
    weights = []
    line = handle.readline()
    while line and line.strip() != "//":
        line = handle.readline()
        if line.strip().startswith("Name: "):
            if " Len: " in line and " Check: " in line and " Weight: " in line:
                rest = line[line.index("Name: ") + 6:].strip()
                name, rest = rest.split(" Len: ")
                length, rest = rest.split(" Check: ")
                check, weight = rest.split(" Weight: ")
                name = name.strip()
                if name.endswith(" oo"):
                    # T-COFFEE oddity, ignore this
                    name = name[:-3]
                if name in ids:
                    raise ValueError("Duplicated ID of %r" % name)
                if " " in name:
                    raise NotImplementedError("Space in ID %r" % name)
                ids.append(name)
                # Expect aln_length <= int(length.strip()), see below
                lengths.append(int(length.strip()))
                checks.append(int(check.strip()))
                weights.append(float(weight.strip()))
            else:
                raise ValueError("Malformed GCG MSF name line: %r" % line)
    if not line:
        raise ValueError(
            "End of file while looking for end of header // line.")
    if not ids:
        # Without this, max(lengths) below fails with an unhelpful
        # "max() arg is an empty sequence".
        raise ValueError(
            "No Name: lines found in GCG MSF header before // line.")

    max_length = max(lengths)
    if aln_length != max_length:
        # In broken examples from IMGTHLA was possible to continue
        # https://github.com/ANHIG/IMGTHLA/issues/201
        max_count = lengths.count(max_length)
        raise ValueError(
            "GCG MSF header said alignment length %i, but %s of %i sequences said Len: %s"
            % (aln_length, max_count, len(ids), max_length))

    line = handle.readline()
    if not line:
        raise ValueError("End of file after // line, expected sequences.")
    if line.strip():
        raise ValueError(
            "After // line, expected blank line before sequences.")

    # Now load the sequences
    seqs = [[] for _ in ids]  # list of empty lists
    completed_length = 0
    while completed_length < aln_length:
        # Last column expected in this block (blocks hold up to 50 columns).
        block_end = min(completed_length + 50, aln_length)
        # Note might have a coordinate header line (seems to be optional)
        for idx, name in enumerate(ids):
            line = handle.readline()
            if idx == 0 and not line.strip():
                # T-COFFEE uses two blank lines between blocks, rather than one
                while line and not line.strip():
                    line = handle.readline()
            if not line:
                raise ValueError(
                    "End of file where expecting sequence data.")
            words = line.strip().split()
            # Should we use column numbers, rather than assuming no spaces in names?
            if idx == 0 and words and words[0] != name:
                # Hopefully this is a coordinate header before the first seq
                try:
                    i = int(words[0])
                except ValueError:
                    i = -1
                if i != completed_length + 1:
                    raise ValueError(
                        "Expected GCG MSF coordinate line starting %i, got: %r"
                        % (completed_length + 1, line))
                if len(words) > 1:
                    # Final block usually not full 50 chars, so expect start only.
                    if len(words) != 2:
                        i = -1
                    else:
                        try:
                            i = int(words[1])
                        except ValueError:
                            i = -1
                    if i != block_end:
                        raise ValueError(
                            "Expected GCG MSF coordinate line %i to %i, got: %r"
                            % (completed_length + 1, block_end, line))
                line = handle.readline()
                words = line.strip().split()
            # Dealt with any coordinate header line, should now be sequence
            if not words:
                # Should be sequence here, but perhaps its a short one?
                if (lengths[idx] < aln_length
                        and len("".join(seqs[idx])) == lengths[idx]):
                    # Is this actually allowed in the format? Personally I would
                    # expect a line with name and a block of trailing ~ here.
                    pass
                else:
                    raise ValueError("Expected sequence for %s, got: %r" %
                                     (name, line))
            elif words[0] == name:
                if len(words) < 2:
                    # Name with no residues; raise rather than assert so the
                    # check also runs under ``python -O``.
                    raise ValueError("Expected sequence for %r, got: %r" %
                                     (name, line))
                seqs[idx].extend(words[1:])
            else:
                raise ValueError("Expected sequence for %r, got: %r" %
                                 (name, line))
        # TODO - check the sequence lengths thus far are consistent
        # with blocks of 50?
        completed_length += 50
        line = handle.readline()
        if line.strip():
            raise ValueError("Expected blank line, got: %r" % line)

    # Skip over any whitespace at the end...
    while True:
        line = handle.readline()
        if not line:
            # End of file, no more alignments
            break
        elif not line.strip():
            # Blank line, ignore
            pass
        elif line.strip().split()[0] in known_headers:
            # Looks like the start of another alignment:
            self._header = line
            break
        else:
            raise ValueError(
                "Unexpected line after GCG MSF alignment: %r" % line)

    # Combine list of strings into single string, remap gaps
    seqs = ["".join(s).replace("~", "-").replace(".", "-") for s in seqs]

    # Apply any trailing padding for short sequences
    padded = False
    for idx, (length, s) in enumerate(zip(lengths, seqs)):
        if len(s) < aln_length and len(s) == length:
            padded = True
            seqs[idx] = s + "-" * (aln_length - len(s))
    if padded:
        import warnings
        from Bio import BiopythonParserWarning

        warnings.warn(
            "One of more alignment sequences were truncated and have been gap padded",
            BiopythonParserWarning,
        )

    records = (SeqRecord(
        Seq(s),
        id=i,
        name=i,
        description=i,
        annotations={"weight": w},
    ) for (i, s, w) in zip(ids, seqs, weights))

    # This will check alignment lengths are self-consistent:
    align = MultipleSeqAlignment(records)
    # Check matches the header:
    if align.get_alignment_length() != aln_length:
        raise ValueError(
            "GCG MSF headers said alignment length %i, but have %i" %
            (aln_length, align.get_alignment_length()))
    return align
class sequence_set(object):
    """Sequences, a reference sequence, and data derived from their alignment.

    The class loads sequences from parsed JSON, aligns them with mafft (or
    restores an alignment from disk), and offers filtering, translation and
    diversity statistics on top of that alignment.
    """

    def __init__(self, logger, sequences, reference, dateFormat):
        """Build SeqRecords for all sequences and for the reference.

        Args:
            logger: object whose ``notify``/``warn``/``fatal`` methods are
                called by the other methods of this class.
            sequences: mapping of name -> dict with keys ``"seq"`` and
                ``"attributes"`` (the attributes must hold ``"raw_date"``).
            reference: dict with keys ``"included"``, ``"strain"``, ``"seq"``
                and optionally ``"genes"`` (name -> dict with ``"start"``,
                ``"end"`` and ``"strand"``).
            dateFormat: format handed through to ``parse_date``.
        """
        super(sequence_set, self).__init__()
        self.log = logger

        # load sequences from the (parsed) JSON - don't forget to sort out dates
        self.seqs = {}
        for name, data in sequences.items():
            self.seqs[name] = SeqRecord(Seq(data["seq"], generic_dna),
                                        id=name, name=name, description=name)
            self.seqs[name].attributes = data["attributes"]
            # tidy up dates
            date_struc = parse_date(self.seqs[name].attributes["raw_date"],
                                    dateFormat)
            self.seqs[name].attributes["num_date"] = date_struc[1]
            self.seqs[name].attributes["date"] = date_struc[2]

        # if the reference is to be analysed it'll already be in the
        # (filtered & subsampled) sequences, so no need to add it here, and no
        # need to care about attributes etc
        # we do, however, need it for alignment
        self.reference_in_dataset = reference["included"]
        name = reference["strain"]
        self.reference_seq = SeqRecord(Seq(reference["seq"], generic_dna),
                                       id=name, name=name, description=name)
        if "genes" in reference and len(reference["genes"]):
            self.proteins = {
                k: FeatureLocation(start=v["start"], end=v["end"],
                                   strand=v["strand"])
                for k, v in reference["genes"].items()
            }
        else:
            self.proteins = None

        # other things:
        self.run_dir = '_'.join([
            'temp',
            time.strftime('%Y%m%d-%H%M%S', time.gmtime()),
            str(random.randint(0, 1000000)),
        ])
        self.nthreads = 2  # should load from config file

    def convert_trait_to_numerical_date(self, trait, dateFormat):
        """Replace attribute ``trait`` of every sequence by its numerical date.

        Sequences lacking the attribute are reported through the logger and
        left untouched.
        """
        for name, seq in self.seqs.items():
            try:
                date_struc = parse_date(seq.attributes[trait], dateFormat)
                seq.attributes[trait] = date_struc[1]
            except KeyError:
                self.log.warn(
                    "Attribute {} not found for sequence {}. Ignoring".format(
                        trait, seq.name))

    def codon_align(self):
        """Placeholder: only reports that codon alignment is not implemented."""
        self.log.fatal("Codon align not yet implemented")

    def align(self, fname, debug=False):
        '''
        align sequences using mafft

        side-effects:
            self.aln {MultipleSeqAlignment} reference not present if not in self.seqs
            self.reference_aln {SeqRecord} always set, even if the reference is subsequently discarded
            self.sequence_lookup {dict} map linking seq.id to the alignment, potentially without the reference
            saves the alignment (always including reference) to fname

        The working directory is changed to self.run_dir while mafft runs and
        back to ".." afterwards; the run directory is removed unless debug.
        '''
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        if self.reference_in_dataset:
            out_seqs = list(self.seqs.values())
        else:
            self.log.notify("Adding reference for alignment step")
            out_seqs = list(self.seqs.values()) + [self.reference_seq]

        SeqIO.write(out_seqs, "temp_in.fasta", "fasta")
        self.log.notify("Running alignment")
        os.system("mafft --anysymbol --thread " + str(self.nthreads) +
                  " temp_in.fasta 1> temp_out.fasta 2>mafft_stderr")
        self.aln = AlignIO.read('temp_out.fasta', 'fasta')
        os.chdir("..")
        os.rename(os.path.join(self.run_dir, "temp_out.fasta"), fname)
        if not debug:
            remove_dir(self.run_dir)

        self.set_reference_alignment()
        if not self.reference_in_dataset:
            self.remove_reference_from_alignment()
        self.set_sequence_lookup()
        self.add_attributes_to_aln()

    def remove_reference_from_alignment(self):
        """Drop the record named like the reference from ``self.aln``."""
        count = len(self.aln)
        self.aln = MultipleSeqAlignment(
            [s for s in self.aln if s.name != self.reference_seq.name])
        # exactly one record (the reference) must have been removed
        assert (count == (len(self.aln) + 1))

    def set_reference_alignment(self):
        """Store the aligned reference record in ``self.reference_aln``.

        Raises:
            IndexError: if no record in ``self.aln`` has the reference name.
        """
        self.reference_aln = [
            x for x in list(self.aln) if x.name == self.reference_seq.name
        ][0]

    def set_sequence_lookup(self):
        """Build ``self.sequence_lookup``: record id -> aligned record."""
        self.sequence_lookup = {seq.id: seq for seq in self.aln}

    def add_attributes_to_aln(self):
        """Copy the ``attributes`` of every input sequence onto its aligned record."""
        for seqid, seq in self.seqs.items():
            self.sequence_lookup[seqid].attributes = seq.attributes

    def try_restore_align_from_disk(self, fname):
        """Try to load a previously saved alignment from ``fname``.

        On any problem (file unreadable, reference missing, different set of
        sequence ids) the partially restored attributes are deleted again and
        the method returns silently so that the caller can re-align.
        """
        try:
            self.aln = AlignIO.read(fname, "fasta")
        except IOError:
            return
        except Exception as e:
            self.log.notify("Error restoring from alignment... re-doing")
            print(e)
            return

        try:
            self.set_reference_alignment()
        except IndexError:
            self.log.notify("Reference not found in alignment... re-doing")
            del self.aln
            return

        if not self.reference_in_dataset:
            self.remove_reference_from_alignment()

        # the ids on disk must match the current sequence set exactly
        if len({x.id for x in self.aln} ^ set(self.seqs.keys())) != 0:
            self.log.notify(
                "Alignment on disk had different sequnces... re-doing")
            del self.aln
            del self.reference_aln
            return

        # at this stage we are happy with the alignment
        self.set_sequence_lookup()
        self.add_attributes_to_aln()
        self.log.notify("Alignment restored from disk")

    def strip_non_reference(self):
        """Remove all alignment columns in which the reference has a gap."""
        ungapped = np.array(self.reference_aln) != '-'
        for seq in self.aln:
            seq.seq = Seq("".join(np.array(seq)[ungapped]))

    def remove_terminal_gaps(self):
        """Replace leading and trailing runs of '-' in every sequence by 'N'."""
        for seq in self.aln:
            seq_array = np.array(seq)
            seq_string = str(seq.seq)
            if (seq_array == '-').sum():
                left_gaps = len(seq_string) - len(seq_string.lstrip('-'))
                seq_array[:left_gaps] = 'N'
            if (seq_array == '-').sum():
                right_gaps = len(seq_string) - len(seq_string.rstrip('-'))
                # guard: ``[-0:]`` would select the whole array
                if right_gaps:
                    seq_array[-right_gaps:] = 'N'
            seq.seq = Seq("".join(seq_array))

    def translate(self):
        '''
        make alignment of translations

        Fills self.translations with one MultipleSeqAlignment per entry of
        self.proteins. When no proteins are annotated a single feature 'cds'
        spanning the whole alignment is used.
        '''
        self.translations = {}
        # __init__ stores None when the reference has no gene annotation, so
        # both "missing" and "None" must end up as an empty dictionary
        if getattr(self, "proteins", None) is None:
            self.proteins = {}

        # add a default translation of the entire sequence unless otherwise specified
        if not self.proteins:
            self.proteins.update({
                'cds': FeatureLocation(start=0,
                                       end=self.aln.get_alignment_length(),
                                       strand=1)
            })

        for prot in self.proteins:
            aa_seqs = []
            for seq in self.aln:
                # extraction happens outside the try block so that the
                # fallback below always works on the record of *this* sequence
                tmpseq = self.proteins[prot].extract(seq)
                try:
                    # soon not needed as future biopython version will translate --- into -
                    tmpseq.attributes = seq.attributes
                    internal_gap = np.unique(
                        np.where(np.array(tmpseq) == '-')[0] // 3)
                    aa_seq = np.array(
                        Seq(str(tmpseq.seq).replace('---', 'NNN')).translate())
                    aa_seq[internal_gap] = '-'
                    tmpseq.seq = Seq("".join(aa_seq))
                except Exception:
                    # best effort: mask everything that is not ACGT and translate
                    tmpseq.seq = Seq("".join([
                        x if x in 'ACGT' else 'N' for x in str(tmpseq.seq)
                    ])).translate()
                    print("Trouble translating", seq.id)
                aa_seqs.append(tmpseq)
            self.translations[prot] = MultipleSeqAlignment(aa_seqs)

    def clock_filter(self, root_seq=None, n_iqd=3, max_gaps=1.0, plot=False):
        '''
        remove sequences from the set that evolve much faster or slower
        compared to the majority. Regions with predominantly gaps can be
        removed since this can skew the evolutionary rates.

        root_seq: None (use the consensus), the id of an aligned sequence, or
                  an array to compare against
        n_iqd:    sequences whose regression residual exceeds n_iqd
                  interquartile distances are dropped
        max_gaps: columns with a gap frequency of max_gaps or more are ignored
        plot:     scatter-plot date vs distance with matplotlib
        '''
        if root_seq is None:  # use consensus
            af = calc_af(self.aln, nuc_alpha)
            root_seq = np.fromstring(nuc_alpha, 'S1')[af.argmax(axis=0)]
        if isinstance(root_seq, str) and root_seq in self.sequence_lookup:
            root_seq = np.array(self.sequence_lookup[root_seq])

        if max_gaps < 1.0:
            af = calc_af(self.aln, nuc_alpha)
            good_pos = af[nuc_alpha.index('-')] < max_gaps
        else:
            good_pos = np.ones(self.aln.get_alignment_length(), dtype=bool)

        # id -> (numerical date, mean distance to root over usable positions)
        date_vs_distance = {}
        for seq in self.aln:
            seq_array = np.array(seq)
            usable = (seq_array != '-') & (root_seq != '-') & good_pos
            date_vs_distance[seq.id] = (
                seq.attributes['num_date'],
                np.mean((seq_array != root_seq)[usable]))
        date_vs_distance_array = np.array(list(date_vs_distance.values()))

        from scipy.stats import linregress, scoreatpercentile
        slope, intercept, rval, pval, stderr = linregress(
            date_vs_distance_array[:, 0], date_vs_distance_array[:, 1])
        print("distance vs time regression:", slope)
        residuals = (intercept + slope * date_vs_distance_array[:, 0]
                     ) - date_vs_distance_array[:, 1]
        IQD = scoreatpercentile(residuals, 75) - scoreatpercentile(
            residuals, 25)

        if plot:
            import matplotlib.pyplot as plt
            plt.ion()
            plt.scatter(date_vs_distance_array[:, 0],
                        date_vs_distance_array[:, 1], c='g')
            bad_points = abs(intercept +
                             slope * date_vs_distance_array[:, 0] -
                             date_vs_distance_array[:, 1]) > n_iqd * IQD
            plt.scatter(date_vs_distance_array[bad_points, 0],
                        date_vs_distance_array[bad_points, 1], c='r')

        print("before clock filter:", len(self.aln))
        tmp = {
            seq.id: seq
            for seq in self.aln
            if abs(intercept + slope * date_vs_distance[seq.id][0] -
                   date_vs_distance[seq.id][1]) < n_iqd * IQD
        }
        # the class stores the reference as ``reference_seq`` and the flag as
        # ``reference_in_dataset``; there is no ``self.reference`` attribute
        if self.reference_seq.id not in tmp and self.reference_in_dataset:
            self.log.notify('adding reference again after clock filter')
            tmp[self.reference_seq.id] = self.reference_aln
        self.aln = MultipleSeqAlignment(list(tmp.values()))
        print("after clock filter:", len(self.aln))

    def diversity_statistics(self):
        '''
        calculate alignment entropy of nucleotide and optionally protein alignments

        Results are stored in self.af (allele frequencies) and self.entropy,
        keyed by 'nuc' and by protein name.
        '''
        if not hasattr(self, "aln"):
            self.log.fatal(
                "Diversity statistics calculated before alignment generated.")
            return
        self.af = {'nuc': calc_af(self.aln, nuc_alpha)}
        # the last two rows of the frequency table are left out and the rest
        # is renormalised  # NOTE(review): presumably gap/ambiguous states - confirm
        tmp_af = self.af['nuc'][:-2] / self.af['nuc'][:-2].sum(axis=0)
        self.entropy = {'nuc': -(tmp_af * np.log(tmp_af + TINY)).sum(axis=0)}

        if hasattr(self, "translations"):
            for prot, aln in self.translations.items():
                self.af[prot] = calc_af(aln, aa_alpha)
                tmp_af = self.af[prot][:-2] / self.af[prot][:-2].sum(axis=0)
                self.entropy[prot] = -(tmp_af *
                                       np.log(tmp_af + TINY)).sum(axis=0)

    def export_diversity(self, fname='entropy.json', indent=None):
        '''
        write the alignment entropy of each alignment (nucleotide and translations) to file
        '''
        if not hasattr(self, "entropy"):
            self.diversity_statistics()
        entropy_json = {}
        for feat in self.entropy:
            S = [max(0, round(x, 4)) for x in self.entropy[feat]]
            n = len(S)
            if feat == 'nuc':
                entropy_json[feat] = {
                    'pos': list(range(0, n)),
                    'codon': [x // 3 for x in range(0, n)],
                    'val': S,
                }
            else:
                location = self.proteins[feat]
                # every third nucleotide position marks the start of a codon
                entropy_json[feat] = {
                    'pos': list(location)[::3],
                    'codon': [(x - location.start) // 3
                              for x in location][::3],
                    'val': S,
                }
        write_json(entropy_json, fname, indent=indent)
class DisplayedAlignment(object):
    """
    Provides tools for displaying and manipulating an alignment and storing
    all previous versions
    """

    def __init__(self, alignment):
        """Wrap ``alignment`` and start the change history with a copy of it."""
        self.displayedColumn = 0
        self.alignment = alignment
        self.alignmentHistory = [alignment[:, :]]
        self.changed = False
        self.translated = False
        self.translationTable = 1

    def _ParseRange(self, field, start, stop, label):
        """Parse one ``start:stop`` (or single index) field of an index string.

        ``start`` and ``stop`` are the defaults used for omitted bounds and
        ``label`` ('taxon' or 'column') is used in the alert text. Returns
        ``(start, stop)`` or ``None`` after alerting about a non-integer.
        """
        if ':' in field:  # a range is specified
            bounds = field.split(':')
            if bounds[0]:  # a start index is specified
                try:
                    start = int(bounds[0].strip())
                except ValueError:
                    self.AlertMessage(
                        'Invalid index format. (%s start index not an integer)'
                        % label, 'high')
                    return None
            if bounds[1]:  # a stop index is specified
                try:
                    stop = int(bounds[1].strip())
                except ValueError:
                    self.AlertMessage(
                        'Invalid index format. (%s stop index not an integer)'
                        % label, 'high')
                    return None
        elif field:  # a single index is specified
            try:
                start = stop = int(field.strip())
            except ValueError:
                self.AlertMessage(
                    'Invalid index format. '
                    '(%s start or stop index not an integer)' % label, 'high')
                return None
        return (start, stop)

    def ParseIndex(self, text):
        """
        Parses a text string specifying a range of rows (taxa) and columns.
        Expects the text to be in the format used to specify a range from a
        Bio.Align.MultipleSeqAlignment, i.e. ``taxa,columns`` where each part
        is empty (everything), a single index or ``start:stop``.

        Returns indices for the start and stop taxon and the start and stop
        columns (both stops inclusive), or (-1, -1, -1, -1) after alerting the
        user when the text is invalid.
        """
        invalid = (-1, -1, -1, -1)
        if ',' not in text:
            self.AlertMessage(
                'Invalid index format. (taxa or columns missing)', 'high')
            return invalid

        indices = text.strip().split(',')
        if len(indices) > 2:
            self.AlertMessage(
                'Invalid index format. (too many fields)', 'high')
            return invalid

        # omitted bounds default to the full extent of the alignment
        taxa = self._ParseRange(indices[0], 0, len(self.alignment) - 1,
                                'taxon')
        if taxa is None:
            return invalid
        columns = self._ParseRange(
            indices[1], 0, self.alignment.get_alignment_length() - 1, 'column')
        if columns is None:
            return invalid

        taxonStart, taxonStop = taxa
        columnStart, columnStop = columns
        if (0 <= taxonStart <= taxonStop) and (0 <= columnStart <= columnStop):
            return (taxonStart, taxonStop, columnStart, columnStop)
        self.AlertMessage(
            'Invalid index range. (start > stop or index < 0)', 'high')
        return invalid

    def ColorizeDNA(self, text):
        """
        Colorizes output based on nucleotide; other characters are returned
        unchanged
        """
        if text == 'A':
            escape = '\033[92m'  # Green
        elif text == 'G':
            escape = '\033[93m'  # Yellow
        elif text == 'T':
            escape = '\033[91m'  # Red
        elif text == 'C':
            escape = '\033[96m'  # Blue
        else:
            return text
        return escape + text + '\033[0m'

    def ColorizeAA(self, text):
        """
        Colorize output based on amino acid polarity or nonpolarity; other
        characters are returned unchanged
        """
        if text in ['A', 'F', 'H', 'I', 'K', 'L', 'M', 'P', 'R', 'V', 'W']:
            escape = '\033[91m'  # Red
        elif text in ['C', 'G', 'N', 'Q', 'S', 'T', 'Y', 'B', 'Z']:
            escape = '\033[96m'  # Blue
        elif text in ['D', 'E']:
            escape = '\033[92m'  # Green
        elif text in ['X', '*']:
            escape = '\033[93m'  # Yellow
        else:
            return text
        return escape + text + '\033[0m'

    def AlertMessage(self, text, severity='low'):
        """
        Display an alert message with a tag and color corresponding to the
        severity of the alert ('low', 'medium', 'high')
        """
        if severity == 'high':
            escape = '\033[91m'  # Red
            tag = '!!!'
        elif severity == 'medium':
            escape = '\033[93m'  # Yellow
            tag = '***'
        else:
            escape = '\033[92m'  # Green
            tag = ' '
        # single-argument print gives the same output on Python 2 and 3
        print('%s %s %s' % (escape + tag, text, tag + '\033[0m'))

    def Show(self, column=0):
        """
        Displays 100 columns of the alignment beginning at 'column'
        (100 codons, i.e. 300 nucleotide columns, when translated)
        """
        if column < 0:
            column = 0
        row = 0
        marker = '| : ' * 10
        spacer = ' ' * 15
        markerRow = spacer + marker
        alignmentLength = self.alignment.get_alignment_length()

        if not self.translated:
            indexRow = spacer
            for index in range(column, column + 100, 10):
                indexRow = indexRow + str(index).ljust(10)
            print(indexRow)
            print(markerRow)
            for sequence in self.alignment[:, column:column + 100]:
                dnaSequence = ''.join(
                    self.ColorizeDNA(nucleotide)
                    for nucleotide in str(sequence.seq))
                line = '%2d) %10s' % (row, sequence.id) + ' ' + dnaSequence
                if column + 100 < alignmentLength:
                    line += ' ...'  # more columns to the right
                print(line)
                row += 1
            print(markerRow)
            print(indexRow)
        else:
            indexRow = spacer
            # floor division: the index row counts codons, not nucleotides
            for index in range(column // 3, (column // 3) + 100, 10):
                indexRow = indexRow + str(index).ljust(10)
            print(indexRow)
            print(markerRow)
            for sequence in self.alignment[:, column:column + 300]:
                proteinSequence = ''
                for codonPosition in range(0, len(sequence), 3):
                    codon = sequence.seq[codonPosition:codonPosition + 3]
                    if str(codon) == '---':
                        proteinSequence += '-'
                    elif '-' in codon:
                        # partially gapped codon cannot be translated
                        proteinSequence += '?'
                    else:
                        proteinSequence += self.ColorizeAA(
                            str(codon.translate(table=self.translationTable)))
                line = '%2d) %10s %s' % (row, sequence.id, proteinSequence)
                if column + 300 < alignmentLength:
                    line += ' ...'
                print(line)
                row += 1
            print(markerRow)
            print(indexRow)
        self.displayedColumn = column

    def BackupAlignment(self):
        """
        Stores the current alignment state to the alignment change history
        """
        self.alignmentHistory.append(self.alignment[:, :])

    def UndoChanges(self):
        """
        Reverts to the previous state in the alignment change history.
        Does not affect which column index is displayed or whether the
        sequence is displayed as translated or not since those are not
        changes to the alignment.
        """
        if len(self.alignmentHistory) > 1:
            self.alignmentHistory.pop()
            self.alignment = self.alignmentHistory[-1][:, :]
            self.Show(self.displayedColumn)
        else:
            self.AlertMessage('Nothing to undo.', 'low')

    def DeleteRange(self, rangeText, silent=False):
        """
        Removes a row and column range from the alignment.

        If the column range spans the full width the selected taxa are
        removed. If only some taxa are affected, gaps are appended to them so
        that all rows keep the same length. 'silent' suppresses the redisplay
        (the change is still added to the history).
        """
        startTaxon, stopTaxon, startColumn, stopColumn = self.ParseIndex(
            rangeText)
        if self.translated:
            # indices were given in codons
            startColumn = startColumn * 3
            stopColumn = (stopColumn * 3) + 2

        if startTaxon >= 0:  # Make sure we had a valid range
            changeLength = 0
            deleteTaxon = False
            if (startColumn == 0 and
                    stopColumn == self.alignment.get_alignment_length() - 1):
                deleteTaxon = True
            if startTaxon > 0 or stopTaxon < len(self.alignment) - 1:
                changeLength = (stopColumn - startColumn) + 1

            newSequences = []
            for taxon, Sequence in enumerate(self.alignment):
                if startTaxon <= taxon <= stopTaxon:
                    if not deleteTaxon:
                        if startColumn > 0:
                            Sequence.seq = (Sequence.seq[:startColumn] +
                                            Sequence.seq[stopColumn + 1:])
                        else:
                            Sequence.seq = Sequence.seq[stopColumn + 1:]
                        if changeLength:
                            Sequence.seq = (Sequence.seq +
                                            Seq('-' * changeLength))
                        newSequences.append(Sequence)
                else:
                    newSequences.append(Sequence)

            self.alignment = MultipleSeqAlignment(newSequences)
            if not silent:
                self.Show(self.displayedColumn)
            self.BackupAlignment()

    def ModifyRange(self, rangeText, nucleotide='-'):
        """
        Changes the nucleotides in a row and column range to a specified
        nucleotide. Has no effect when the alignment is translated since the
        corresponding change to the underlying nucleotide alignment would be
        ambiguous at best.
        """
        nucleotide = nucleotide.upper()
        if self.translated:
            self.AlertMessage("Can't modify protein sequences.", 'medium')
        elif nucleotide not in ['A', 'G', 'C', 'T', 'R', 'K', 'S', 'W', 'M',
                                'Y', 'D', 'V', 'B', 'H', 'N', '-']:
            self.AlertMessage(
                'Invalid nucleotide. '
                '(only AGTC- and IUB nucleotide codes are permitted)', 'high')
        else:
            startTaxon, stopTaxon, startColumn, stopColumn = self.ParseIndex(
                rangeText)
            if startTaxon >= 0:  # Make sure we have a valid range
                newSequences = []
                modificationLength = (stopColumn - startColumn) + 1
                replacement = Seq(nucleotide * modificationLength)
                for taxon, Sequence in enumerate(self.alignment):
                    if startTaxon <= taxon <= stopTaxon:
                        if startColumn > 0:
                            Sequence.seq = (Sequence.seq[:startColumn] +
                                            replacement +
                                            Sequence.seq[stopColumn + 1:])
                        else:
                            Sequence.seq = (replacement +
                                            Sequence.seq[stopColumn + 1:])
                    newSequences.append(Sequence)
                self.alignment = MultipleSeqAlignment(newSequences)
                self.Show(self.displayedColumn)
                self.BackupAlignment()

    def InsertRange(self, rangeText):
        """
        Inserts a row and column range into the alignment and fills it with
        gaps ('-' for nucleotides or '---' for translated alignments).
        Taxa outside the range get the same number of gaps appended.
        """
        startTaxon, stopTaxon, startColumn, stopColumn = self.ParseIndex(
            rangeText)
        if self.translated:
            startColumn = startColumn * 3
            stopColumn = (stopColumn * 3) + 2

        if startTaxon >= 0:  # Make sure we had a valid range
            changeLength = (stopColumn - startColumn) + 1
            newSequences = []
            for taxon, Sequence in enumerate(self.alignment):
                if startTaxon <= taxon <= stopTaxon:
                    if startColumn > 0:
                        Sequence.seq = (Sequence.seq[:startColumn] +
                                        Seq('-' * changeLength) +
                                        Sequence.seq[startColumn:])
                    else:
                        Sequence.seq = Seq('-' * changeLength) + Sequence.seq[:]
                else:
                    Sequence.seq = Sequence.seq + Seq('-' * changeLength)
                newSequences.append(Sequence)
            self.alignment = MultipleSeqAlignment(newSequences)
            self.Show(self.displayedColumn)
            self.BackupAlignment()

    def Jump(self, column):
        """
        Moves the displayed column to a specified column index
        """
        if self.translated:
            column = column * 3
        self.Show(column)

    def ScrollRight(self, offset=100):
        """
        Scroll the display 'offset' columns to the right
        """
        if self.translated:
            offset = offset * 3
        self.Show(self.displayedColumn + offset)

    def ScrollLeft(self, offset=100):
        """
        Scroll the display 'offset' columns to the left
        """
        if self.translated:
            offset = offset * 3
        self.Show(self.displayedColumn - offset)

    def Reverse(self):
        """
        Reverses the order of the columns in the alignment. Has no effect on
        translated sequences.
        """
        if not self.translated:
            self.alignment = self.alignment[:, ::-1]
            self.Show(self.displayedColumn)
            self.BackupAlignment()
        else:
            self.AlertMessage("Can't reverse protein sequences.", 'medium')

    def Complement(self):
        """
        Give the complement of the alignment. Has no effect on translated
        sequences.
        """
        if not self.translated:
            for i in range(len(self.alignment)):
                self.alignment[i].seq = self.alignment[i].seq.complement()
            self.Show(self.displayedColumn)
            # must be *called*; a bare attribute reference stores nothing
            self.BackupAlignment()
        else:
            self.AlertMessage("Can't complement protein sequences.", 'medium')

    def ReverseComplement(self):
        """
        Reverse and complement the alignment. Has no effect on translated
        sequences.
        """
        if not self.translated:
            for i in range(len(self.alignment)):
                self.alignment[i].seq = \
                    self.alignment[i].seq.reverse_complement()
            self.Show(self.displayedColumn)
            self.BackupAlignment()
        else:
            self.AlertMessage(
                "Can't reverse-complement protein sequences.", 'medium')

    def Translate(self, translationTable=11):
        """
        Switch to displaying and manipulating the sequence as a protein
        sequence. Still works on translated sequences if a different
        translation table is specified, otherwise it back-translates
        translated sequences.
        """
        if (not self.translated) or (self.translationTable != translationTable):
            self.translated = True
            self.translationTable = translationTable
            # snap the display to the start of a codon
            self.displayedColumn = self.displayedColumn - (
                self.displayedColumn % 3)
            self.Show(self.displayedColumn)
        else:
            self.BackTranslate()

    def BackTranslate(self):
        """
        Revert to displaying and manipulating the sequence as a dna sequence.
        Has no effect if the sequence is already dna.
        """
        if self.translated:
            self.translated = False
            self.Show(self.displayedColumn)
        else:
            self.AlertMessage(
                "Can't back-translate. Alignment contains DNA sequences",
                'medium')

    def Save(self, fileName='alignment.phy', alignmentFormat='phylip'):
        """
        Write alignment to disk
        """
        AlignIO.write(self.alignment, fileName, alignmentFormat)
        self.AlertMessage(
            'Saved alignment to ' + fileName + ' in ' + alignmentFormat +
            ' format.', 'low')

    def CleanUp(self):
        """
        Condense the alignment by removing any columns that contain gaps
        ('-') in all taxa.
        """
        blankColumnPattern = re.compile('^-*$')
        blankColumns = []
        # every column is inspected, including the last one
        for columnIndex in range(self.alignment.get_alignment_length()):
            columnValues = self.alignment[:, columnIndex]
            if blankColumnPattern.search(columnValues):
                blankColumns.append(str(columnIndex))
        # delete from the right so that earlier indices stay valid
        # NOTE(review): DeleteRange scales column indices by 3 while
        # self.translated is set and adds a history entry per deleted column
        # - confirm that this is intended when cleaning up.
        for column in blankColumns[::-1]:
            self.DeleteRange(',' + str(column), True)
        self.Show(self.displayedColumn)
        self.BackupAlignment()
class GenericAlign(object):
    """Holds an alignment read from ``input`` plus trimmed versions of it.

    ``alignment`` is the alignment as read, ``trimmed_alignment`` the result
    of ``trim_alignment`` and ``perfect_trimmed_alignment`` the result of
    ``trim_ambiguous_bases``; each is ``None`` until computed (or when the
    respective step dropped the alignment).
    """

    def __init__(self, input):
        self.input = input
        self.alignment = None
        self.trimmed_alignment = None
        self.perfect_trimmed_alignment = None

    def _clean(self, outtemp):
        """Remove the temp file(s) in ``outtemp`` and, best effort, the input."""
        if isinstance(outtemp, list):
            for f in outtemp:
                os.remove(f)
        else:
            os.remove(outtemp)
        # cleanup temp file
        try:
            os.remove(self.input)
        except (OSError, TypeError):
            # input already gone or not a path at all - nothing to clean
            pass

    def _find_ends(self, forward=True):
        """determine the first (or last) position where all reads in an
        alignment start/stop matching, i.e. the first column scanned from the
        chosen end that contains no gap"""
        if forward:
            theRange = range(self.alignment.get_alignment_length())
        else:
            theRange = reversed(range(self.alignment.get_alignment_length()))
        # NOTE(review): when every column holds a gap this returns the last
        # column scanned rather than signalling "not found".
        for col in theRange:
            if '-' not in self.alignment.get_column(col):
                break
        return col

    def _base_checker(self, bases, sequence, loc):
        """ensure that any trimming that occurs does not start beyond the
        end of the sequence being trimmed

        ``loc`` is either a single position (an integer or a one-element
        sequence) or a ``(start, end)`` pair. Returns True when at least
        ``bases`` positions exist both before ``start`` and from ``end`` on.
        """
        # deal with the case where we just want to measure out from the
        # middle of a particular sequence
        try:
            n_bounds = len(loc)
        except TypeError:
            # a bare integer position (e.g. the alignment midpoint)
            loc = (loc, loc)
        else:
            if n_bounds == 1:
                loc = (loc[0], loc[0])
        return (not bases > len(sequence.seq[:loc[0]]) and
                not bases > len(sequence.seq[loc[1]:]))

    def _record_formatter(self, temp):
        """return ``temp`` wrapped in a biopython sequence record"""
        temp_record = SeqRecord(temp)
        return temp_record

    def _alignment_summary(self, alignment):
        """return summary data for an alignment object using the AlignInfo
        class from BioPython"""
        summary = AlignInfo.SummaryInfo(alignment)
        consensus = summary.dumb_consensus()
        return summary, consensus

    def _read(self, format):
        """read an alignment from the CLI - largely for testing purposes"""
        # context manager so the handle is closed once the alignment is read
        with open(self.input, 'rU') as handle:
            self.alignment = AlignIO.read(handle, format)

    def get_probe_location(self):
        '''Pull the probe sequence from an alignment object and determine its
        position within the read.

        Sets self.ploc to (end of the leading gap run, start of the trailing
        gap run) of the record whose id is 'probe'.'''
        # probe at bottom => reverse order
        for record in self.alignment[::-1]:
            if record.id == 'probe':
                start = re.search('^-*', str(record.seq))
                end = re.search('-*$', str(record.seq))
                # should be first record
                break
        # ooh, this seems so very backwards
        self.ploc = (start.end(), end.start(),)

    def running_average(self, window_size, threshold, proportion=0.3, k=None,
                        running_probe=False):
        """Find the region in which a running average of column identity
        reaches ``threshold``.

        Each column scores 1 when all (considered) rows agree and 0
        otherwise. Without ``running_probe`` gaps are ignored in columns
        holding at most ``round(proportion * rows)`` gaps; with
        ``running_probe`` row ``k`` (the probe) is left out instead.

        Returns ``(start_clip, end_clip)`` or ``(None, None)`` when no window
        reaches the threshold.
        """
        # iterate across the columns of the alignment and determine presence
        # or absence of base-identity in the column
        differences = []
        members = len(self.alignment)
        if not running_probe:
            # use proportional removal of gaps
            max_gaps = int(round(proportion * members, 0))
            for column in range(self.alignment.get_alignment_length()):
                column_list = list(self.alignment[:, column])
                if column_list.count('-') <= max_gaps:
                    column_list = [i for i in column_list if i != '-']
                # converting to a set leaves only the unique values
                differences.append(0 if len(set(column_list)) > 1 else 1)
        else:
            for column in range(self.alignment.get_alignment_length()):
                column_values = list(self.alignment[:, column])
                # drop the index of the probe from the column_values
                del column_values[k]
                differences.append(0 if len(set(column_values)) > 1 else 1)

        differences = numpy.array(differences)
        weight = numpy.repeat(1.0, window_size) / window_size
        full = numpy.convolve(differences, weight)
        # keep only fully overlapping windows; the end is written as an
        # explicit index because ``-(window_size - 1)`` is ``-0`` for
        # window_size == 1 and would slice everything away
        running_average = full[window_size - 1:len(full) - (window_size - 1)]
        good = numpy.where(running_average >= threshold)[0]
        # remember to add window size onto end of trim
        try:
            start_clip, end_clip = good[0], good[-1] + window_size
        except IndexError:
            start_clip, end_clip = None, None
        return start_clip, end_clip

    def trim_alignment(self, method='edges', remove_probe=None, bases=None,
                       consensus=True, window_size=20, threshold=0.5,
                       proportion=0.3):
        """Trim the alignment and store the result in self.trimmed_alignment.

        method is one of 'edges', 'running', 'running-probe', 'static' or
        'notrim'. self.trimmed_alignment becomes None when the alignment
        cannot be trimmed as requested; a message is printed in that case.
        With ``consensus`` a summary and dumb consensus of the trimmed
        alignment are stored as well.
        """
        if method == 'edges':
            # find edges of the alignment
            start = self._find_ends(forward=True)
            end = self._find_ends(forward=False)
        elif method == 'running':
            start, end = self.running_average(window_size, threshold,
                                              proportion=proportion)
        elif method == 'running-probe':
            # get position of probe
            for k, v in enumerate(self.alignment):
                if v.name == 'probe':
                    break
            start, end = self.running_average(window_size, threshold,
                                              proportion, k, True)

        if method == 'notrim':
            self.trimmed_alignment = self.alignment
        else:
            # create a new alignment object to hold our alignment
            self.trimmed_alignment = MultipleSeqAlignment(
                [], Gapped(IUPAC.ambiguous_dna, "-"))
            for sequence in self.alignment:
                # ignore the probe sequence we added
                if (method in ('edges', 'running', 'running-probe')
                        and not remove_probe):
                    if start is not None and start >= 0 and end:
                        self.trimmed_alignment.append(sequence[start:end])
                    else:
                        self.trimmed_alignment = None
                        break
                elif method == 'static' and not remove_probe and bases:
                    # get middle of alignment and trim out from that - there's a
                    # weakness here in that we are not actually locating the probe
                    # region, we're just locating the middle of the alignment
                    mid_point = len(sequence) // 2
                    if self._base_checker(bases, sequence, mid_point):
                        # biopython's public append() re-validates every
                        # record, so the private record list is used directly
                        self.trimmed_alignment._records.append(
                            sequence[mid_point - bases:mid_point + bases])
                    else:
                        self.trimmed_alignment = None
                        # stop: the next iteration would dereference None
                        break
                elif (method == 'static' and not remove_probe and bases
                        and self.ploc):
                    # NOTE(review): unreachable - the previous branch already
                    # matches every case that satisfies this condition.
                    if self._base_checker(bases, sequence, self.ploc):
                        self.trimmed_alignment._records.append(
                            sequence[self.ploc[0] - bases:
                                     self.ploc[1] + bases])
                    else:
                        self.trimmed_alignment = None
                        break
                elif remove_probe and self.ploc:
                    # we have to drop to sequence level to add sequence slices
                    # where we basically slice around the probes location
                    temp = (sequence.seq[:self.ploc[0]] +
                            sequence.seq[self.ploc[1]:])
                    self.trimmed_alignment._records.append(
                        self._record_formatter(temp))
                elif (method == 'static' and remove_probe and bases
                        and self.ploc):
                    # NOTE(review): unreachable - shadowed by the
                    # ``remove_probe and self.ploc`` branch above.
                    if self._base_checker(bases, sequence, self.ploc):
                        temp = (
                            sequence.seq[self.ploc[0] - bases:self.ploc[0]] +
                            sequence.seq[self.ploc[1]:self.ploc[1] + bases])
                        self.trimmed_alignment._records.append(
                            self._record_formatter(temp))
                    else:
                        self.trimmed_alignment = None
                        break

        # build a dumb consensus
        if consensus and self.trimmed_alignment:
            self.trimmed_alignment_summary, self.trimmed_alignment_consensus = \
                self._alignment_summary(self.trimmed_alignment)
        if not self.trimmed_alignment:
            print("\tAlignment {0} dropped due to trimming".format(
                self.alignment._records[0].description.split('|')[1]))

    def trim_ambiguous_bases(self):
        """snip ambiguous bases from a trimmed_alignment

        Stores in self.perfect_trimmed_alignment the largest chunk of
        self.trimmed_alignment between two columns containing 'N' (or the
        whole trimmed alignment when no column contains 'N').
        """
        ambiguous_bases = []
        # do this by finding all ambiguous bases and then snipping the largest
        # chunk with no ambiguous bases from the entire alignment
        if not self.trimmed_alignment:
            self.perfect_trimmed_alignment = self.trimmed_alignment
            return

        length = self.trimmed_alignment.get_alignment_length()
        for column in range(0, length):
            if 'N' in self.trimmed_alignment[:, column]:
                ambiguous_bases.append(column)

        if not ambiguous_bases:
            self.perfect_trimmed_alignment = self.trimmed_alignment
            return

        maximum = 0
        maximum_pos = None
        # prepend and append the start and end of the sequence so consider
        # those chunks outside the stop and start of ambiguous base runs.
        # NOTE(review): with 0 and length - 1 as sentinels the first and last
        # column are never part of a chunk - confirm whether -1 and length
        # were intended.
        ambiguous_bases.insert(0, 0)
        ambiguous_bases.append(length - 1)
        # create a new alignment object to hold our alignment
        self.perfect_trimmed_alignment = \
            MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
        for pos in range(len(ambiguous_bases) - 1):
            difference = ambiguous_bases[pos + 1] - ambiguous_bases[pos]
            if difference > maximum:
                maximum = difference
                maximum_pos = (pos, pos + 1)

        # make sure we catch cases where there is not best block
        if maximum_pos:
            chunk_start = ambiguous_bases[maximum_pos[0]] + 1
            chunk_end = ambiguous_bases[maximum_pos[1]]
            for sequence in self.trimmed_alignment:
                self.perfect_trimmed_alignment.append(
                    sequence[chunk_start:chunk_end])
        else:
            self.perfect_trimmed_alignment = None
def find_hypermutants(aln, thres=-2):
    '''
    Split an alignment into RNA reads and classified DNA reads.

    Records whose id starts with "days" are treated as RNA; every other
    record is treated as DNA. RNA records must carry "frequency_<x>%" in
    their description; that percentage weights the record when the RNA
    allele frequencies are accumulated. Columns whose most frequent of the
    first four symbols of the module-level ``alpha`` exceeds 0.99 are
    considered conserved, and only differences from the RNA consensus at
    conserved, non-gap positions are counted as mutations of a DNA read.

    Parameters:
     - aln:   alignment (iterable of SeqRecord-like objects)
     - thres: unused by this routine; kept for interface compatibility

    Returns a tuple
        (RNAaln, good, hyper_mutated, suspicious, nostop, mut_hist)
    where the first five entries are MultipleSeqAlignment objects and
    mut_hist maps 'good'/'hyper'/'suspicious' to arrays with one row of
    per-mutation-type counts for every sequence in that class.
    '''
    # split by id prefix; iterate records directly so no numpy integer is
    # ever used as an alignment index
    isRNA = [seq.id[:4] == "days" for seq in aln]
    RNAaln = MultipleSeqAlignment(
        [seq.upper() for seq, rna in zip(aln, isRNA) if rna])
    DNAaln = MultipleSeqAlignment(
        [seq.upper() for seq, rna in zip(aln, isRNA) if not rna])

    # accumulate the RNA SNP frequencies to determine positions variable at
    # the RNA level; those are disregarded for the hypermutation classification
    RNAaf = np.zeros((len(alpha), RNAaln.get_alignment_length()))
    for seq in RNAaln:
        # one-character array; same dtype family as np.array(DNAaln) below
        nucs = np.array(list(str(seq.seq).upper()))
        freq = float(seq.description.split('frequency_')[1].split('%')[0]) * 0.01
        for ni, nuc in enumerate(alpha):
            RNAaf[ni, nucs == nuc] += freq
    RNAaf /= RNAaf.sum(axis=0)

    # if the maximal allele frequency is above 0.99, positions are
    # considered conserved
    conserved_pos = RNAaf[:4].max(axis=0) > 0.99
    consensus = np.array([alpha[ai] for ai in RNAaf.argmax(axis=0)])

    mut_hist = {'good': [], 'hyper': [], 'suspicious': []}
    DNAaln_array = np.array(DNAaln)
    good_seqs = []
    hyper_muts = []
    suspicious = []
    nostop = []

    # index every ordered pair of distinct symbols, e.g. 'G->A'
    mut_dict = {}
    for a in alpha:
        for b in alpha:
            if a != b:
                mut_dict[a + '->' + b] = len(mut_dict)

    for si, seq in enumerate(DNAaln):
        row = DNAaln_array[si]
        muts = (consensus != row) & conserved_pos & (row != '-')
        tmp = defaultdict(int)
        total = muts.sum()
        # sized from the table itself instead of a hard-coded 30
        mut_counts = np.zeros(len(mut_dict))
        for mi in np.where(muts)[0]:
            mut = consensus[mi] + '->' + row[mi]
            tmp[mut] += 1
            mut_counts[mut_dict[mut]] += 1

        if total < 10 and (total < 4 or tmp['G->A'] < 0.5 * total):
            good_seqs.append(seq)
            mut_hist['good'].append(mut_counts)
        elif tmp['G->A'] >= 0.5 * total:
            hyper_muts.append(seq)
            mut_hist['hyper'].append(mut_counts)
        else:
            suspicious.append(seq)
            mut_hist['suspicious'].append(mut_counts)

        # reads that translate without a stop codon from ungapped position 20 on
        if total < 20 and seq.seq.ungap('-')[20:].translate().count('*') == 0:
            nostop.append(seq)

    for k in mut_hist:
        mut_hist[k] = np.array(mut_hist[k])

    return RNAaln, MultipleSeqAlignment(good_seqs), MultipleSeqAlignment(hyper_muts),\
        MultipleSeqAlignment(suspicious), MultipleSeqAlignment(nostop), mut_hist
#print length for m in missed: if partnum == "-prot": temp.append(SeqRecord(Seq("X"*length, alphabet = generic_protein), id=m)) #add dummies else: temp.append(SeqRecord(Seq("?"*(length), Gapped(IUPAC.ambiguous_dna)), id=m)) #add dummies counter = 0 if partnum == "-prot": temp2 = MultipleSeqAlignment([], alphabet = generic_protein) else: temp2 = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna)) for aliseq in align: for tempseq in temp: if aliseq.id == tempseq.id: temp2.append(aliseq + tempseq) start = align.get_alignment_length()+1 end = align.get_alignment_length()+length prog = "working on partition "+str(fn)+": starts "+str(start)+", ends "+str(end) sys.stdout.write(prog+"\r") sys.stdout.flush() align = temp2 counter += align.get_alignment_length() if pf2opt == "-pf2y": if partnum == "-3": print >> pf2cfg, fn[:-4]+"_1 = "+str(start)+" - "+str(end)+"\\3;" print >> pf2cfg, fn[:-4]+"_2 = "+str(start+1)+" - "+str(end)+"\\3;" print >> pf2cfg, fn[:-4]+"_3 = "+str(start+2)+" - "+str(end)+"\\3;" elif partnum == "-1": print >> pf2cfg, fn[:-4]+" = "+str(start)+" - "+str(end)+";" elif partnum == "-prot": print >> pf2cfg, fn[:-4]+" = "+str(start)+" - "+str(end)+";"
# Read the names of the taxa to drop (one per line); the handle is closed
# as soon as the list has been read.
with open(args.taxa) as toRemove:
    TaxatoRemove = [line.strip("\n") for line in toRemove]

# set lookup keeps the filter linear in the number of taxa
excludedTaxa = set(TaxatoRemove)
keepers = [seq_dict[item] for item in allTaxa if item not in excludedTaxa]

# clean out gaps created by removing sequences if sequences are part of a
# multiple sequence alignment
if 'True' in args.alignment:
    rawAlignment = MultipleSeqAlignment(keepers)
    # A column is kept when fewer than (number of sequences - 2) entries are
    # gaps. Consecutive kept columns are merged into [start, end) runs so the
    # alignment is rebuilt from a few slices instead of one slice per column.
    runs = []
    for x in range(0, rawAlignment.get_alignment_length()):
        column = rawAlignment[:, x]
        if column.count("-") < (len(rawAlignment) - 2):
            if runs and runs[-1][1] == x:
                runs[-1][1] = x + 1
            else:
                runs.append([x, x + 1])
    goodColumns = [rawAlignment[:, start:end] for start, end in runs]
    # start from an empty (zero-column) alignment carrying the same records
    goodColumnsAlignment = rawAlignment[:, 0:0]
    for columnBlock in goodColumns:
        goodColumnsAlignment = goodColumnsAlignment + columnBlock
    AlignIO.write(goodColumnsAlignment, args.output, "fasta")
# otherwise, just write the sequences
else:
    SeqIO.write(keepers, args.output, "fasta")
max_gaps = gapPercent * Query_len #Select the sequences selected_sequences = [] for r in alignment: if r.seq.count('-') < max_gaps: selected_sequences += [r] print 'Selected sequences: ', len(selected_sequences) new_alignment = MultipleSeqAlignment(selected_sequences) if Remove_gap_only_columns: l = len(selected_sequences) ee = 0 final_alignment = [] for i in range(new_alignment.get_alignment_length()): s = new_alignment[:, i] if s.count('.') == l: ee += 1 else: if type(final_alignment) == list: final_alignment = new_alignment[:, i:i + 1] else: final_alignment += new_alignment[:, i:i + 1] print 'Removed gap only columns: ', ee print AlignIO.write(final_alignment, out_file, 'fasta') print final_alignment elif Remove_all_insertions: import re handle = open(out_file, 'w+') for a in new_alignment:
alignmentStart = AlignIO.read(open(filein, "r"), "fasta") # cree un nouvelle alignement avec que les souches voulus: keepListRecord = [] for record in alignmentStart: if record.id not in listKeepSouche and args.listKeepFile == "ALL": listKeepSouche.append(record.id) #print(record.id) if record.id in listKeepSouche: keepListRecord.append(record) tableauSoucheName.append(record.id) if record.id not in dicoSeqSNP.keys(): dicoSeqSNP[record.id] = "" alignment = MultipleSeqAlignment(keepListRecord) lenAlignement = int(alignment.get_alignment_length()) #print(alignment) #print(tableauSoucheName) #print(len(tableauSoucheName)) for indice in range(0, lenAlignement): tab = list(alignment[:, indice]) #print(tab) nbO = tab.count(tab[0]) nbA = tab.count("A") nbC = tab.count("C") nbT = tab.count("T") nbG = tab.count("G") nbN = tab.count("N") + tab.count("n") nbGap = tab.count("-") sommeACTG = nbA + nbC + nbT + nbG
class sequence_set(object):
    """sequence_set subsamples a set of sequences, aligns them and exports variability statistics

    Typical attributes (each is created by the method named in brackets):
      all_seqs        -- dict name -> record of all input sequences [__init__]
      seqs            -- dict id -> record of the current subsample [subsample]
      aln             -- alignment of the subsample [align / codon_align]
      sequence_lookup -- dict id -> aligned record [align / codon_align]
      proteins, translations -- annotations and amino acid alignments [translate]
      af, entropy     -- allele frequencies and entropies [diversity_statistics]
    """

    def __init__(self, fname=None, reference_seq=None, **kwarks):
        """
        Load sequences from the fasta file `fname`, or -- when 'virus' is in
        the keyword arguments -- through self.from_vdb (not defined in this
        class; presumably supplied by a subclass).
        If neither source is available a message is printed and the
        constructor returns before all_seqs, run_dir and reference_seq are set.

        reference_seq -- name of a loaded sequence or a record; stored as
                         self.reference_seq (None when not given)
        run_dir (keyword) -- scratch directory for the aligners; a unique
                             'temp_<timestamp>_<random>' name is used by default
        """
        super(sequence_set, self).__init__()
        self.kwarks = kwarks
        self.nthreads = 2
        if fname is not None and os.path.isfile(fname):
            with myopen(fname) as seq_file:
                self.all_seqs = {
                    x.name: x for x in SeqIO.parse(seq_file, 'fasta')
                }
        elif 'virus' in kwarks:
            self.from_vdb(kwarks['virus'])
        else:
            print('no input sequences found -- empty sequence set')
            return

        if 'run_dir' not in kwarks:
            import random
            self.run_dir = '_'.join([
                'temp',
                time.strftime('%Y%m%d-%H%M%S', time.gmtime()),
                str(random.randint(0, 1000000))
            ])
        else:
            self.run_dir = kwarks['run_dir']

        if reference_seq is not None:
            if isinstance(reference_seq, str) and reference_seq in self.all_seqs:
                self.reference_seq = self.all_seqs[reference_seq]
            else:
                self.reference_seq = reference_seq
        else:
            self.reference_seq = None

    def parse(self, fields, sep='|', strip='_'):
        '''
        split the sequence description and add annotations to sequences

        fields -- dict mapping the position of a word in the description
                  (split on `sep`, '>' removed, `strip` characters stripped
                  from both ends) to the attribute name it is stored under.
                  Words equal to "" or "-" are stored as "".
        If one of the fields is 'strain', all_seqs is re-keyed by strain and
        the id and name of every record are set to the strain.
        '''
        for seq in self.all_seqs.values():
            if not hasattr(seq, "attributes"):
                seq.attributes = {}
            words = [x.strip(strip)
                     for x in seq.description.replace(">", "").split(sep)]
            for ii, val in enumerate(words):
                if ii in fields:
                    seq.attributes[fields[ii]] = val if val not in ["", "-"] else ""

        if 'strain' in fields.values():
            self.all_seqs = {
                seq.attributes['strain']: seq
                for seq in self.all_seqs.values()
            }
            for seq in self.all_seqs.values():
                seq.id = seq.attributes['strain']
                seq.name = seq.attributes['strain']

    def ungap(self):
        '''
        remove previously existing gaps and make sure all sequences are upper case
        '''
        for seq in self.all_seqs.values():
            seq.seq = seq.seq.ungap('-').upper()

    def parse_date(self, fmts, prune=True):
        '''
        convert the 'date' attribute of every sequence

        fmts  -- iterable of strptime format strings and/or callables; they
                 are tried in order
        prune -- if True, drop all sequences whose date is missing or is
                 still a string afterwards

        On success 'raw_date' keeps the original string, 'date' holds the
        parsed value and 'num_date' its numeric form (an array of two values
        for dates containing 'XX').
        '''
        # look at one record only; works for dict views and lists alike and
        # does not fail on an empty set
        first = next(iter(self.all_seqs.values()), None)
        if first is not None and not hasattr(first, "attributes"):
            print("parse meta info first")
            return
        from datetime import datetime
        for seq in self.all_seqs.values():
            if 'date' in seq.attributes and seq.attributes['date'] != '':
                for fmt in fmts:
                    try:
                        if 'XX' in seq.attributes['date']:
                            min_date, max_date = ambiguous_date_to_date_range(
                                seq.attributes['date'], fmt)
                            seq.attributes['raw_date'] = seq.attributes['date']
                            seq.attributes['num_date'] = np.array(
                                (num_date(min_date), num_date(max_date)))
                            seq.attributes['date'] = min_date
                        else:
                            if callable(fmt):
                                tmp = fmt(seq.attributes['date'])
                            else:
                                try:
                                    tmp = datetime.strptime(
                                        seq.attributes['date'], fmt).date()
                                except Exception:
                                    # keep the raw string; num_date decides
                                    # below whether it is usable
                                    tmp = seq.attributes['date']
                            seq.attributes['raw_date'] = seq.attributes['date']
                            seq.attributes['num_date'] = num_date(tmp)
                            seq.attributes['date'] = tmp
                            break
                    except Exception:
                        # this format did not work, try the next one
                        continue
        if prune:
            self.filter(func=lambda x: 'date' in x.attributes
                        and type(x.attributes['date']) != str)

    def filter(self, func):
        '''keep only the sequences in all_seqs for which func(seq) is true'''
        self.all_seqs = {
            key: seq for key, seq in self.all_seqs.items() if func(seq)
        }

    def clock_filter(self, root_seq=None, n_iqd=3, max_gaps=1.0, plot=False):
        '''
        remove sequences form the set that are that evolve much faster or slower
        compared the majority. Regions with predominantly gaps can be removed since
        this can skew the evolutionary rates.

        root_seq -- sequence to measure distances from: None (use the
                    alignment consensus), the id of an aligned sequence, or an
                    array of characters
        n_iqd    -- sequences whose residual from the distance-vs-date
                    regression is n_iqd interquartile distances or more are removed
        max_gaps -- columns with a gap frequency of max_gaps or more are ignored
        plot     -- scatter plot kept (green) and removed (red) points

        The reference sequence is re-added if the filter would remove it.
        '''
        from Bio.Align import MultipleSeqAlignment
        from scipy.stats import linregress, scoreatpercentile

        if root_seq is None:  # use consensus
            af = calc_af(self.aln, nuc_alpha)
            root_seq = np.array(list(nuc_alpha))[af.argmax(axis=0)]
        if isinstance(root_seq, str) and root_seq in self.sequence_lookup:
            root_seq = np.array(self.sequence_lookup[root_seq])
        if max_gaps < 1.0:
            af = calc_af(self.aln, nuc_alpha)
            good_pos = af[nuc_alpha.index('-')] < max_gaps
        else:
            good_pos = np.ones(self.aln.get_alignment_length(), dtype=bool)

        # This class stores the reference as `reference_seq`; an externally
        # assigned `reference` attribute is still honoured when present.
        reference = getattr(self, 'reference', None)
        if reference is None:
            reference = getattr(self, 'reference_seq', None)
        ref_id = reference.id if reference is not None else None

        date_vs_distance = {}
        self.reference_aln = None
        for seq in self.aln:
            seq_array = np.array(seq)
            compared = (seq_array != '-') & (root_seq != '-') & good_pos
            date_vs_distance[seq.id] = (seq.attributes['num_date'],
                                        np.mean((seq_array != root_seq)[compared]))
            if seq.id == ref_id:
                self.reference_aln = seq

        date_vs_distance_array = np.array(list(date_vs_distance.values()))
        slope, intercept, rval, pval, stderr = linregress(
            date_vs_distance_array[:, 0], date_vs_distance_array[:, 1])
        print("distance vs time regression:", slope)
        residuals = (intercept + slope * date_vs_distance_array[:, 0]
                     ) - date_vs_distance_array[:, 1]
        IQD = scoreatpercentile(residuals, 75) - scoreatpercentile(
            residuals, 25)
        if plot:
            import matplotlib.pyplot as plt
            plt.ion()
            plt.scatter(date_vs_distance_array[:, 0],
                        date_vs_distance_array[:, 1], c='g')
            bad_points = abs(intercept + slope * date_vs_distance_array[:, 0]
                             - date_vs_distance_array[:, 1]) > n_iqd * IQD
            plt.scatter(date_vs_distance_array[bad_points, 0],
                        date_vs_distance_array[bad_points, 1], c='r')

        print("before clock filter:", len(self.aln))
        tmp = {
            seq.id: seq
            for seq in self.aln
            if abs(intercept + slope * date_vs_distance[seq.id][0]
                   - date_vs_distance[seq.id][1]) < n_iqd * IQD
        }
        if ref_id not in tmp and self.reference_aln is not None:
            print('adding reference again after clock filter')
            tmp[ref_id] = self.reference_aln
        self.aln = MultipleSeqAlignment(list(tmp.values()))
        print("after clock filter:", len(self.aln))

    def subsample(self, category=None, priority=None, threshold=None,
                  repeated=False, forced_strains=None):
        '''
        produce a useful set of sequences from the raw input.
        arguments:
        category  -- callable that assigns each sequence to a category for subsampling
                     (default: year and month of the 'date' attribute)
        priority  -- callable that assigns each sequence a priority to be included in
                     the final sample. this is applied independently in each category
                     (default: random)
        threshold -- callable that determines the number of sequences from each category
                     that is included in the final set. takes arguments, cat and seq
                     alternatively can be an int (default: 5 per category)
                     NOTE: it is called once with `cat` (to compute the degree
                     of under-sampling) and once with the tuple `(cat, seqs)`
                     (to cut the list), so a callable has to accept both.
        repeated  -- subsample the previous subsample (self.seqs) instead of all_seqs
        forced_strains -- list of of strain names that should always be included (set to high priorty)

        The result is stored in self.seqs; every sequence of a category gets
        `under_sampling` = min(1, category size / threshold).
        '''
        if forced_strains is None:
            forced_strains = ()
        # define filter criteria if not specified
        if category is None:
            category = lambda x: (x.attributes['date'].year,
                                  x.attributes['date'].month)
        if priority is None:
            priority = lambda x: np.random.random()
        if threshold is None:
            threshold = lambda x: 5
        elif isinstance(threshold, int):
            print("using threshold:", threshold)
            tmp = threshold
            threshold = lambda x: tmp

        # if we do repeated subsampling, subsamples seqs, otherwise all_seqs
        self.sequence_categories = defaultdict(list)
        if repeated:
            seqs_to_subsample = self.seqs.values()
        else:
            seqs_to_subsample = self.all_seqs.values()

        # sort sequences into categories and assign priority score
        for seq in seqs_to_subsample:
            seq._priority = priority(seq)
            if seq.id in forced_strains:
                seq._priority = 1.0
            self.sequence_categories[category(seq)].append(seq)

        # sample and record the degree to which a category is under_sampled
        self.seqs = {}
        for cat, seqs in self.sequence_categories.items():
            under_sampling = min(1.00, 1.0 * len(seqs) / threshold(cat))
            for s in seqs:
                s.under_sampling = under_sampling
            seqs.sort(key=lambda x: x._priority, reverse=True)
            self.seqs.update(
                {seq.id: seq for seq in seqs[:threshold((cat, seqs))]})

    def align(self):
        '''
        align sequences using mafft

        Works inside self.run_dir (removed afterwards). The reference
        sequence is always part of the mafft input but is only kept in
        self.aln if it is part of the subsample. The working directory is
        restored even when the alignment fails.
        '''
        from Bio import AlignIO
        from Bio.Align import MultipleSeqAlignment
        make_dir(self.run_dir)
        start_dir = os.getcwd()
        os.chdir(self.run_dir)
        try:
            ref_in_set = self.reference_seq.name in self.seqs
            out_seqs = list(self.seqs.values())
            if not ref_in_set:
                out_seqs.append(self.reference_seq)
            print("align: reference in set", ref_in_set)
            SeqIO.write(out_seqs, "temp_in.fasta", "fasta")
            os.system("mafft --anysymbol --thread " + str(self.nthreads) +
                      " temp_in.fasta > temp_out.fasta")

            tmp_aln = AlignIO.read('temp_out.fasta', 'fasta')
            self.sequence_lookup = {seq.id: seq for seq in tmp_aln}
            # add attributes to alignment
            for seqid, seq in self.seqs.items():
                self.sequence_lookup[seqid].attributes = seq.attributes
            self.aln = MultipleSeqAlignment([
                s for s in tmp_aln
                if s.name != self.reference_seq.name or ref_in_set
            ])
        finally:
            os.chdir(start_dir)
        remove_dir(self.run_dir)

    def codon_align(self, alignment_tool="mafft", prune=True, verbose=0):
        '''
        takes a nucleotide alignment, translates it, aligns the amino acids, pads the gaps
        note that this suppresses any compensated frameshift mutations

        Parameters:
         - alignment_tool: ['mafft', 'muscle'] the commandline tool to use
         - prune: discard sequences with a stop codon before the last codon
         - verbose: report every discarded sequence

        For an unsupported tool a message is printed and nothing is aligned.
        The working directory is restored in every case.
        '''
        from Bio import AlignIO, SeqIO
        from Bio.SeqRecord import SeqRecord
        make_dir(self.run_dir)
        start_dir = os.getcwd()
        os.chdir(self.run_dir)
        try:
            # translate
            aa_seqs = {}
            bad_seq = 0
            for seq in self.seqs.values():
                tempseq = seq.seq.translate()
                has_stop = '*' in str(tempseq)[:-1]
                # use only sequences that translate with out trouble
                if not has_stop or prune == False:
                    aa_seqs[seq.id] = SeqRecord(tempseq, id=seq.id)
                    aa_seqs[seq.id].attributes = seq.attributes
                elif verbose:
                    print(seq.id, "has premature stops, discarding")
                bad_seq += has_stop
            print('Number of sequences with stops:', bad_seq, 'out of total',
                  len(self.seqs))

            tmpfname = 'temp_in.fasta'
            SeqIO.write(aa_seqs.values(), tmpfname, 'fasta')

            if alignment_tool == 'muscle':
                from Bio.Align.Applications import MuscleCommandline
                cline = MuscleCommandline(input=tmpfname,
                                          out=tmpfname[:-5] + 'aligned.fasta')
                cline()
                aln_aa = AlignIO.read(tmpfname[:-5] + 'aligned.fasta', "fasta")
            elif alignment_tool == 'mafft':
                from Bio.Align.Applications import MafftCommandline
                try:
                    from StringIO import StringIO  # Python 2
                except ImportError:
                    from io import StringIO
                mafft_cline = MafftCommandline(input=tmpfname)
                stdout, stderr = mafft_cline()
                aln_aa = AlignIO.read(StringIO(stdout), "fasta")
            else:
                print('Alignment tool not supported:', alignment_tool)
                return

            # generate nucleotide alignment
            self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
            self.sequence_lookup = {seq.id: seq for seq in self.aln}
            # add attributes to alignment
            for seq in self.seqs.values():
                if seq.id in self.sequence_lookup:
                    self.sequence_lookup[seq.id].attributes = seq.attributes
        finally:
            os.chdir(start_dir)
        remove_dir(self.run_dir)

    def strip_non_reference(self):
        '''remove all alignment columns in which the reference sequence has a gap'''
        ungapped = np.array(
            self.sequence_lookup[self.reference_seq.name]) != '-'
        from Bio.Seq import Seq
        for seq in self.aln:
            seq.seq = Seq("".join(np.array(seq)[ungapped]))

    def diversity_statistics(self):
        '''
        calculate alignment entropy of nucleotide and optionally protein alignments

        Results are stored in self.af and self.entropy under the key 'nuc'
        and, if translations exist, under the name of each protein. The last
        two rows of each frequency table are left out of the entropy.
        '''
        if not hasattr(self, "aln"):
            print("calculate alignment first")
            return
        self.af = {'nuc': calc_af(self.aln, nuc_alpha)}
        tmp_af = self.af['nuc'][:-2] / self.af['nuc'][:-2].sum(axis=0)
        self.entropy = {'nuc': -(tmp_af * np.log(tmp_af + TINY)).sum(axis=0)}

        if hasattr(self, "translations"):
            for prot, aln in self.translations.items():
                self.af[prot] = calc_af(aln, aa_alpha)
                tmp_af = self.af[prot][:-2] / self.af[prot][:-2].sum(axis=0)
                self.entropy[prot] = -(tmp_af *
                                       np.log(tmp_af + TINY)).sum(axis=0)

    def translate(self, proteins=None):
        '''
        make alignment of translations

        proteins -- dict name -> feature location to add to self.proteins.
                    Without any annotation the whole alignment is translated
                    as 'cds'. Calling again without arguments re-translates
                    the annotations that are already stored.
        '''
        from Bio.SeqFeature import FeatureLocation
        from Bio.Seq import Seq
        from Bio.Align import MultipleSeqAlignment
        if not hasattr(self, "proteins"):
            # generate dictionaries to hold annotation and translation
            self.translations = {}
            self.proteins = {}

        # add a default translation of the entire sequence unless otherwise specified
        if proteins is None and len(self.proteins) == 0:
            self.proteins.update({
                'cds': FeatureLocation(start=0,
                                       end=self.aln.get_alignment_length(),
                                       strand=1)
            })
        elif proteins is not None:
            self.proteins.update(proteins)

        for prot in self.proteins:
            aa_seqs = []
            for seq in self.aln:
                tmpseq = self.proteins[prot].extract(seq)
                if hasattr(seq, "attributes"):
                    tmpseq.attributes = seq.attributes
                nucleotides = str(tmpseq.seq)
                try:
                    # soon not needed as future biopython version will translate --- into -
                    amino_acids = Seq(nucleotides.replace('---', 'NNN')).translate()
                except Exception:
                    # mask everything that is not a plain nucleotide and retry
                    amino_acids = Seq("".join(
                        x if x in 'ACGT' else 'N' for x in nucleotides
                    )).translate()
                    print("Trouble translating", seq.id)
                tmpseq.seq = Seq(str(amino_acids).replace('X', '-'))
                aa_seqs.append(tmpseq)
            self.translations[prot] = MultipleSeqAlignment(aa_seqs)

    def export_diversity(self, fname='entropy.json'):
        '''
        write the alignment entropy of each alignment (nucleotide and translations) to file

        Each entry holds 'pos', 'codon' and 'val' (entropy rounded to four
        digits and clipped at zero).
        '''
        if not hasattr(self, "entropy"):
            self.diversity_statistics()
        entropy_json = {}
        for feat in self.entropy:
            S = [max(0, round(x, 4)) for x in self.entropy[feat]]
            n = len(S)
            if feat == 'nuc':
                entropy_json[feat] = {
                    # plain lists: a range object cannot be serialised
                    'pos': list(range(0, n)),
                    'codon': [x // 3 for x in range(0, n)],
                    'val': S
                }
            else:
                entropy_json[feat] = {
                    'pos': [x for x in self.proteins[feat]][::3],
                    'codon': [(x - self.proteins[feat].start) // 3
                              for x in self.proteins[feat]][::3],
                    'val': S
                }
        write_json(entropy_json, fname, indent=None)
class GenericAlign(object):
    """Hold an alignment read from `input` and derive trimmed versions of it.

    Attributes:
        input -- path of the alignment file
        alignment -- the alignment as read by _read()
        trimmed_alignment -- result of trim_alignment() (None when dropped)
        perfect_trimmed_alignment -- result of trim_ambiguous_bases()
        ploc -- (start, end) columns of the probe, set by get_probe_location()
    """

    def __init__(self, input):
        self.input = input
        self.alignment = None
        self.trimmed_alignment = None
        self.perfect_trimmed_alignment = None
        # trim_alignment() tests self.ploc, so it has to exist even when
        # get_probe_location() was never called
        self.ploc = None

    def _clean(self, outtemp):
        """remove the temp file(s) in outtemp and, if possible, the input file"""
        if type(outtemp) is list:
            for f in outtemp:
                os.remove(f)
        else:
            os.remove(outtemp)
        # cleanup temp file - best effort only
        try:
            os.remove(self.input)
        except (OSError, TypeError):
            pass

    def _find_ends(self, forward=True):
        """determine the first (or last) position where all reads in an
        alignment start/stop matching

        Returns the index of the first gap-free column seen from the left
        (forward) or from the right. If every column contains a gap the last
        column examined is returned; None for an alignment without columns."""
        columns = range(self.alignment.get_alignment_length())
        if not forward:
            columns = reversed(columns)
        col = None
        for col in columns:
            if '-' not in self.alignment[:, col]:
                break
        return col

    def _base_checker(self, bases, sequence, loc):
        """ensure that any trimming that occurs does not start beyong the end
        of the sequence being trimmed

        loc is either a single position or a (start, end) pair. Returns True
        when at least `bases` positions exist before start and after end."""
        # deal with the case where we just want to measure out from the
        # middle of a particular sequence
        try:
            n_coordinates = len(loc)
        except TypeError:
            # a plain position such as the integer mid point
            loc = (loc, loc)
        else:
            if n_coordinates == 1:
                loc = (loc[0], loc[0])
        return (bases <= len(sequence.seq[:loc[0]]) and
                bases <= len(sequence.seq[loc[1]:]))

    def _record_formatter(self, temp):
        """return a string formatted as a biopython sequence record"""
        temp_record = SeqRecord(temp)
        return temp_record

    def _alignment_summary(self, alignment):
        """return summary data for an alignment object using the AlignInfo
        class from BioPython"""
        summary = AlignInfo.SummaryInfo(alignment)
        consensus = summary.dumb_consensus()
        return summary, consensus

    def _read(self, format):
        """read an alignment from the CLI - largely for testing purposes"""
        # hand the path to AlignIO so the file is opened and closed there
        self.alignment = AlignIO.read(self.input, format)

    def get_probe_location(self):
        '''Pull the probe sequence from an alignment object and determine its
        position within the read

        Sets self.ploc to (number of leading gap characters, index where the
        trailing gap characters start) of the record with id 'probe'.
        Raises ValueError if the alignment holds no such record.'''
        # probe at bottom => reverse order
        for record in self.alignment[::-1]:
            if record.id == 'probe':
                probe = str(record.seq)
                leading = re.search('^-*', probe)
                trailing = re.search('-*$', probe)
                self.ploc = (leading.end(), trailing.start(),)
                return
        raise ValueError("no record with id 'probe' in the alignment")

    def running_average(self, window_size, threshold, proportion=0.3, k=None,
                        running_probe=False):
        """Return (start, end) of the region whose windowed column identity
        is at least `threshold`, or (None, None) if there is no such window.

        Every column scores 1 when all compared entries are identical and 0
        otherwise. Without running_probe, gaps are ignored in columns where
        at most round(proportion * number of sequences) entries are gaps;
        with running_probe the entry of row `k` (the probe) is left out."""
        # iterate across the columns of the alignment and determine presence
        # or absence of base-identity in the column
        differences = []
        members = len(self.alignment)
        for column in range(self.alignment.get_alignment_length()):
            column_list = list(self.alignment[:, column])
            if running_probe:
                # drop the index of the probe from the column values
                del column_list[k]
            elif column_list.count('-') <= int(round(proportion * members, 0)):
                # use proportional removal of gaps
                column_list = [i for i in column_list if i != '-']
            # converting to a set gets only the unique values
            differences.append(0 if len(set(column_list)) > 1 else 1)
        if not differences:
            return None, None
        differences = numpy.array(differences)
        weight = numpy.repeat(1.0, window_size) / window_size
        full = numpy.convolve(differences, weight)
        # keep the fully overlapping windows only; the explicit stop index
        # also works for window_size == 1, where "-(window_size - 1)" is -0
        averages = full[window_size - 1:len(full) - (window_size - 1)]
        good = numpy.where(averages >= threshold)[0]
        # remember to add window size onto end of trim
        try:
            start_clip, end_clip = good[0], good[-1] + window_size
        except IndexError:
            start_clip, end_clip = None, None
        return start_clip, end_clip

    def trim_alignment(self, method='edges', remove_probe=None, bases=None,
                       consensus=True, window_size=20, threshold=0.5,
                       proportion=0.3):
        """Trim the alignment

        method -- 'edges' (first/last gap-free column), 'running' or
                  'running-probe' (see running_average), 'static' (`bases`
                  positions on each side of the mid point) or 'notrim'
        remove_probe -- cut the probe region self.ploc out of every sequence
        consensus -- also store summary and consensus of the result

        The result is stored in self.trimmed_alignment; it is None when the
        alignment cannot be trimmed, and a message is printed whenever the
        result is None or empty."""
        if method == 'edges':
            # find edges of the alignment
            # NOTE(review): `end` is the index of the last gap-free column
            # and is used as an exclusive slice bound below, so that column
            # is not part of the result - confirm this is intended
            start = self._find_ends(forward=True)
            end = self._find_ends(forward=False)
        elif method == 'running':
            start, end = self.running_average(window_size, threshold,
                                              proportion=proportion)
        elif method == 'running-probe':
            # get position of probe
            for k, v in enumerate(self.alignment):
                if v.name == 'probe':
                    break
            start, end = self.running_average(window_size, threshold,
                                              proportion, k, True)
        if method == 'notrim':
            self.trimmed_alignment = self.alignment
        else:
            # create a new alignment object to hold our alignment
            self.trimmed_alignment = MultipleSeqAlignment(
                [], Gapped(IUPAC.ambiguous_dna, "-"))
            for sequence in self.alignment:
                if method in ('edges', 'running', 'running-probe') \
                        and not remove_probe:
                    if start is not None and start >= 0 and end:
                        self.trimmed_alignment.append(sequence[start:end])
                    else:
                        self.trimmed_alignment = None
                        break
                elif method == 'static' and not remove_probe and bases:
                    # get middle of alignment and trim out from that - there's a
                    # weakness here in that we are not actually locating the probe
                    # region, we're just locating the middle of the alignment
                    mid_point = len(sequence) // 2
                    if self._base_checker(bases, sequence, mid_point):
                        # records are added through the private list so that
                        # they bypass the checks of the public append()
                        self.trimmed_alignment._records.append(
                            sequence[mid_point - bases:mid_point + bases]
                        )
                    else:
                        # one sequence is too short: drop the whole alignment
                        self.trimmed_alignment = None
                        break
                elif method == 'static' and not remove_probe and bases and self.ploc:
                    # NOTE(review): unreachable - the previous branch already
                    # matches every 'static' call without remove_probe; kept
                    # so the probe-centred variant is not lost
                    if self._base_checker(bases, sequence, self.ploc):
                        self.trimmed_alignment._records.append(
                            sequence[self.ploc[0] - bases:self.ploc[1] + bases]
                        )
                    else:
                        self.trimmed_alignment = None
                        break
                elif remove_probe and self.ploc:
                    # we have to drop to sequence level to add sequence slices
                    # where we basically slice around the probes location
                    temp = sequence.seq[:self.ploc[0]] + sequence.seq[self.ploc[1]:]
                    self.trimmed_alignment._records.append(
                        self._record_formatter(temp)
                    )
                elif method == 'static' and remove_probe and bases and self.ploc:
                    # NOTE(review): unreachable - shadowed by the branch above
                    if self._base_checker(bases, sequence, self.ploc):
                        temp = sequence.seq[self.ploc[0] - bases:self.ploc[0]] + \
                            sequence.seq[self.ploc[1]:self.ploc[1] + bases]
                        self.trimmed_alignment._records.append(
                            self._record_formatter(temp)
                        )
                    else:
                        self.trimmed_alignment = None
                        break
        # build a dumb consensus
        if consensus and self.trimmed_alignment:
            self.trimmed_alignment_summary, self.trimmed_alignment_consensus = \
                self._alignment_summary(self.trimmed_alignment)
        if not self.trimmed_alignment:
            description = self.alignment._records[0].description
            parts = description.split('|')
            # fall back to the full description when it has no '|' field
            label = parts[1] if len(parts) > 1 else description
            print("\tAlignment {0} dropped due to trimming".format(label))

    def trim_ambiguous_bases(self):
        """snip ambiguous bases from a trimmed_alignment

        Stores in self.perfect_trimmed_alignment the largest block of
        consecutive columns without an 'N' (the first one on ties), the
        trimmed alignment itself when it has no 'N' at all or is empty/None,
        and None when no column is free of 'N'."""
        if not self.trimmed_alignment:
            self.perfect_trimmed_alignment = self.trimmed_alignment
            return
        # do this by finding all ambiguous bases and then snipping the largest
        # chunk with no ambiguous bases from the entire alignment
        length = self.trimmed_alignment.get_alignment_length()
        ambiguous_bases = [column for column in range(length)
                           if 'N' in self.trimmed_alignment[:, column]]
        if not ambiguous_bases:
            self.perfect_trimmed_alignment = self.trimmed_alignment
            return
        # Sentinels just outside the alignment let the chunks before the
        # first and after the last ambiguous column compete as well, without
        # costing them their first/last column.
        bounds = [-1] + ambiguous_bases + [length]
        maximum = 0
        best_block = None
        for left, right in zip(bounds, bounds[1:]):
            clean_columns = right - left - 1
            if clean_columns > maximum:
                maximum = clean_columns
                best_block = (left + 1, right)
        # make sure we catch cases where there is not best block
        if best_block is None:
            self.perfect_trimmed_alignment = None
            return
        # create a new alignment object to hold our alignment
        self.perfect_trimmed_alignment = \
            MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
        for sequence in self.trimmed_alignment:
            self.perfect_trimmed_alignment.append(
                sequence[best_block[0]:best_block[1]])
def rm_wrong_polymorphism_sites(seq_directory, outgroup_path, window_size = 20, Max_p_sites = 4):
    """Remove suspiciously dense clusters of polymorphic sites from every gene alignment.

    Input alignments are read from the "s6_trimal/" counterpart of
    seq_directory; results go to "<s7_well_trimal counterpart>/s1_rm_polymorphism_sites/".
    For each alignment the outgroup records are left out, the remaining
    columns are scanned with the project helper window() in windows of
    window_size columns, and every window holding more than Max_p_sites
    flagged columns (see _is_wrong_polymorphic_column) contributes those
    columns to the removal list. The first and last 10 columns are always
    added. The listed columns are then passed to `trimal -selectcols` on the
    original file; a file without any listed column is copied unchanged.

    Parameters:
        seq_directory -- path containing "s1_Gene/"
        outgroup_path -- passed to input_outgroup() to get the outgroup ids
        window_size   -- number of columns per sliding window
        Max_p_sites   -- a window is flagged when it has more sites than this
    """
    import shutil
    import subprocess

    ### define iupac
    iupac_bases = set("mrwsykMRWSYKvhdbVHDB")
    ### input files are from s6
    genes_result_s6 = seq_directory.replace("s1_Gene/", "s6_trimal/")
    ### output directory for s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")
    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)
    output_directory = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)

    ### iterate each gene
    for file_name in os.listdir(genes_result_s6):
        if file_name == ".DS_Store":
            continue
        output_directory_file = output_directory + file_name
        fasta_name = genes_result_s6 + file_name
        ### read each alignment
        for sequence in glob(fasta_name):
            print("sequence: " + sequence)
            alignment = AlignIO.read(sequence, 'fasta')
            ### generate a new alignment without outgroups.
            align = MultipleSeqAlignment([])
            for record in alignment:
                if record.id not in outgroups:
                    align.extend([SeqRecord(Seq(str(record.seq)), id=str(record.id))])
            print(align)

            ### change alignment to an array.
            align_array = np.array([list(rec) for rec in align])
            ### calculate the whole length of the alignment
            total_length = align.get_alignment_length()

            total_wrong_poly_sites = []
            ### sliding windows: collect the flagged columns of each window
            for each in window(range(total_length), window_size):
                column_position = [
                    column for column in each
                    if _is_wrong_polymorphic_column(Counter(align_array[:, column]), iupac_bases)
                ]
                ### more than Max_p_sites flagged sites in one window:
                ### remember their positions.
                if len(column_position) > float(Max_p_sites):
                    print(column_position)
                    total_wrong_poly_sites.extend(column_position)

            ### always drop the 10 columns at either end; clamp so that short
            ### alignments never produce negative or out-of-range columns
            total_wrong_poly_sites.extend(range(min(10, total_length)))
            total_wrong_poly_sites.extend(range(max(0, total_length - 10), total_length))
            ### unique positions as plain ints, so they print as bare numbers
            unique_wrong_sites = sorted(set(int(site) for site in total_wrong_poly_sites))
            print(len(unique_wrong_sites))

            ### if any column is listed, use trimal to remove those sites.
            ### otherwise, copy the gene to the new folder.
            if unique_wrong_sites:
                selected_columns = ",".join(str(site) for site in unique_wrong_sites)
                cmd_selected_col = "\\{ " + selected_columns + " \\}"
                print(cmd_selected_col)
                cmd = "trimal -in " + fasta_name + " -out " + output_directory_file + " -selectcols " + cmd_selected_col
                print(cmd)
                # same arguments the shell would build from `cmd`, passed as a
                # list so that file names are never interpreted by a shell
                status = subprocess.call([
                    "trimal", "-in", fasta_name, "-out", output_directory_file,
                    "-selectcols", "{", selected_columns, "}",
                ])
                if status != 0:
                    print("trimal exited with status " + str(status))
            else:
                cmd_2 = "cp " + fasta_name + " " + output_directory_file
                print(cmd_2)
                shutil.copy(fasta_name, output_directory_file)


def _is_wrong_polymorphic_column(counter, iupac_bases):
    """Decide whether one alignment column counts towards the per-window limit.

    counter maps each symbol of the column to its number of occurrences.
      1 distinct symbol  -> not counted
      2 distinct symbols -> counted only without a gap and without any
                            IUPAC ambiguity code
      3 distinct symbols -> counted, except for gap + exactly one IUPAC code
      4 or more          -> counted
    """
    n_symbols = len(counter)
    if n_symbols == 1:
        return False
    # a gap anywhere in the column, not only as its least frequent symbol
    has_gap = "-" in counter
    n_iupac = sum(1 for symbol in counter if symbol in iupac_bases)
    if n_symbols == 2:
        return (not has_gap) and n_iupac == 0
    if n_symbols == 3:
        return not (has_gap and n_iupac == 1)
    return True
% (repr(given_alpha), t_filename) except ValueError: # Good - should fail pass h.close() del good, bad, given_alpha, base_alpha if t_alignment: print("Testing reading %s format file %s as an alignment" % (t_format, t_filename)) alignment = MultipleSeqAlignment(SeqIO.parse( handle=t_filename, format=t_format)) assert len(alignment) == t_count alignment_len = alignment.get_alignment_length() # Check the record order agrees, and double check the # sequence lengths all agree too. for i in range(t_count): assert compare_record(records[i], alignment[i]) assert len(records[i].seq) == alignment_len print(alignment_summary(alignment)) # Some alignment file formats have magic characters which mean # use the letter in this position in the first sequence. # They should all have been converted by the parser, but if # not reversing the record order might expose an error. Maybe. records.reverse() check_simple_write_read(records)
def complete_from_consensus(true_seq_aln, cod_align, edited_pos, gcode=None, only_ed_G=False):
    """Revise and extend recorded C-to-T edit positions using the consensus.

    For every alignment column for which ``at_least_one_ed`` reports a codon
    column, each sequence is handled in one of two ways:

    * If it already has a recorded edit inside this codon, that edit is
      re-checked and dropped when the codon base is not ``C`` or when the
      ``C -> T`` translation is a known residue that differs from the
      consensus.
    * Otherwise (and if edits are allowed for this sequence), a new edit is
      recorded when changing the codon reproduces the consensus residue.

    Args:
        true_seq_aln: Iterable of sequence records; wrapped in a
            ``MultipleSeqAlignment``. Its consensus residues are compared
            with codon translations.
        cod_align: Mapping from record name to the matching codon-level
            record; ``[pos * 3:pos * 3 + 3]`` is read as the codon of
            alignment column ``pos``.
        edited_pos: Mapping from record name to a list of position tuples
            whose last element is the nucleotide coordinate in the codon
            alignment. Modified in place.
        gcode: Optional mapping from record name to a genetic-code id used to
            index ``CodonTable.unambiguous_dna_by_id``; missing names use 1.
        only_ed_G: When true, new edits are only added to sequences that
            already have at least one recorded edit.

    Returns:
        ``edited_pos``, with every value deduplicated and sorted.
    """
    if gcode is None:
        gcode = {}

    true_seq_aln = MultipleSeqAlignment(list(true_seq_aln))
    summary_align = AlignInfo.SummaryInfo(true_seq_aln)
    consensus = summary_align.dumb_consensus(threshold=0.5)

    for pos in range(true_seq_aln.get_alignment_length()):
        cons_aa = consensus[pos]
        # Presumably the codon column (0-2) of an edit some sequence already
        # has at this position, or None -- verify against at_least_one_ed.
        known_ed_col = at_least_one_ed(edited_pos, pos)
        if known_ed_col is None:
            continue

        for seqrec in true_seq_aln:
            name = seqrec.name
            ctable = CodonTable.unambiguous_dna_by_id[gcode.get(name, 1)]
            ed_allowed = not only_ed_G or len(edited_pos.get(name, [])) > 0
            # Count of non-gap residues up to and including this column.
            # NOTE(review): because the count includes the current residue,
            # nuc_pos * 3 lands one codon further than a 0-based offset
            # would -- confirm the intended coordinate convention.
            nuc_pos = len(str(seqrec[:pos + 1].seq).replace("-", ""))
            # Alignment coordinates of edits already recorded in this codon.
            in_pos = [x[-1] for x in edited_pos[name] if x[-1] // 3 == pos]

            if in_pos:
                wg_pos = in_pos[0]
                # Here we attempt to slightly correct a wrong position.
                wg_cod = list(str(cod_align[name][pos * 3:pos * 3 + 3].seq))
                if wg_cod[wg_pos % 3] != 'C':
                    drop_edit = True
                else:
                    wg_cod[wg_pos % 3] = 'T'
                    wg_aa = ctable.forward_table.get("".join(wg_cod), 'X')
                    drop_edit = wg_aa != 'X' and wg_aa != cons_aa
                if drop_edit:
                    # Entries are tuples, so match on the alignment coordinate
                    # (last element); comparing the whole tuple with the int
                    # wg_pos never removed anything.
                    edited_pos[name] = [
                        x for x in edited_pos[name] if x[-1] != wg_pos
                    ]

            if ed_allowed and not in_pos:
                codon = list(str(cod_align[name][pos * 3:pos * 3 + 3].seq))
                cmut = list(codon)
                cmut[known_ed_col] = 'T'
                if (codon[known_ed_col] == 'C' and cons_aa != 'X'
                        and ctable.forward_table.get("".join(cmut), 'X') == cons_aa):
                    # The edit seen elsewhere in this column also explains
                    # this sequence.
                    edited_pos[name].append(
                        (nuc_pos * 3 + known_ed_col, pos * 3 + known_ed_col))
                else:
                    # Otherwise take the first candidate codon that
                    # translates to the consensus residue.
                    for codmut in editing_yielder(codon):
                        if codmut and ctable.forward_table.get(codmut, 'X') == cons_aa:
                            edited_pos[name].extend([
                                (npos + nuc_pos * 3, pos * 3 + npos)
                                for npos, mnuc in enumerate(codmut)
                                if codon[npos] != mnuc
                            ])
                            break

    for k, v in edited_pos.items():
        edited_pos[k] = sorted(set(v))
    return edited_pos