Example #1
 def check_matching(self) -> tuple:
     init_clstr = defaultdict(list)
     dichord_list = []
     converted_names = {}
     related_aligns = None
     for n, tip in enumerate(self.tree.get_terminals()):
         tip_name = tip.name
         try:
             seq_record = self.aligns_as_seqs[tip_name]
             dichord = TipSeqLinker(
                 seq_record,
                 (self.tree.root, *self.tree.get_path(tip))
             )
         except KeyError:
             raise TipNotMatchedError(tip)
         init_clstr[tip].append(dichord)
         dichord_list.append(dichord)
         new_seq_id = 'seq{}'.format(n)
         converted_names[tip_name] = new_seq_id
         converted_names[new_seq_id] = tip_name
         if related_aligns is None:
             related_aligns = MultipleSeqAlignment([seq_record])
         else:
             related_aligns.extend([seq_record])
     return (
         init_clstr, dichord_list, converted_names,
         tuple(range(related_aligns.get_alignment_length()))
     )
Example #2
    def test_proteins(self):
        alpha = HasStopCodon(Gapped(generic_protein, "-"), "*")
        a = MultipleSeqAlignment([
            SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-", alpha),
                      id="ID001"),
            SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*", alpha),
                      id="ID002"),
            SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*", alpha),
                      id="ID003")
        ])
        self.assertEqual(32, a.get_alignment_length())

        s = SummaryInfo(a)

        c = s.dumb_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")

        c = s.gap_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")

        m = s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
        self.assertEqual(
            str(m),
            """    A   D   E   F   G   H   I   K   L   M   N   P   Q   R   S   W   Y
M  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
H  0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0
X  2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
F  0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
L  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
K  0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
R  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
P  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0
E  0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
N  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0
W  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
""")

        ic = s.information_content(chars_to_ignore=['-', '*'])
        self.assertAlmostEqual(ic, 133.061475107, places=6)
Example #3
def removecolumnfrommask(seqfile, filetype, mask):
    outFile = open(seqfile.split('.')[0] + '_masked.fas', 'w+')
    alignment = AlignIO.read(seqfile, filetype)
    trimAlign = MultipleSeqAlignment([])
    numCol = alignment.get_alignment_length()
    colToKeep = []
    coltoremove = []

    for k in open(mask, 'r'):
        coltoremove.append(int(k.split('\n')[0]))
    print(len(coltoremove))

    for i in range(numCol):
        if i not in coltoremove:
            colToKeep.append(i)
    print(len(colToKeep))
    print('if okay remove+keep (', int(len(coltoremove) + len(colToKeep)),
          ') match ', int(numCol))
    for record in alignment:
        newseq = ""
        for j in colToKeep:
            newseq = newseq + (record[j])

        newRecord = SeqRecord(Seq(newseq), id=record.id)
        trimAlign.append(newRecord)
        if 'SWARM' in record.id:
            outFile.write('>' + record.id.split('_')[0] + '\n' + newseq + '\n')
        else:
            outFile.write('>' + record.id + '\n' + newseq + '\n')
    outFile.close()
    print("Total number of columns remaining: %i" %
          trimAlign.get_alignment_length())
Example #4
 def test_basic_alignment(self):
     """Basic tests on a simple alignment of three sequences."""
     alignment = MultipleSeqAlignment([])
     letters = "AbcDefGhiJklMnoPqrStuVwxYz"
     alignment.append(SeqRecord(Seq(letters), id="mixed"))
     alignment.append(SeqRecord(Seq(letters.lower()), id="lower"))
     alignment.append(SeqRecord(Seq(letters.upper()), id="upper"))
     self.assertEqual(alignment.get_alignment_length(), 26)
     self.assertEqual(len(alignment), 3)
     self.assertEqual(str(alignment[0].seq), letters)
     self.assertEqual(str(alignment[1].seq), letters.lower())
     self.assertEqual(str(alignment[2].seq), letters.upper())
     self.assertEqual(alignment[0].id, "mixed")
     self.assertEqual(alignment[1].id, "lower")
     self.assertEqual(alignment[2].id, "upper")
     for (col, letter) in enumerate(letters):
         self.assertEqual(alignment[:, col],
                          letter + letter.lower() + letter.upper())
     # Check row extractions:
     self.assertEqual(alignment[0].id, "mixed")
     self.assertEqual(alignment[-1].id, "upper")
     # Check sub-alignment extraction by row slicing:
     self.assertIsInstance(alignment[::-1], MultipleSeqAlignment)
     self.assertEqual(alignment[::-1][0].id, "upper")
     self.assertEqual(alignment[::-1][2].id, "mixed")
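Outside the unittest harness, the same calls can be exercised directly; a minimal standalone sketch (record ids and sequences are made up):

from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Build a tiny two-record alignment; all rows must have the same length.
aln = MultipleSeqAlignment([
    SeqRecord(Seq("ACGT-ACGT"), id="seq1"),
    SeqRecord(Seq("ACGTTACGT"), id="seq2"),
])

print(aln.get_alignment_length())  # 9: number of columns in the alignment
print(aln[:, 4])                   # column 4 across all records: "-T"
print(len(aln[::-1]))              # 2: row slicing returns a MultipleSeqAlignment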
Example #5
    def test_proteins(self):
        a = MultipleSeqAlignment([
            SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-"), id="ID001"),
            SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*"), id="ID002"),
            SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*"), id="ID003")
        ])
        self.assertEqual(32, a.get_alignment_length())

        s = SummaryInfo(a)

        c = s.dumb_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")

        c = s.gap_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")

        m = s.pos_specific_score_matrix(chars_to_ignore=["-", "*"], axis_seq=c)
        self.assertEqual(
            str(m),
            """    A   D   E   F   G   H   I   K   L   M   N   P   Q   R   S   W   Y
M  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
H  0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0
X  2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
F  0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
L  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
K  0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
R  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
P  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0
E  0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
N  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0
W  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
""")

        letters = IUPACData.protein_letters
        base_freq = 1.0 / len(letters)
        e_freq_table = {letter: base_freq for letter in letters}
        ic = s.information_content(e_freq_table=e_freq_table,
                                   chars_to_ignore=["-", "*"])
        self.assertAlmostEqual(ic, 133.061475107, places=6)
Example #6
    def __init__(self, alignment: MultipleSeqAlignment):
        """
        Constructor.

        alignment: MultipleSeqAlignment containing the alignment
        """
        self.tree = None  # Best tree
        self.trees = []  # Complete trees
        self.threshold = np.inf  # The score of the currently best tree

        # Create the dictionary (and list) of taxa
        self.leaves = []  # Leaves as clades

        # For every sequence in the alignment, create a corresponding leaf and its base sets.
        for sequence in alignment:

            # Create a list of sets containing bases
            base_sets = []
            for char in sequence.seq:
                base_sets.append({char})

            # Create leaf
            leaf = ParsimonyClade(None, sequence.id, sets=base_sets, score=0)
            self.leaves.append(leaf)

        self.size = len(self.leaves)  # Number of sequences (taxa)
        self.length = alignment.get_alignment_length()  # Length of each aligned sequence
Example #7
    def __init__(self, alignment: MultipleSeqAlignment, seed: int = 0):
        """
        Constructor.
        
        alignment: MultipleSeqAlignment containing the alignment
        """

        self.seed = seed
        np.random.seed(seed)

        self.size = len(alignment)  # Number of sequences (taxa)
        self.length = alignment.get_alignment_length()  # Length of each aligned sequence
        self.nr_of_bases = len(Base) + 1
        self.alignment = alignment
        self.threshold = 10e-5
        self.E = 10e-5

        # If the number of sequences isn't enough
        if self.size <= 0:
            raise ValueError("There aren't enough taxa.")

        # If there is only one taxon
        if self.size == 1:
            inner_clade = SOTAClade(None, None)
            first_cell = SOTAClade(None, alignment[0].id)
            inner_clade.clades.append(first_cell)
            self.tree = self.create_tree(inner_clade)

        # If there are only 2 taxa
        elif self.size == 2:
            inner_clade = SOTAClade(None, None)
            first_cell = SOTAClade(None, alignment[0].id)
            second_cell = SOTAClade(None, alignment[1].id)
            inner_clade.clades.append(first_cell)
            inner_clade.clades.append(second_cell)
            self.tree = self.create_tree(inner_clade)

        # In any other case
        else:
            # Sequences to classify. Dimension: number of taxa, number of different bases + 1, length of sequences
            self.S = np.zeros((self.size, self.nr_of_bases, self.length))
            self.names = []  # Names of animal species belonging to the sequences.

            # For every sequence in the alignment, do the corresponding encoding.
            for i in range(self.size):
                sequence = self.alignment[i]
                self.names.append(sequence.id)  # Store name

                for j in range(self.length):
                    char = sequence.seq[j]
                    try:
                        base = Base[char]
                        self.S[i][base.value, j] = 1
                    except KeyError:
                        self.S[i][self.nr_of_bases - 1, j] = 1
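The encoding loop above relies on a Base enum that maps nucleotide characters to row indices, which is not shown in the snippet; a tiny standalone sketch of the same one-hot idea, with a hypothetical Base definition:

import numpy as np
from enum import Enum

# Hypothetical Base enum; the original definition is not part of the snippet.
class Base(Enum):
    A = 0
    C = 1
    G = 2
    T = 3

seq = "ACGTN"
nr_of_bases = len(Base) + 1          # one extra row for unrecognized characters
S = np.zeros((nr_of_bases, len(seq)))

for j, char in enumerate(seq):
    try:
        S[Base[char].value, j] = 1   # known base: mark its row
    except KeyError:
        S[nr_of_bases - 1, j] = 1    # unknown character (e.g. N or '-')

print(S)  # one-hot encoding of a single sequence; the snippet stacks one such matrix per sequence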
Example #8
    def test_proteins(self):
        alpha = HasStopCodon(Gapped(generic_protein, "-"), "*")
        a = MultipleSeqAlignment([
                SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-", alpha), id="ID001"),
                SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*", alpha), id="ID002"),
                SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*", alpha), id="ID003")])
        self.assertEqual(32, a.get_alignment_length())

        s = SummaryInfo(a)

        c = s.dumb_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")

        c = s.gap_consensus(ambiguous="X")
        self.assertEqual(str(c), "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")

        m = s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
        self.assertEqual(str(m), """    A   D   E   F   G   H   I   K   L   M   N   P   Q   R   S   W   Y
M  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
H  0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0
X  2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
F  0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
L  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
K  0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
G  0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Q  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
I  0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
R  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0
S  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0
P  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0
E  0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Y  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0
X  0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
N  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0
W  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0
X  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
""")

        ic = s.information_content(chars_to_ignore=['-', '*'])
        self.assertAlmostEqual(ic, 133.061475107, places=6)
Example #9
 def split_alignment(clc, alignment, genelimit):
     """Split a multiple sequence alignment into a dict of sequences"""
     # genelimit: iterable of (gene, start, end) tuples with end-exclusive column indices
     sequences = {}
     if isinstance(alignment, dict):
         alignment = MSA(alignment.values())
     exp_len = alignment.get_alignment_length()
     for dt in genelimit:
         gene, start, end = dt
         sequences[gene] = alignment[:, start:end]
         exp_len -= sequences[gene].get_alignment_length()
     if exp_len != 0:
         raise ValueError("Could not split alignment, wrong gene delimiter")
     return sequences
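As a usage sketch of the splitting logic (the gene names and boundaries below are invented, the check is inlined rather than calling the method because its first clc argument suggests a classmethod on a class not shown, and MSA presumably aliases Bio.Align.MultipleSeqAlignment):

from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

aln = MultipleSeqAlignment([
    SeqRecord(Seq("ATGAAACCCGGG"), id="a"),
    SeqRecord(Seq("ATGAAACCCGGG"), id="b"),
])

# Hypothetical gene boundaries, end-exclusive, covering the whole alignment.
genelimit = [("geneA", 0, 6), ("geneB", 6, 12)]

sequences = {}
expected = aln.get_alignment_length()
for gene, start, end in genelimit:
    sequences[gene] = aln[:, start:end]                 # per-gene sub-alignment
    expected -= sequences[gene].get_alignment_length()
if expected != 0:
    raise ValueError("gene delimiters do not cover the alignment exactly")

print({g: s.get_alignment_length() for g, s in sequences.items()})  # {'geneA': 6, 'geneB': 6}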
Example #10
def maskalignment(arg, percent, percentmissing, filetype):
    maskedcolumn = open(
        arg.split('.')[0] + '_mask_' + str(percentmissing) + '.txt', 'w+')
    outFile = open(
        arg.split('.')[0] + '_masked_' + str(percentmissing) + '.fas', 'w+')
    checkgap = open(arg.split('.')[0] + '_missingcharacter.txt', 'w+')
    alignment = AlignIO.read(arg, filetype)
    trimAlign = MultipleSeqAlignment([])
    numRows = len(alignment)
    x = float(percent) * float(numRows) / 100.0
    numGap = numRows - float(x)
    numCol = alignment.get_alignment_length()

    print("Total number of rows: %i" % numRows)
    print("Number of gapped sequences allowed at a given site: %i" % numGap)
    print("Total number of columns: %i" % numCol)
    checkgap.write("Total number of rows: \t" + str(numRows) +
                   '\nNumber of gapped sequences allowed at a given site: \t' +
                   str(numGap) + '\n Total number of columns: \t' +
                   str(numCol) + '\n\n cutoff : \t' + str(x) + '\n\n\n')
    checkgap.write("Position \t Missing Characters \t Characters \n")
    my_array = {}
    colToKeep = []
    for i in range(numCol):
        #print i
        lineName = "line_" + str(i)
        my_array[lineName] = alignment[:, i]
        chapre = int(numRows) - int(my_array[lineName].count('-'))
        checkgap.write(
            str(i) + '\t' + str(my_array[lineName].count('-')) + '\t' +
            str(chapre) + '\n')
        if my_array[lineName].count('-') > numGap:
            print("get rid of column %i" % i)
            maskedcolumn.write(str(i) + '\n')
        else:
            colToKeep.append(i)

    for record in alignment:
        newseq = ""
        for i in colToKeep:
            newseq = newseq + (record[i])

        newRecord = SeqRecord(Seq(newseq), id=record.id)
        trimAlign.append(newRecord)
        outFile.write('>' + record.id + '\n' + newseq + '\n')

    print("Total number of columns remaining: %i" %
          trimAlign.get_alignment_length())
Example #11
def count_mismatches(path, align):
    """
    Calculate the number of mismatches along a path
    """
    names = [x[0] for x in path]
    sliced_align = MultipleSeqAlignment(
        [rec for rec in align if rec.name in names])
    i = 0
    j = 0
    for idx in range(sliced_align.get_alignment_length()):
        column = sliced_align[:, idx]
        M = [x for x in column if x != "-"]
        N = set(M)
        if len(M) > 1:
            i += 1
        if len(N) > 1:
            j += 1
    return i, j
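A small usage sketch, assuming the count_mismatches function above is available in the same module; the records and the (name, ...) path tuples are made up:

from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

align = MultipleSeqAlignment([
    SeqRecord(Seq("ACGT-A"), id="s1", name="s1"),
    SeqRecord(Seq("ACGTTA"), id="s2", name="s2"),
    SeqRecord(Seq("ACGTTC"), id="s3", name="s3"),
])

# Only the first element of each path entry (the record name) is used.
path = [("s1", None), ("s3", None)]

covered, mismatched = count_mismatches(path, align)
print(covered, mismatched)  # 5 1: columns with >1 non-gap base, and with >1 distinct base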
Example #12
def remove_invariant_sites(input_align):
    """
    removes invariant sites from an alignment
    :param input_align: Seq object = fasta of SNPs
    :return: cleaned fasta
    """
    inv = [
        SeqRecord(Seq('', s.seq.alphabet), id=s.id, description=s.description)
        for s in input_align
    ]
    inv = MultipleSeqAlignment(inv)
    print("input alignment has %i columns" %
          input_align.get_alignment_length())

    for i in range(input_align.get_alignment_length()):
        if not is_invariant(input_align[:, i]):
            # keep this variant column by concatenating the single-column slice input_align[:, i:i+1]
            inv = inv + input_align[:, i:i + 1]

    print("edited alignment has %i columns" % inv.get_alignment_length())
    return inv
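The is_invariant helper is not shown in this snippet; one plausible minimal version (treating gaps and N as missing data is my assumption) could look like:

def is_invariant(column, missing="-Nn"):
    # NOTE: assumed helper; the original may define missing data differently.
    # A column is invariant if all non-missing characters agree.
    observed = {c.upper() for c in column if c not in missing}
    return len(observed) <= 1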
Example #13
def get_validation_label(domain_sid1: str, domain_sid2: str, aligns,
                         pssm_dir: str):
    pssm1 = parse_pssm(f'{pssm_dir}/{domain_sid1[2:4]}/{domain_sid1}.mtx')
    pssm2 = parse_pssm(f'{pssm_dir}/{domain_sid2[2:4]}/{domain_sid2}.mtx')
    msa = MultipleSeqAlignment([
        aligns[f'{domain_sid1}&{domain_sid2}'],
        aligns[f'{domain_sid2}&{domain_sid1}']
    ])
    assert len(pssm1.pssm) == len(msa[0].seq.ungap('-')) and len(
        pssm2.pssm) == len(msa[1].seq.ungap('-'))
    Y = np.zeros((len(pssm1.pssm), len(pssm2.pssm)), dtype=np.int8)
    x, y = 0, 0
    for i in range(msa.get_alignment_length()):
        if msa[0][i] == "-":
            y += 1
        elif msa[1][i] == "-":
            x += 1
        else:
            Y[x, y] = 1
            x += 1
            y += 1

    return Y
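The x/y bookkeeping above maps alignment columns back to ungapped residue indices in each sequence; a toy, self-contained illustration of the same walk (the aligned strings are made up):

import numpy as np

seq1_aln = "AC-GT"   # '-' marks a gap in sequence 1
seq2_aln = "ACTG-"   # '-' marks a gap in sequence 2

len1 = len(seq1_aln.replace("-", ""))   # 4 ungapped residues in sequence 1
len2 = len(seq2_aln.replace("-", ""))   # 4 ungapped residues in sequence 2
Y = np.zeros((len1, len2), dtype=np.int8)

x, y = 0, 0
for a, b in zip(seq1_aln, seq2_aln):
    if a == "-":
        y += 1          # residue present only in sequence 2: advance its index
    elif b == "-":
        x += 1          # residue present only in sequence 1: advance its index
    else:
        Y[x, y] = 1     # both present: mark the aligned residue pair
        x += 1
        y += 1

print(Y)  # ones at (0,0), (1,1) and (2,3)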
Example #14
def make_consensus(path, align):
    """
    Concatenate sequences in path.
    Differences are resolved by taking the majority nucleotide within the column.
    """
    #  Alignment of sequences in the path
    path_aln = MultipleSeqAlignment([rec for rec in align if rec.name in path])

    # Alignment of sequences NOT in the path
    no_path_aln = MultipleSeqAlignment(
        [rec for rec in align if rec.name not in path])

    consensus_sequence = ""
    for idx in range(path_aln.get_alignment_length()):
        path_col = path_aln[:, idx]
        ambiguity = set([x for x in path_col if x != "-"])
        if len(ambiguity) > 1:
            no_path_col = [x for x in no_path_aln[:, idx] if x != "-"]
            if no_path_col:
                c = collections.Counter(sorted(no_path_col))
                fq = sorted(
                    [(x, c[x] / len(no_path_col)) for x in c.keys()],
                    key=lambda x: x[1],
                    reverse=True,
                )

                shared_major_base = set([x[0] for x in fq if x[1] > 0.25
                                         ]).intersection(ambiguity)
                ambiguity_up = set([x.upper() for x in ambiguity])

                #  Case 1: exactly one of the two variants is among the dominant
                #    bases; the ambiguity is resolved by using that base
                if shared_major_base and len(shared_major_base) == 1:
                    consensus_sequence += list(shared_major_base)[0]

                #  Case 2: none of the variants, or more than one of them, is
                #   among the dominant bases; append the IUPAC ambiguity code
                #   for the observed set
                elif ambiguity_up in [
                        set(["A", "G"]),
                        set(["C", "T"]),
                        set(["G", "C"]),
                        set(["A", "T"]),
                        set(["G", "T"]),
                        set(["A", "C"]),
                        set(["C", "G", "T"]),
                        set(["A", "G", "T"]),
                        set(["A", "C", "T"]),
                        set(["A", "C", "G"]),
                        set(["A", "T", "C", "G"]),
                ]:
                    iupac_code = {
                        ("A", "G"): "R",
                        ("C", "T"): "Y",
                        ("G", "C"): "S",
                        ("A", "T"): "W",
                        ("G", "T"): "K",
                        ("A", "C"): "M",
                        ("C", "G", "T"): "B",
                        ("A", "G", "T"): "D",
                        ("A", "C", "T"): "H",
                        ("A", "C", "G"): "V",
                        ("A", "T", "C", "G"): "N",
                    }
                    iupac_ambiguity = [
                        code for bases, code in iupac_code.items()
                        if set(bases) == ambiguity_up
                    ][0]
                    consensus_sequence += iupac_ambiguity

                # Case 3 the alleles already contain ambiguities
                else:
                    consensus_sequence += "N"
        else:
            if [x for x in path_col if x != "-"]:
                consensus_sequence += [x for x in path_col
                                       if x != "-"][0].replace("N", "-")
            else:
                consensus_sequence += "-"
    new_name = path[0].split("|")[0] + "|" + "|".join(
        [x.split("|")[1] for x in path])

    return (new_name, consensus_sequence)
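The IUPAC lookup keys on tuples of bases, so the observed ambiguity set has to match one of those tuples exactly; a tiny standalone check of that step, reusing the same mapping as in the function above:

iupac_code = {
    ("A", "G"): "R", ("C", "T"): "Y", ("G", "C"): "S", ("A", "T"): "W",
    ("G", "T"): "K", ("A", "C"): "M", ("C", "G", "T"): "B",
    ("A", "G", "T"): "D", ("A", "C", "T"): "H", ("A", "C", "G"): "V",
    ("A", "T", "C", "G"): "N",
}

def resolve(ambiguity):
    # Return the IUPAC code whose base set equals the observed set, else 'N'.
    for bases, code in iupac_code.items():
        if set(bases) == set(ambiguity):
            return code
    return "N"

print(resolve({"A", "G"}))       # R
print(resolve({"A", "G", "T"}))  # D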
Example #15

# In[12]:

xcount = [str(p.seq).count('X') for p in pralign]
xlist = list(set(xcount))
xfreq = [xcount.count(x) for x in xlist]

# In[32]:

# Exclude sequences
trimalign2 = [trimalign[i,:] for i in range(len(pralign)) if xcount[i] <= xthresh]
trimalign2 = MultipleSeqAlignment(records=trimalign2,alphabet=Gapped(IUPACAmbiguousDNA(),"-"))
#trimalign2 = trimalign

# In[46]:

# Count gaps, then go in reverse until the threshold is first crossed
ta2l = trimalign2.get_alignment_length()
ta2n = len(trimalign2)
gapcount = [trimalign2[:,i].count("-") for i in range(ta2l)]
for i in reversed(range(ta2l)):
    if ta2n-gapcount[i] >= round(ta2n*gapthresh):
        break
trimalign3 = trimalign2[:,:(i+1)]


# In[47]:

AlignIO.write(trimalign3,ofn,"fasta")
Example #16
Given a multiple sequence alignment, calculate the identity for each pair of
sequences.
"""
import argparse
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("alignment", help="Multiple sequence alignment input in FASTA format")
    args = parser.parse_args()

    # Load the multiple sequence alignment and sort records by name in ascending
    # order.
    original_alignment = AlignIO.read(args.alignment, "fasta")
    alignment = MultipleSeqAlignment(sorted([record for record in original_alignment], key=lambda record: record.name))

    for sequence_j in alignment:
        for sequence_k in alignment:
            # SeqRecord does not define equality, so compare object identity.
            if sequence_j is not sequence_k:
                total = 0
                matches = 0
                for i in range(alignment.get_alignment_length()):
                    total += 1

                    if sequence_j[i].upper() == sequence_k[i].upper():
                        matches += 1

                print("\t".join((sequence_j.name, sequence_k.name, str(matches / float(total)))))
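A note on the double loop above: it visits every ordered pair, so each identity is computed twice; a sketch of the same computation using itertools.combinations (function and variable names here are my own) does each unordered pair once:

import itertools

def pairwise_identity(alignment):
    # Yield (name1, name2, fraction of identical columns) per unordered pair.
    length = alignment.get_alignment_length()
    for rec_j, rec_k in itertools.combinations(alignment, 2):
        matches = sum(
            1 for i in range(length)
            if rec_j[i].upper() == rec_k[i].upper()
        )
        yield rec_j.name, rec_k.name, matches / length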
Example #18
    # Load the multiple sequence alignment and sort records by name in ascending
    # order.
    original_alignment = AlignIO.read(args.alignment, "fasta")
    alignment = MultipleSeqAlignment(
        sorted([record for record in original_alignment],
               key=lambda record: record.name))

    index_by_column_type = {}
    current_index = 0

    with open(args.classified_alignment_positions, "w") as fh:
        writer = csv.writer(fh, delimiter="\t", lineterminator="\n")
        writer.writerow(("position", "column_type", "bases"))

        for i in range(alignment.get_alignment_length()):
            # First enumerate bases in the given column to determine the column's
            # "type" (e.g., all bases are the same, all bases are different, etc.).
            enumerated_bases = enumerate_bases(
                [alignment[j][i] for j in range(len(alignment))])

            # Then enumerate this type of column in the context of the alignment
            # such that each column type gets its own integer that summarizes that
            # alignment position.
            if enumerated_bases not in index_by_column_type:
                index_by_column_type[enumerated_bases] = current_index
                current_index += 1

            if types is None or enumerated_bases in types:
                writer.writerow((i, index_by_column_type[enumerated_bases],
                                 enumerated_bases))
Example #19
class sequence_set(object):
    """sequence_set subsamples a set of sequences, aligns them and exports variability statistics"""
    def __init__(self, fname, reference= None, **kwarks):
        super(sequence_set, self).__init__()
        self.nthreads = 2
        if os.path.isfile(fname):
            with myopen(fname) as seq_file:
                self.raw_seqs = {fix_names(x.description):x for x in SeqIO.parse(seq_file, 'fasta')}
                for x in self.raw_seqs.values():
                    x.id = fix_names(x.id)
                    x.name = fix_names(x.id)
                    x.description = fix_names(x.description)
        if 'run_dir' not in kwarks:
            import random
            self.run_dir = '_'.join(['temp', time.strftime('%Y%m%d-%H%M%S',time.gmtime()), str(random.randint(0,1000000))])
        else:
            self.run_dir = kwarks['run_dir']

        if reference is not None:
            if type(reference) is str and fix_names(reference) in self.raw_seqs:
                self.reference = self.raw_seqs[fix_names(reference)]
            else:
                self.reference = reference
        else: self.reference=None

    def parse(self, fields, sep='|', strip='_'):
        '''
        split the sequence description and add annotations to sequences
        '''
        for seq in self.raw_seqs.values():
            if not hasattr(seq, "attributes"): seq.attributes = {}
            words = map(lambda x:x.strip(strip),seq.description.replace(">","").split(sep))
            for ii, val in enumerate(words):
                if ii in fields:
                    if val not in ["", "-"]:
                        seq.attributes[fields[ii]] = val
                    else:
                        seq.attributes[fields[ii]] = ""

    def ungap(self):
        '''
        remove previously existing gaps and make sure all sequences are upper case
        '''
        for seq in self.raw_seqs.values():
            seq.seq = seq.seq.ungap('-').upper()

    def parse_date(self, fmts, prune=True):
        if not hasattr(self.raw_seqs.values()[0], "attributes"):
            print("parse meta info first")
            return
        from datetime import datetime
        for seq in self.raw_seqs.values():
            if 'date' in seq.attributes and seq.attributes['date']!='':
                for fmt in fmts:
                    try:
                        if callable(fmt):
                            tmp = fmt(seq.attributes['date'])
                        else:
                            tmp = datetime.strptime(seq.attributes['date'], fmt).date()
                        seq.attributes['raw_date'] = seq.attributes['date']
                        seq.attributes['num_date'] = num_date(tmp)
                        seq.attributes['date']=tmp
                        break
                    except:
                        continue

        if prune:
            self.raw_seqs = {k:v for k,v in self.raw_seqs.iteritems()
                            if 'date' in v.attributes and type(v.attributes['date'])!=str}

    def filter(self, func):
        self.raw_seqs = {key:seq for key, seq in self.raw_seqs.iteritems() if func(seq)}

    def clock_filter(self, root_seq=None, n_iqd=3, max_gaps = 1.0, plot=False):
        '''
        remove sequences from the set that evolve much faster or slower
        than the majority. Regions with predominantly gaps can be excluded since
        they can skew the evolutionary rates.
        '''
        from Bio.Align import MultipleSeqAlignment
        if root_seq is None: # use consensus
            af = calc_af(self.aln, nuc_alpha)
            root_seq = np.fromstring(nuc_alpha, 'S1')[af.argmax(axis=0)]
        if type(root_seq)==str and root_seq in self.sequence_lookup:
            root_seq = np.array(self.sequence_lookup[root_seq])
        if max_gaps<1.0:
            af=calc_af(self.aln, nuc_alpha)
            good_pos = af[nuc_alpha.index('-')]<max_gaps
        else:
            good_pos = np.ones(self.aln.get_alignment_length(), dtype=bool)
        date_vs_distance = {}
        for seq in self.aln:
            date_vs_distance[seq.id] = (seq.attributes['num_date'],
                np.mean((np.array(seq)!=root_seq)[(np.array(seq)!='-')&(root_seq!='-')&good_pos]))
        date_vs_distance_array=np.array(date_vs_distance.values())
        from scipy.stats import linregress, scoreatpercentile
        slope, intercept, rval, pval, stderr = linregress(date_vs_distance_array[:,0], date_vs_distance_array[:,1])
        print("distance vs time regression:",slope)
        residuals = (intercept + slope*date_vs_distance_array[:,0]) - date_vs_distance_array[:,1]
        IQD = scoreatpercentile(residuals, 75) - scoreatpercentile(residuals,25)
        if plot:
            import matplotlib.pyplot as plt
            plt.ion()
            plt.scatter(date_vs_distance_array[:,0], date_vs_distance_array[:,1], c='g')
            bad_points = abs(intercept+slope*date_vs_distance_array[:,0] - date_vs_distance_array[:,1])>n_iqd*IQD
            plt.scatter(date_vs_distance_array[bad_points,0], date_vs_distance_array[bad_points,1], c='r')


        print("before clock filter:",len(self.aln))
        self.aln = MultipleSeqAlignment([seq for seq in self.aln
                    if abs(intercept+slope*date_vs_distance[seq.id][0] - date_vs_distance[seq.id][1])<n_iqd*IQD])
        print("after clock filter:",len(self.aln))

    def subsample(self, category=None, priority=None, threshold=None, repeated=False):
        '''
        produce a useful set of sequences from the raw input.
        arguments:
        category  -- callable that assigns each sequence to a category for subsampling
        priority  -- callable that assigns each sequence a priority to be included in
                     the final sample. this is applied independently in each category
        threshold -- callable that determines the number of sequences from each category
                     that is included in the final set; it is called with a (category,
                     list of sequences) tuple. Alternatively it can be an int.
        '''
        if category is None:
            category = lambda x:(x.attributes['date'].year, x.attributes['date'].month)
        if priority is None:
            priority = lambda x:np.random.random()
        if threshold is None:
            threshold = lambda x:5
        elif type(threshold) is int:
            print("using threshold:",threshold)
            tmp = threshold
            threshold = lambda x:tmp

        self.sequence_categories = defaultdict(list)
        if repeated:
            seqs_to_subsample = self.seqs.values()
        else:
            seqs_to_subsample = self.raw_seqs.values()

        for seq in seqs_to_subsample:
            seq._priority = priority(seq)
            self.sequence_categories[category(seq)].append(seq)

        self.seqs = {}
        for cat, seqs in self.sequence_categories.iteritems():
            seqs.sort(key=lambda x:x._priority, reverse=True)
            self.seqs.update({seq.id:seq for seq in seqs[:threshold( (cat, seqs) )]})

        if self.reference.id not in self.seqs:
            self.seqs[self.reference.id] = self.reference

    def align(self):
        from Bio import AlignIO
        make_dir(self.run_dir)
        os.chdir(self.run_dir)

        SeqIO.write(self.seqs.values(), "temp_in.fasta", "fasta")
        os.system("mafft --anysymbol --thread " + str(self.nthreads) + " temp_in.fasta > temp_out.fasta")

        self.aln = AlignIO.read('temp_out.fasta', 'fasta')
        self.sequence_lookup = {seq.id:seq for seq in self.aln}
        self.reference_aligned = self.sequence_lookup[self.reference.id]
        # add attributes to alignment
        for seqid, seq in self.seqs.iteritems():
            self.sequence_lookup[seqid].attributes = seq.attributes
        os.chdir('..')
        remove_dir(self.run_dir)

    def codon_align(self, alignment_tool="mafft", prune=True):
        ''' takes a nucleotide alignment, translates it, aligns the amino acids, pads the gaps
        note that this suppresses any compensated frameshift mutations

        Parameters:
        - alignment_tool: ['mafft', 'muscle'] the commandline tool to use
        '''
        from Bio import AlignIO,SeqIO
        from Bio.SeqRecord import SeqRecord
        make_dir(self.run_dir)
        os.chdir(self.run_dir)

        # translate
        aa_seqs = {}
        for seq in self.seqs.values():
            tempseq = seq.seq.translate()
            # use only sequences that translate without trouble
            if '*' not in str(tempseq)[:-1] or prune==False:
                aa_seqs[seq.id]=SeqRecord(tempseq,id=seq.id)
            else:
                print(seq.id,"has premature stops, discarding")

        tmpfname = 'temp_in.fasta'
        SeqIO.write(aa_seqs.values(), tmpfname,'fasta')

        if alignment_tool=='muscle':
            from Bio.Align.Applications import MuscleCommandline
            cline = MuscleCommandline(input=tmpfname, out=tmpfname[:-5]+'aligned.fasta')
            cline()
            aln_aa = AlignIO.read(tmpfname[:-5]+'aligned.fasta', "fasta")
        elif alignment_tool=='mafft':
            from Bio.Align.Applications import MafftCommandline
            from StringIO import StringIO
            mafft_cline = MafftCommandline(input=tmpfname)
            stdout, stderr = mafft_cline()
            aln_aa = AlignIO.read(StringIO(stdout), "fasta")
        else:
            print('Alignment tool not supported:',alignment_tool)
            return

        #generate nucleotide alignment
        self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
        self.sequence_lookup = {seq.id:seq for seq in self.aln}
        self.reference_aligned = self.sequence_lookup[self.reference.id]
        # add attributes to alignment
        for seq in self.seqs.values():
            if seq.id in self.sequence_lookup:
                self.sequence_lookup[seq.id].attributes = seq.attributes
        os.chdir('..')
        remove_dir(self.run_dir)


    def strip_non_reference(self):
        ungapped = np.array(self.reference_aligned)!='-'
        from Bio.Seq import Seq
        for seq in self.aln:
            seq.seq = Seq("".join(np.array(seq)[ungapped]))

    def diversity_statistics(self):
        ''' calculate alignment entropy of nucleotide and optionally protein alignments '''
        if not hasattr(self, "aln"):
            print("calculate alignment first")
            return
        aln_array = np.array(self.aln)
        self.af = {'nuc': calc_af(self.aln, nuc_alpha)}
        tmp_af = self.af['nuc'][:-2]/self.af['nuc'][:-2].sum(axis=0)
        self.entropy ={'nuc': -(tmp_af*np.log(tmp_af+TINY)).sum(axis=0)}

        if hasattr(self, "translations"):
            for prot, aln in self.translations.iteritems():
                self.af[prot] = calc_af(aln, aa_alpha)
                tmp_af = self.af[prot][:-2]/self.af[prot][:-2].sum(axis=0)
                self.entropy[prot] = -(tmp_af*np.log(tmp_af+TINY)).sum(axis=0)

    def translate(self, proteins=None):
        from Bio.SeqFeature import FeatureLocation
        from Bio.Seq import Seq
        from Bio.Align import MultipleSeqAlignment
        if not hasattr(self, "proteins"): # generate dictionaries to hold annotation and translation
            self.translations={}
            self.proteins={}

        if proteins is None: # add a default translation of the entire sequence unless otherwise specified
            self.proteins.update({'cds':FeatureLocation(start=0, end=self.aln.get_alignment_length(), strand=1)})
        else:
            self.proteins.update(proteins)

        for prot in self.proteins:
            aa_seqs = []
            for seq in self.aln:
                try:
                    # soon not needed as future biopython version will translate --- into -
                    tmpseq = self.proteins[prot].extract(seq)
                    tmpseq.attributes = seq.attributes
                    tmpseq.seq = Seq(str(Seq(str(tmpseq.seq).replace('---', 'NNN')).translate()).replace('X','-'))
                except:
                    tmpseq.seq = Seq(str(Seq("".join([x if x in 'ACGT' else 'N' for x in str(tmpseq.seq)])).translate()).replace('X','-'))
                    print("Trouble translating",seq.id)
                    #import ipdb; ipdb.set_trace()
                aa_seqs.append(tmpseq)
            self.translations[prot] = MultipleSeqAlignment(aa_seqs)

    def export_diversity(self, fname = 'entropy.json'):
        if not hasattr(self, "entropy"):
            self.diversity_statistics()
        entropy_json = {}
        for feat in self.entropy:
            S = [max(0,round(x,4)) for x in self.entropy[feat]]
            n = len(S)
            if feat=='nuc':
                entropy_json[feat] = {'pos':range(0,n), 'codon':[x//3 for x in range(0,n)], 'val':S}
            else:
                entropy_json[feat] = {'pos':[x for x in self.proteins[feat]][::3],
                                      'codon':[(x-self.proteins[feat].start)//3 for x in self.proteins[feat]][::3], 'val':S}
        write_json(entropy_json, fname, indent=None)
Example #20
 def test_empty_alignment(self):
     """Very simple tests on an empty alignment."""
     alignment = MultipleSeqAlignment([])
     self.assertEqual(alignment.get_alignment_length(), 0)
     self.assertEqual(len(alignment), 0)
Example #21
                site.count('N') + site.count('n') +
                site.count('-'))  #	count gaps and missing data
            pcGap_s = nGap_s / nSeq * 100
            if pcGap_s > maxpcGap_s:
                delsites.append(i)
            #	if proportion of seqs in the column with missing data is above threshold, delete column

        aln_a = np.delete(aln_a, delsites, 1)

        c = 0
        for current_seq in aln:
            filt1_aln.add_sequence(current_seq.id,
                                   ''.join(map(str, list(aln_a[c, ]))))
            c += 1

        length = filt1_aln.get_alignment_length()
        # if the alignment length after the column-wise filter is above the threshold, continue
        if length >= minLen:
            filt2_aln = MultipleSeqAlignment([])
            for current_seq in filt1_aln:  #	for each sequence
                seq = str(current_seq.seq)
                nGap = float(seq.count('N') + seq.count('n') + seq.count('-'))
                pcGap = nGap / length * 100
                if pcGap < maxpcGap:  #	if proportion of missing data in seq is below threshold, print to filtered alignment
                    filt2_aln.add_sequence(current_seq.id,
                                           str(current_seq.seq))
            filt3_aln = MultipleSeqAlignment([])
            for current_seq in filt2_aln:  #	for each sequence
                seq = str(current_seq.seq)
                nHet = float(
                    seq.count('W') + seq.count('w') + seq.count('S') +
                    seq.count('s') + seq.count('M') + seq.count('m') +
Example #22
                              given_alpha).next()
            assert False, "Forcing wrong alphabet, %s, should fail (%s)" \
                   % (repr(given_alpha), t_filename)
        except ValueError:
            pass
    del good, bad, given_alpha, base_alpha

    if t_alignment:
        print "Testing reading %s format file %s as an alignment" \
              % (t_format, t_filename)

        alignment = MultipleSeqAlignment(SeqIO.parse( \
                    handle=open(t_filename,mode), format=t_format))
        assert len(alignment) == t_count

        alignment_len = alignment.get_alignment_length()

        #Check the record order agrees, and double check the
        #sequence lengths all agree too.
        for i in range(t_count):
            assert compare_record(records[i], alignment[i])
            assert len(records[i].seq) == alignment_len

        print alignment_summary(alignment)

    #Some alignment file formats have magic characters which mean
    #use the letter in this position in the first sequence.
    #They should all have been converted by the parser, but if
    #not reversing the record order might expose an error.  Maybe.
    records.reverse()
    check_simple_write_read(records)
Example #23
import os

# biopython
from Bio import Alphabet
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.Align import AlignInfo
from Bio import AlignIO
from Bio.SubsMat import FreqTable
from Bio.Align import MultipleSeqAlignment

# Very simple tests on an empty alignment
alignment = MultipleSeqAlignment([], Alphabet.generic_alphabet)
assert alignment.get_alignment_length() == 0
assert len(alignment) == 0
del alignment

# Basic tests on simple three string alignment
alignment = MultipleSeqAlignment([], Alphabet.generic_alphabet)
letters = "AbcDefGhiJklMnoPqrStuVwxYz"
alignment.append(SeqRecord(Seq(letters), id="mixed"))
alignment.append(SeqRecord(Seq(letters.lower()), id="lower"))
alignment.append(SeqRecord(Seq(letters.upper()), id="upper"))
assert alignment.get_alignment_length() == 26
assert len(alignment) == 3
assert str(alignment[0].seq) == letters
assert str(alignment[1].seq) == letters.lower()
assert str(alignment[2].seq) == letters.upper()
assert alignment[0].id == "mixed"
Example #24
class DisplayedAlignment(object):
    """
    Provides tools for displaying and manipulating an alignment and storing all previous versions
    """
    def __init__(self, alignment):
        self.displayedColumn = 0
        self.alignment = alignment
        self.alignmentHistory = [alignment[:, :]]
        self.changed = False
        self.translated = False
        self.translationTable = 1

    def ParseIndex(self, text):
        """
        Parses a text string specifying a range of rows (taxa) and columns.  Expects the text to be in the format used to specify a range from a Bio.Align.MultipleSeqAlignment.  Returns indices for the start and stop taxon and the start and stop columns
        """
        taxonStart = 0
        taxonStop = len(self.alignment) - 1
        columnStart = 0
        columnStop = self.alignment.get_alignment_length() - 1
        if (',' not in text):
            self.AlertMessage(
                'Invalid index format.  (taxa or columns missing)', 'high')
            return (-1, -1, -1, -1)
        else:
            text = text.strip()
            indices = text.split(',')
            if (len(indices) > 2):
                self.AlertMessage('Invalid index format.  (too many fields)',
                                  'high')
                return (-1, -1, -1, -1)
            else:
                if (':' in indices[0]
                    ):  #there is a range specified in the taxon index
                    taxonIndices = indices[0].split(':')
                    if (taxonIndices[0]):  #a start taxon is specified
                        try:
                            taxonStart = int(taxonIndices[0].strip())
                        except:
                            self.AlertMessage(
                                'Invalid index format. (taxon start index not an integer)',
                                'high')
                            return (-1, -1, -1, -1)
                    if (taxonIndices[1]):  #a stop taxon is specified
                        try:
                            taxonStop = int(taxonIndices[1].strip())
                        except:
                            self.AlertMessage(
                                'Invalid index format. (taxon stop index not an integer)',
                                'high')
                            return (-1, -1, -1, -1)
                elif (indices[0]):  #a single taxon is specified
                    try:
                        taxonStart = int(indices[0].strip())
                        taxonStop = int(indices[0].strip())
                    except:
                        self.AlertMessage(
                            'Invalid index format. (taxon start or stop index not an integer)',
                            'high')
                        return (-1, -1, -1, -1)
                if (':' in indices[1]
                    ):  #there is a range specified in the column index
                    columnIndices = indices[1].split(':')
                    if (columnIndices[0]):  #a start column is specified
                        try:
                            columnStart = int(columnIndices[0].strip())
                        except:
                            self.AlertMessage(
                                'Invalid index format. (column start index not an integer)',
                                'high')
                            return (-1, -1, -1, -1)
                    if (columnIndices[1]):  #a stop column is specified
                        try:
                            columnStop = int(columnIndices[1].strip())
                        except:
                            self.AlertMessage(
                                'Invalid index format. (column stop index not an integer)',
                                'high')
                            return (-1, -1, -1, -1)
                elif (indices[1]):  #a single column is specified
                    try:
                        columnStart = int(indices[1].strip())
                        columnStop = int(indices[1].strip())
                    except:
                        self.AlertMessage(
                            'Invalid index format. (column start or stop index not an integer)',
                            'high')
                        return (-1, -1, -1, -1)
                if ((0 <= taxonStart <= taxonStop) &
                    (0 <= columnStart <= columnStop)):
                    return (taxonStart, taxonStop, columnStart, columnStop)
                else:
                    self.AlertMessage(
                        'Invalid index range. (start > stop or index < 0)',
                        'high')
                    return (-1, -1, -1, -1)

    def ColorizeDNA(self, text):
        """
        Colorizes output based on nucleotide
        """
        if (text == 'A'):
            escape = '\033[92m'  # Green
        elif (text == 'G'):
            escape = '\033[93m'  # Yellow
        elif (text == 'T'):
            escape = '\033[91m'  # Red
        elif (text == 'C'):
            escape = '\033[96m'  # Blue
        else:
            return text
        return escape + text + '\033[0m'

    def ColorizeAA(self, text):
        """
        Colorize output based on amino acid polarity or nonpolarity
        """
        if (text in ['A', 'F', 'H', 'I', 'K', 'L', 'M', 'P', 'R', 'V', 'W']):
            escape = '\033[91m'  # Red
        elif (text in ['C', 'G', 'N', 'Q', 'S', 'T', 'Y', 'B', 'Z']):
            escape = '\033[96m'  # Blue
        elif (text in ['D', 'E']):
            escape = '\033[92m'  # Green
        elif (text in ['X', '*']):
            escape = '\033[93m'  # Yellow
        else:
            return text
        return escape + text + '\033[0m'

    def AlertMessage(self, text, severity='low'):
        """
        Display an alert message with a tag and color corresponding to the severity of the alert ('low', 'medium', 'high')
        """
        if (severity == 'high'):
            escape = '\033[91m'  # Red
            tag = '!!!'
        elif (severity == 'medium'):
            escape = '\033[93m'  # Yellow
            tag = '***'
        else:
            escape = '\033[92m'  # Green
            tag = '   '
        print escape + tag, text, tag + '\033[0m'

    def Show(self, column=0):
        """
        Displays 100 columns of the alignment beginning at 'column'
        """
        if column < 0:
            column = 0
        row = 0
        marker = '|    :    ' * 10
        spacer = ' ' * 15
        markerRow = spacer + marker
        if (self.translated == False):
            indexRow = spacer
            for index in range(column, column + 100, 10):
                indexRow = indexRow + str(index).ljust(10)
            print indexRow
            print markerRow
            for sequence in self.alignment[:, column:column + 100]:
                print '%2d) %10s' % (row, sequence.id),
                dnaSequence = ''
                for nucleotide in str(sequence.seq):
                    dnaSequence += self.ColorizeDNA(nucleotide)
                print dnaSequence,
                if (column + 100 < self.alignment.get_alignment_length()):
                    print '...'
                else:
                    print
                row += 1
            print markerRow
            print indexRow
        else:
            indexRow = spacer
            for index in range(column / 3, (column / 3) + 100, 10):
                indexRow = indexRow + str(index).ljust(10)
            print indexRow
            print markerRow
            for sequence in self.alignment[:, column:column + 300]:
                proteinSequence = ''
                for codonPosition in range(0, len(sequence), 3):
                    codon = sequence.seq[codonPosition:codonPosition + 3]
                    if (str(codon) == '---'):
                        proteinSequence += '-'
                    elif ('-' in codon):
                        proteinSequence += '?'
                    else:
                        proteinSequence += self.ColorizeAA(
                            str(codon.translate(table=self.translationTable)))
                print '%2d) %10s %s' % (row, sequence.id, proteinSequence),
                if (column + 300 < self.alignment.get_alignment_length()):
                    print '...'
                else:
                    print
                row += 1
            print markerRow
            print indexRow
        self.displayedColumn = column

    def BackupAlignment(self):
        """
        Stores the current alignment state to the alignment change history
        """
        self.alignmentHistory.append(self.alignment[:, :])

    def UndoChanges(self):
        """
        Reverts to the previous state in the alignment change history.  Does not affect which column index is displayed or whether the sequence is displayed as translated or not, since those are not changes to the alignment.
        """
        if (len(self.alignmentHistory) > 1):
            self.alignmentHistory.pop()
            self.alignment = self.alignmentHistory[-1][:, :]
            self.Show(self.displayedColumn)
        else:
            self.AlertMessage('Nothing to undo.', 'low')

    def DeleteRange(self, rangeText, silent=False):
        """
        Removes a row and column range from the alignment
        """
        startTaxon, stopTaxon, startColumn, stopColumn = self.ParseIndex(
            rangeText)
        if (self.translated == True):
            startColumn = startColumn * 3
            stopColumn = (stopColumn * 3) + 2
        if (startTaxon >= 0):  #Make sure we had a valid range
            changeLength = 0
            deleteTaxon = False
            if ((startColumn == 0) &
                (stopColumn == len(self.alignment[0]) - 1)):
                deleteTaxon = True
            if ((startTaxon > 0) | (stopTaxon < len(self.alignment) - 1)):
                changeLength = (stopColumn - startColumn) + 1
            taxon = 0
            newSequences = []
            for Sequence in self.alignment:
                if (taxon in range(startTaxon, stopTaxon + 1)):
                    if (not deleteTaxon):
                        if (startColumn > 0):
                            Sequence.seq = Sequence.seq[:
                                                        startColumn] + Sequence.seq[
                                                            stopColumn + 1:]
                        else:
                            Sequence.seq = Sequence.seq[stopColumn + 1:]
                        if (changeLength):
                            Sequence.seq = Sequence.seq + Seq(
                                '-' * changeLength)
                        newSequences.append(Sequence)
                else:
                    newSequences.append(Sequence)
                taxon += 1
            self.alignment = MultipleSeqAlignment(newSequences)
            if (not silent):
                self.Show(self.displayedColumn)
                self.BackupAlignment()

    def ModifyRange(self, rangeText, nucleotide='-'):
        """
        Changes the nucleotides in a row and column range to a specified nucleotide.  Has no effect when the alignment is translated since the corresponding change to the underlying nucleotide alignment would be ambiguous at best.
        """
        nucleotide = nucleotide.upper()
        if (self.translated == True):
            self.AlertMessage("Can't modify protein sequences.", 'medium')
        elif (nucleotide not in [
                'A', 'G', 'C', 'T', 'R', 'K', 'S', 'W', 'M', 'Y', 'D', 'V',
                'B', 'H', 'N', '-'
        ]):
            self.AlertMessage(
                'Invalid nucleotide.  (only AGTC- and IUB nucleotide codes are permitted)',
                'high')
        else:
            startTaxon, stopTaxon, startColumn, stopColumn = self.ParseIndex(
                rangeText)
            if (startTaxon >= 0):  #Make sure we have a valid range
                taxon = 0
                newSequences = []
                modificationLength = (stopColumn - startColumn) + 1
                for Sequence in self.alignment:
                    if (taxon in range(startTaxon, stopTaxon + 1)):
                        if (startColumn > 0):
                            Sequence.seq = Sequence.seq[:startColumn] + Seq(
                                nucleotide *
                                modificationLength) + Sequence.seq[stopColumn +
                                                                   1:]
                        else:
                            Sequence.seq = Seq(
                                nucleotide *
                                modificationLength) + Sequence.seq[stopColumn +
                                                                   1:]
                    newSequences.append(Sequence)
                    taxon += 1
                self.alignment = MultipleSeqAlignment(newSequences)
                self.Show(self.displayedColumn)
                self.BackupAlignment()

    def InsertRange(self, rangeText):
        """
        Inserts a row and column range into the alignment and fills it with gaps ('-' for nucleotides or '---' for translated alignments)
        """
        startTaxon, stopTaxon, startColumn, stopColumn = self.ParseIndex(
            rangeText)
        if (self.translated == True):
            startColumn = startColumn * 3
            stopColumn = (stopColumn * 3) + 2
        if (startTaxon >= 0):  #Make sure we had a valid range
            changeLength = (stopColumn - startColumn) + 1
            taxon = 0
            newSequences = []
            for Sequence in self.alignment:
                if (taxon in range(startTaxon, stopTaxon + 1)):
                    if (startColumn > 0):
                        Sequence.seq = Sequence.seq[:startColumn] + Seq(
                            '-' * changeLength) + Sequence.seq[startColumn:]
                    else:
                        Sequence.seq = Seq(
                            '-' * changeLength) + Sequence.seq[:]
                else:
                    Sequence.seq = Sequence.seq + Seq('-' * changeLength)
                newSequences.append(Sequence)
                taxon += 1
            self.alignment = MultipleSeqAlignment(newSequences)
            self.Show(self.displayedColumn)
            self.BackupAlignment()

    def Jump(self, column):
        """
        Moves the displayed column to a specified column index
        """
        if (self.translated == True):
            column = column * 3
        self.Show(column)

    def ScrollRight(self, offset=100):
        """
        Scroll the display 'offset' columns to the right
        """
        if (self.translated == True):
            offset = offset * 3
        self.Show(self.displayedColumn + offset)

    def ScrollLeft(self, offset=100):
        """
        Scroll the display 'offset' columns to the left
        """
        if (self.translated == True):
            offset = offset * 3
        self.Show(self.displayedColumn - offset)

    def Reverse(self):
        """
        Reverses the order of the columns in the alignment.  Has no effect on translated sequences.
        """
        if (self.translated == False):
            self.alignment = self.alignment[:, ::-1]
            self.Show(self.displayedColumn)
            self.BackupAlignment()
        else:
            self.AlertMessage("Can't reverse protein sequences.", 'medium')

    def Complement(self):
        """
        Give the complement of the alignment.  Has no effect on translated sequences.
        """
        if (self.translated == False):
            for i in range(len(self.alignment)):
                self.alignment[i].seq = self.alignment[i].seq.complement()
            self.Show(self.displayedColumn)
            self.BackupAlignment()
        else:
            self.AlertMessage("Can't complement protein sequences.", 'medium')

    def ReverseComplement(self):
        """
        Reverse and complement the alignment.  Has no effect on translated sequences.
        """
        if (self.translated == False):
            for i in range(len(self.alignment)):
                self.alignment[i].seq = self.alignment[
                    i].seq.reverse_complement()
            self.Show(self.displayedColumn)
            self.BackupAlignment()
        else:
            self.AlertMessage("Can't reverse-complement protein sequences.",
                              'medium')

    def Translate(self, translationTable=11):
        """
        Switch to displaying and manipulating the sequence as a protein sequence.
        Still works on translated sequences if a different translation table is specified; otherwise it back-translates the translated sequences.
        """
        if ((self.translated == False) |
            ((self.translated == True) &
             (self.translationTable != translationTable))):
            self.translated = True
            self.translationTable = translationTable
            self.displayedColumn = self.displayedColumn - (
                self.displayedColumn % 3)
            self.Show(self.displayedColumn)
        else:
            self.BackTranslate()

    def BackTranslate(self):
        """
        Revert to displaying and manipulating the sequence as a DNA sequence.  Has no effect if the sequence is already DNA.
        """
        if (self.translated == True):
            self.translated = False
            self.Show(self.displayedColumn)
        else:
            self.AlertMessage(
                "Can't back-translate.  Alignment contains DNA sequences",
                'medium')

    def Save(self, fileName='alignment.phy', alignmentFormat='phylip'):
        """
        Write alignment to disk
        """
        AlignIO.write(self.alignment, fileName, alignmentFormat)
        self.AlertMessage(
            'Saved alignment to ' + fileName + ' in ' + alignmentFormat +
            ' format.', 'low')

    def CleanUp(self):
        """
        Condense the alignment by removing any columns that contain gaps in all taxa.
        """
        blankColumnPattern = re.compile('^-*$')
        blankColumns = []
        for columnIndex in range(self.alignment.get_alignment_length()):
            columnValues = self.alignment[:, columnIndex]
            match = blankColumnPattern.search(columnValues)
            if (match):
                blankColumns.append(str(columnIndex))
        for column in blankColumns[::-1]:
            self.DeleteRange(',' + str(column), True)
        self.Show(self.displayedColumn)
        self.BackupAlignment()
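
The CleanUp method above scans every column with a regular expression and then removes gap-only columns one DeleteRange call at a time. A minimal standalone sketch of the same idea, assuming only a plain Bio.Align.MultipleSeqAlignment and none of the display or undo machinery (drop_gap_only_columns is a hypothetical helper name, not part of the class above):

from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def drop_gap_only_columns(alignment):
    # Keep the indices of columns that contain at least one non-gap character.
    keep = [i for i in range(alignment.get_alignment_length())
            if set(alignment[:, i]) != {'-'}]
    if not keep:
        return alignment[:, 0:0]
    # Rebuild the alignment by concatenating the surviving single-column slices.
    result = alignment[:, keep[0]:keep[0] + 1]
    for i in keep[1:]:
        result += alignment[:, i:i + 1]
    return result

demo = MultipleSeqAlignment([SeqRecord(Seq("A-C-G"), id="s1"),
                             SeqRecord(Seq("A-C-T"), id="s2")])
print(drop_gap_only_columns(demo))   # columns 1 and 3 are gap-only and disappear
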
Example #25
0
# standard library
import os

# biopython
from Bio import Alphabet
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.Align import AlignInfo
from Bio import AlignIO
from Bio.SubsMat import FreqTable
from Bio.Align import MultipleSeqAlignment

#Very simple tests on an empty alignment
alignment = MultipleSeqAlignment([], Alphabet.generic_alphabet)
assert alignment.get_alignment_length() == 0
assert len(alignment) == 0
del alignment

#Basic tests on simple three string alignment
alignment = MultipleSeqAlignment([], Alphabet.generic_alphabet)
letters = "AbcDefGhiJklMnoPqrStuVwxYz"
alignment.append(SeqRecord(Seq(letters), id="mixed"))
alignment.append(SeqRecord(Seq(letters.lower()), id="lower"))
alignment.append(SeqRecord(Seq(letters.upper()), id="upper"))
assert alignment.get_alignment_length() == 26
assert len(alignment) == 3
assert str(alignment[0].seq) == letters
assert str(alignment[1].seq) == letters.lower()
assert str(alignment[2].seq) == letters.upper()
assert alignment[0].id == "mixed"
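
The test above still passes the now-removed Bio.Alphabet objects into MultipleSeqAlignment; Bio.Alphabet was removed in Biopython 1.78. A minimal sketch of the same append-and-check pattern on a current Biopython release, with the alphabet argument simply dropped:

from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

letters = "AbcDefGhiJklMnoPqrStuVwxYz"
alignment = MultipleSeqAlignment([])          # no alphabet argument on Biopython >= 1.78
alignment.append(SeqRecord(Seq(letters), id="mixed"))
alignment.append(SeqRecord(Seq(letters.lower()), id="lower"))
alignment.append(SeqRecord(Seq(letters.upper()), id="upper"))
assert len(alignment) == 3
assert alignment.get_alignment_length() == 26
assert alignment[:, 0] == "AaA"               # column access returns that column as a string
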
Example #26
0
class sequence_set(object):

    def __init__(self, logger, sequences, reference, dateFormat):
        super(sequence_set, self).__init__()
        self.log = logger

        # load sequences from the (parsed) JSON - don't forget to sort out dates
        self.seqs = {}
        for name, data in sequences.items():
            self.seqs[name] = SeqRecord(Seq(data["seq"], generic_dna),
                   id=name, name=name, description=name)
            self.seqs[name].attributes = data["attributes"]
            # tidy up dates
            date_struc = parse_date(self.seqs[name].attributes["raw_date"], dateFormat)
            self.seqs[name].attributes["num_date"] = date_struc[1]
            self.seqs[name].attributes["date"] = date_struc[2]

        # if the reference is to be analysed it'll already be in the (filtered & subsampled)
        # sequences, so no need to add it here, and no need to care about attributes etc
        # we do, however, need it for alignment
        self.reference_in_dataset = reference["included"]
        name = reference["strain"]
        self.reference_seq = SeqRecord(Seq(reference["seq"], generic_dna),
               id=name, name=name, description=name)
        if "genes" in reference and len(reference["genes"]):
            self.proteins = {}
            for k, v in reference["genes"].items():
                feature = FeatureLocation(start=v["start"], end=v["end"], strand=v["strand"])

                # Translate sequences to identify any proteins ending with a stop codon.
                translation = Seq.translate(Seq(feature.extract(str(self.reference_seq.seq))))
                if translation.endswith("*"):
                    # Truncate the last codon of the protein to omit the stop codon.
                    feature = FeatureLocation(start=v["start"], end=v["end"] - 3, strand=v["strand"])

                self.proteins[k] = feature
        else:
            self.proteins = None

        # other things:
        self.run_dir = '_'.join(['temp', time.strftime('%Y%m%d-%H%M%S',time.gmtime()), str(random.randint(0,1000000))])
        self.nthreads = 2 # should load from config file

    def convert_trait_to_numerical_date(self, trait, dateFormat):
        for name, seq in self.seqs.items():
            try:
                date_struc = parse_date(seq.attributes[trait], dateFormat)
                seq.attributes[trait] = date_struc[1]
            except KeyError:
                self.log.warn("Attribute {} not found for sequence {}. Ignoring".format(trait, seq.name))

    def codon_align(self):
        self.log.fatal("Codon align not yet implemented")

    def align(self, verbose, debug=False):
        '''
        align sequences using mafft

        side-effects:
            self.aln {MultipleSeqAlignment} reference not present if not in self.seqs
            self.reference_aln {SeqRecord} always set, even if the reference is subsequently discarded
            self.sequence_lookup {dict} map linking seq.id to the alignment, potentially without the reference
            saves the alignment (always including reference) to fname
        '''
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        if self.reference_in_dataset:
            out_seqs = self.seqs.values()
        else:
            self.log.notify("Adding reference for alignment step")
            out_seqs = list(self.seqs.values()) + [self.reference_seq]

        SeqIO.write(out_seqs, "temp_in.fasta", "fasta")
        self.log.notify("Running alignment")
        if verbose == 0:
            os.system("mafft --anysymbol --thread " + str(self.nthreads) + " temp_in.fasta 1> temp_out.fasta 2>mafft_stderr")
        else:
            os.system("mafft --anysymbol --thread " + str(self.nthreads) + " temp_in.fasta 1> temp_out.fasta")
        self.aln = AlignIO.read('temp_out.fasta', 'fasta')
        os.chdir("..")
        if not debug: remove_dir(self.run_dir)

        self.set_reference_alignment() #make reference_aln object while reference still in alignment
        self.set_sequence_lookup()
        self.add_attributes_to_aln()

    def remove_reference_from_alignment(self):
        count = len(self.aln)
        self.aln = MultipleSeqAlignment([s for s in self.aln if s.name!=self.reference_seq.name])
        assert(count == (len(self.aln)+1))

    def set_reference_alignment(self):
        self.reference_aln = [x for x in list(self.aln) if x.name==self.reference_seq.name][0]

    def set_sequence_lookup(self):
        self.sequence_lookup = {seq.id:seq for seq in self.aln}

    def add_attributes_to_aln(self):
        for seqid, seq in self.seqs.items():
            self.sequence_lookup[seqid].attributes = seq.attributes

    def try_restore_align_from_disk(self, fname):
        try:
            self.aln = AlignIO.read(fname, "fasta")
        except IOError:
            return
        except Exception as e:
            self.log.notify("Error restoring from alignment... re-doing")
            print(e)
            return

        try:
            self.set_reference_alignment()
        except IndexError:
            self.log.notify("Reference not found in alignment... ok on reload")
            # del self.aln
            # return

        if not self.reference_in_dataset:
            self.remove_reference_from_alignment()

        if len({x.id for x in self.aln} ^ set(self.seqs.keys())) != 0:
            self.log.notify("Alignment on disk had different sequences... re-doing")
            del self.aln
            del self.reference_aln
            return

        # at this stage we are happy with the alignment
        self.set_sequence_lookup()
        self.add_attributes_to_aln()
        self.log.notify("Alignment restored from disk")


    def strip_non_reference(self):
        '''
        remove insertions relative to the reference from the alignment
        '''
        ungapped = np.array(self.reference_aln)!='-'
        for seq in self.aln:
            seq.seq = Seq("".join(np.array(seq)[ungapped]))


    def make_gaps_ambiguous(self):
        '''
        replace all gaps by 'N' in all sequences in the alignment. TreeTime will treat them
        as fully ambiguous and replace them with the most likely state
        '''
        for seq in self.aln:
            seq_array = np.array(seq)
            gaps = seq_array=='-'
            seq_array[gaps]='N'
            seq.seq = Seq("".join(seq_array))


    def make_terminal_gaps_ambiguous(self):
        '''
        replace all gaps at the end of sequences by 'N' in the alignment. TreeTime will treat them
        as fully ambiguous and replace them with the most likely state
        '''
        for seq in self.aln:
            str_seq = str(seq.seq)
            lgaps = len(str_seq) - len(str_seq.lstrip('-'))
            rgaps = len(str_seq) - len(str_seq.rstrip('-'))
            str_seq = 'N'*lgaps + str_seq.strip('-') + 'N'*rgaps
            seq.seq = Seq(str_seq)


    def translate(self):
        '''
        make alignments of translations.
        '''
        from Bio.Data import CodonTable
        codon_table  = CodonTable.ambiguous_dna_by_name['Standard'].forward_table
        self.translations={}
        if not hasattr(self, "proteins"): # ensure dictionary to hold annotation
            self.proteins={}

        # add a default translation of the entire sequence unless otherwise specified
        if len(self.proteins)==0:
            self.proteins.update({'cds':FeatureLocation(start=0,
                end=self.aln.get_alignment_length(), strand=1)})

        # loop over all proteins and create one MSA for each
        for prot in self.proteins:
            aa_seqs = []
            for seq in self.aln:
                tmpseq = self.proteins[prot].extract(seq)
                translated_seq, translation_exception = safe_translate(str(tmpseq.seq), report_exceptions=True)

                if translation_exception:
                    self.log.notify("Trouble translating because of invalid codons %s" % seq.id)

                tmpseq.seq = Seq(translated_seq)

                # copy attributes
                tmpseq.attributes = seq.attributes
                aa_seqs.append(tmpseq)

            self.translations[prot] = MultipleSeqAlignment(aa_seqs)

    def clock_filter(self, root_seq=None, n_iqd=3, max_gaps = 1.0, plot=False):
        '''
        remove sequences from the set that evolve much faster or slower
        than the majority. Regions with predominantly gaps can be excluded since
        they can skew the evolutionary rates.
        '''
        if root_seq is None: # use consensus
            af = calc_af(self.aln, nuc_alpha)
            root_seq = np.fromstring(nuc_alpha, 'S1')[af.argmax(axis=0)]
        if type(root_seq)==str and root_seq in self.sequence_lookup:
            root_seq = np.array(self.sequence_lookup[root_seq])
        if max_gaps<1.0:
            af=calc_af(self.aln, nuc_alpha)
            good_pos = af[nuc_alpha.index('-')]<max_gaps
        else:
            good_pos = np.ones(self.aln.get_alignment_length(), dtype=bool)
        date_vs_distance = {}
        # self.reference_aln = None already set at alignment step
        for seq in self.aln:
            date_vs_distance[seq.id] = (seq.attributes['num_date'],
                np.mean((np.array(seq)!=root_seq)[(np.array(seq)!='-')&(root_seq!='-')&good_pos]))
            # if seq.id==self.reference.id:
            #     self.reference_aln = seq
        date_vs_distance_array=np.array(list(date_vs_distance.values()))
        from scipy.stats import linregress, scoreatpercentile
        slope, intercept, rval, pval, stderr = linregress(date_vs_distance_array[:,0], date_vs_distance_array[:,1])
        print("distance vs time regression:",slope)
        residuals = (intercept + slope*date_vs_distance_array[:,0]) - date_vs_distance_array[:,1]
        IQD = scoreatpercentile(residuals, 75) - scoreatpercentile(residuals,25)
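        # A sequence counts as a molecular-clock outlier when the absolute residual
        # of its point from this regression exceeds n_iqd * IQD; the dictionary
        # comprehension below keeps only the sequences that pass this test.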
        if plot:
            import matplotlib.pyplot as plt
            plt.ion()
            plt.scatter(date_vs_distance_array[:,0], date_vs_distance_array[:,1], c='g')
            bad_points = abs(intercept+slope*date_vs_distance_array[:,0] - date_vs_distance_array[:,1])>n_iqd*IQD
            plt.scatter(date_vs_distance_array[bad_points,0], date_vs_distance_array[bad_points,1], c='r')


        print("before clock filter:",len(self.aln))
        tmp = {seq.id:seq for seq in self.aln
                if abs(intercept+slope*date_vs_distance[seq.id][0] - date_vs_distance[seq.id][1])<n_iqd*IQD}
        if self.reference_seq.id not in tmp and self.reference_in_dataset:
            self.log.notify('adding reference again after clock filter')
            tmp[self.reference.id] = self.reference_aln
        self.aln = MultipleSeqAlignment(tmp.values())
        print("after clock filter:",len(self.aln))

    def diversity_statistics(self):
        ''' calculate alignment entropy of nucleotide and optionally protein alignments '''
        if not hasattr(self, "aln"):
            self.log.fatal("Diversity statistics calculated before alignment generated.")
            return
        aln_array = np.array(self.aln)
        self.af = {'nuc': calc_af(self.aln, nuc_alpha)}
        tmp_af = self.af['nuc'][:-2]/self.af['nuc'][:-2].sum(axis=0)
        self.entropy ={'nuc': -(tmp_af*np.log(tmp_af+TINY)).sum(axis=0)}

        if hasattr(self, "translations"):
            for prot, aln in self.translations.items():
                self.af[prot] = calc_af(aln, aa_alpha)
                tmp_af = self.af[prot][:-2]/self.af[prot][:-2].sum(axis=0)
                self.entropy[prot] = -(tmp_af*np.log(tmp_af+TINY)).sum(axis=0)

    def export_diversity(self, fname = 'entropy.json', indent=None):
        '''
        write the alignment entropy of each alignment (nucleotide and translations) to file
        '''
        if not hasattr(self, "entropy"):
            self.diversity_statistics()
        entropy_json = {}
        for feat in self.entropy:
            S = [max(0,round(x,4)) for x in self.entropy[feat]]
            n = len(S)
            if feat=='nuc':
                entropy_json[feat] = {'pos':list(range(0,n)), 'codon':[x//3 for x in range(0,n)], 'val':S}
            else:
                entropy_json[feat] = {'pos':[x for x in self.proteins[feat]][::3],
                                      'codon':[(x-self.proteins[feat].start)//3 for x in self.proteins[feat]][::3], 'val':S}
        write_json(entropy_json, fname, indent=indent)
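
The clock_filter method regresses root-to-tip distance against sampling date and drops sequences whose residual exceeds n_iqd interquartile distances. A self-contained sketch of just that outlier test on plain arrays (clock_outliers is a hypothetical helper name; dates and distances are assumed inputs), assuming scipy is available:

import numpy as np
from scipy.stats import linregress, scoreatpercentile

def clock_outliers(dates, distances, n_iqd=3):
    # Fit distance = intercept + slope * date and measure each point's residual.
    dates = np.asarray(dates, dtype=float)
    distances = np.asarray(distances, dtype=float)
    slope, intercept, _rval, _pval, _stderr = linregress(dates, distances)
    residuals = (intercept + slope * dates) - distances
    # Flag points whose residual exceeds n_iqd interquartile distances.
    iqd = scoreatpercentile(residuals, 75) - scoreatpercentile(residuals, 25)
    return np.abs(residuals) > n_iqd * iqd
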
Example #27
0
    def __next__(self):
        """Parse the next alignment from the handle."""
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            raise StopIteration

        # Whitelisted headers we know about.
        known_headers = [
            "!!NA_MULTIPLE_ALIGNMENT", "!!AA_MULTIPLE_ALIGNMENT", "PileUp"
        ]
        # Examples in "Molecular Biology Software Training Manual GCG version 10"
        # by BBSRC Biosciences IT Services (BITS), Harpenden, UK, Copyright 1996-2001
        # would often start as follows:
        #
        # !!AA_MULTIPLE_ALIGNMENT 1.0
        # PileUp of: @/usr/users2/culhane/...
        #
        # etc with other seemingly free format text before getting to the
        # MSF/Type/Check line and the following Name: lines block and // line.
        #
        # MUSCLE just has a line "PileUp", while other sources just use the line
        # "!!AA_MULTIPLE_ALIGNMENT" (amino acid) or "!!NA_MULTIPLE_ALIGNMENT"
        # (nucleotide).
        if line.strip().split()[0] not in known_headers:
            raise ValueError(
                "%s is not a known GCG MSF header: %s" %
                (line.strip().split()[0], ", ".join(known_headers)))

        while line and " MSF: " not in line:
            line = handle.readline()

        if not line:
            raise ValueError(
                "Reached end of file without MSF/Type/Check header line")

        # Quoting from "Molecular Biology Software Training Manual GCG version 10"
        # by BBSRC Biosciences IT Services (BITS), Harpenden, UK. Copyright 1996-2001.
        # Page 31:
        #
        # "Header information is before a .. (double dot) in a GCG format file.
        #  The file will also have a checksum specific for that file."
        #
        # This was followed by a single non-aligned sequence, but this convention
        # appears to also be used in the GCG MSF files. Quoting other examples in
        # this reference, page 31:
        #
        # localpileup_17.msf  MSF: 195  Type: P  January 6, 2000 15:41  Check: 4365 ..
        #
        # Except from page 148:
        #
        # localpileup_106.msf  MSF: 457  Type: P  November 28, 2000 16:09  Check: 2396 ..
        #
        # Quoting output from MUSCLE v3.8, have two leading spaces and a zero checksum:
        #
        #   MSF: 689  Type: N  Check: 0000  ..
        #
        # By observation, the MSF value is the column count, type is N (nucleotide)
        # or P (protein / amino acid).
        #
        # In a possible bug, EMBOSS v6.6.0.0 uses CompCheck: rather than Check: as shown,
        #
        # $ seqret -sequence Tests/Fasta/f002 -auto -stdout -osformat msf
        # !!NA_MULTIPLE_ALIGNMENT 1.0
        #
        #   stdout MSF: 633 Type: N 01/08/19 CompCheck: 8543 ..
        #
        #   Name: G26680     Len: 633  Check: 4334 Weight: 1.00
        #   Name: G26685     Len: 633  Check: 3818 Weight: 1.00
        #   Name: G29385     Len: 633  Check:  391 Weight: 1.00
        #
        # //
        #
        parts = line.strip("\n").split()
        offset = parts.index("MSF:")
        if (parts[offset + 2] != "Type:"
                or parts[-3] not in ("Check:", "CompCheck:")
                or parts[-1] != ".."):
            raise ValueError(
                "GCG MSF header line should be "
                "'<optional text> MSF: <int> Type: <letter> <optional date> Check: <int> ..', "
                " not: %r" % line)
        try:
            aln_length = int(parts[offset + 1])
        except ValueError:
            aln_length = -1
        if aln_length < 0:
            raise ValueError(
                "GCG MSF header line should have MDF: <int> for column count, not %r"
                % parts[offset + 1])
        seq_type = parts[offset + 3]
        if seq_type not in ["P", "N"]:
            raise ValueError(
                "GCG MSF header line should have 'Type: P' (protein) "
                "or 'Type: N' (nucleotide), not 'Type: %s'" % seq_type)

        # There should be a blank line after that header line, then the Name: lines
        #
        # In a possible bug, T-COFFEE v12.00 adds 'oo' after the names, as shown here,
        #
        # PileUp
        #
        #
        #
        #    MSF:  628  Type: P    Check:   147   ..
        #
        #  Name: AK1H_ECOLI/1-378 oo  Len:  628  Check:  3643  Weight:  1.000
        #  Name: AKH_HAEIN/1-382 oo  Len:  628  Check:  6504  Weight:  1.000
        #
        # //
        ids = []
        lengths = []
        checks = []
        weights = []
        line = handle.readline()
        while line and line.strip() != "//":
            line = handle.readline()
            if line.strip().startswith("Name: "):
                if " Len: " in line and " Check: " in line and " Weight: " in line:
                    rest = line[line.index("Name: ") + 6:].strip()
                    name, rest = rest.split(" Len: ")
                    length, rest = rest.split(" Check: ")
                    check, weight = rest.split(" Weight: ")
                    name = name.strip()
                    if name.endswith(" oo"):
                        # T-COFFEE oddity, ignore this
                        name = name[:-3]
                    if name in ids:
                        raise ValueError("Duplicated ID of %r" % name)
                    if " " in name:
                        raise NotImplementedError("Space in ID %r" % name)
                    ids.append(name)
                    # Expect aln_length <= int(length.strip()), see below
                    lengths.append(int(length.strip()))
                    checks.append(int(check.strip()))
                    weights.append(float(weight.strip()))
                else:
                    raise ValueError("Malformed GCG MSF name line: %r" % line)
        if not line:
            raise ValueError(
                "End of file while looking for end of header // line.")

        if aln_length != max(lengths):
            # In broken examples from IMGTHLA was possible to continue
            # https://github.com/ANHIG/IMGTHLA/issues/201
            max_length = max(lengths)
            max_count = sum(1 for _ in lengths if _ == max_length)
            raise ValueError(
                "GCG MSF header said alignment length %i, but %s of %i sequences said Len: %s"
                % (aln_length, max_count, len(ids), max_length))

        line = handle.readline()
        if not line:
            raise ValueError("End of file after // line, expected sequences.")
        if line.strip():
            raise ValueError(
                "After // line, expected blank line before sequences.")

        # Now load the sequences
        seqs = [[] for _ in ids]  # list of empty lists
        completed_length = 0
        while completed_length < aln_length:
            # Note might have a coordinate header line (seems to be optional)
            for idx, name in enumerate(ids):
                line = handle.readline()
                if idx == 0 and not line.strip():
                    # T-COFFEE uses two blank lines between blocks, rather than one
                    while line and not line.strip():
                        line = handle.readline()
                if not line:
                    raise ValueError(
                        "End of file where expecting sequence data.")
                # print("Looking for seq for %s in line: %r" % (name, line))
                words = line.strip().split()
                # Should we use column numbers, rather than assuming no spaces in names?
                if idx == 0 and words and words[0] != name:
                    # print("Actually have a coord line")
                    # Hopefully this is a coordinate header before the first seq
                    try:
                        i = int(words[0])
                    except ValueError:
                        i = -1
                    if i != completed_length + 1:
                        raise ValueError(
                            "Expected GCG MSF coordinate line starting %i, got: %r"
                            % (completed_length + 1, line))
                    if len(words) > 1:
                        # Final block usually not full 50 chars, so expect start only.
                        if len(words) != 2:
                            i = -1
                        else:
                            try:
                                i = int(words[1])
                            except ValueError:
                                i = -1
                        if i != (completed_length + 50 if completed_length +
                                 50 < aln_length else aln_length):
                            raise ValueError(
                                "Expected GCG MSF coordinate line %i to %i, got: %r"
                                % (
                                    completed_length + 1,
                                    completed_length + 50 if completed_length +
                                    50 < aln_length else aln_length,
                                    line,
                                ))
                    line = handle.readline()
                    words = line.strip().split()
                    # print("Still looking for seq for %s in line: %r" % (name, line))
                # Dealt with any coordinate header line, should now be sequence
                if not words:
                    # Should be sequence here, but perhaps its a short one?
                    if (lengths[idx] < aln_length
                            and len("".join(seqs[idx])) == lengths[idx]):
                        # Is this actually allowed in the format? Personally I would
                        # expect a line with name and a block of trailing ~ here.
                        pass
                    else:
                        raise ValueError("Expected sequence for %s, got: %r" %
                                         (name, line))
                elif words[0] == name:
                    assert len(words) > 1, line
                    # print(i, name, repr(words))
                    seqs[idx].extend(words[1:])
                else:
                    raise ValueError("Expected sequence for %r, got: %r" %
                                     (name, line))
            # TODO - check the sequence lengths thus far are consistent
            # with blocks of 50?
            completed_length += 50
            line = handle.readline()
            if line.strip():
                raise ValueError("Expected blank line, got: %r" % line)

        # Skip over any whitespace at the end...
        while True:
            line = handle.readline()
            if not line:
                # End of file, no more alignments
                break
            elif not line.strip():
                # Blank line, ignore
                pass
            elif line.strip().split()[0] in known_headers:
                # Looks like the start of another alignment:
                self._header = line
                break
            else:
                raise ValueError(
                    "Unexpected line after GCG MSF alignment: %r" % line)

        # Combine list of strings into single string, remap gaps
        seqs = ["".join(s).replace("~", "-").replace(".", "-") for s in seqs]

        # Apply any trailing padding for short sequences
        padded = False
        for idx, (length, s) in enumerate(zip(lengths, seqs)):
            if len(s) < aln_length and len(s) == length:
                padded = True
                seqs[idx] = s + "-" * (aln_length - len(s))
        if padded:
            import warnings
            from Bio import BiopythonParserWarning

            warnings.warn(
                "One of more alignment sequences were truncated and have been gap padded",
                BiopythonParserWarning,
            )

        records = (SeqRecord(
            Seq(s),
            id=i,
            name=i,
            description=i,
            annotations={"weight": w},
        ) for (i, s, w) in zip(ids, seqs, weights))

        # This will check alignment lengths are self-consistent:
        align = MultipleSeqAlignment(records)
        # Check matches the header:
        if align.get_alignment_length() != aln_length:
            raise ValueError(
                "GCG MSF headers said alignment length %i, but have %i" %
                (aln_length, align.get_alignment_length()))
        return align
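
This __next__ method is the core of a GCG MSF alignment iterator; end users would normally go through Bio.AlignIO instead. A minimal usage sketch, assuming a Biopython release that registers this parser under the "msf" format name and a placeholder file example.msf on disk:

from Bio import AlignIO

# "example.msf" is a placeholder path; any file with a PileUp or
# !!NA_/!!AA_MULTIPLE_ALIGNMENT header as described above should parse.
alignment = AlignIO.read("example.msf", "msf")
print(len(alignment), "sequences x", alignment.get_alignment_length(), "columns")
for record in alignment:
    # The parser stores each sequence's Weight: value as an annotation.
    print(record.id, record.annotations.get("weight"))
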
Example #28
0
class sequence_set(object):
    def __init__(self, logger, sequences, reference, dateFormat):
        super(sequence_set, self).__init__()
        self.log = logger

        # load sequences from the (parsed) JSON - don't forget to sort out dates
        self.seqs = {}
        for name, data in sequences.iteritems():
            self.seqs[name] = SeqRecord(Seq(data["seq"], generic_dna),
                                        id=name,
                                        name=name,
                                        description=name)
            self.seqs[name].attributes = data["attributes"]
            # tidy up dates
            date_struc = parse_date(self.seqs[name].attributes["raw_date"],
                                    dateFormat)
            self.seqs[name].attributes["num_date"] = date_struc[1]
            self.seqs[name].attributes["date"] = date_struc[2]

        # if the reference is to be analysed it'll already be in the (filtered & subsampled)
        # sequences, so no need to add it here, and no need to care about attributes etc
        # we do, however, need it for alignment
        self.reference_in_dataset = reference["included"]
        name = reference["strain"]
        self.reference_seq = SeqRecord(Seq(reference["seq"], generic_dna),
                                       id=name,
                                       name=name,
                                       description=name)
        if "genes" in reference and len(reference["genes"]):
            self.proteins = {
                k: FeatureLocation(start=v["start"],
                                   end=v["end"],
                                   strand=v["strand"])
                for k, v in reference["genes"].iteritems()
            }
        else:
            self.proteins = None

        # other things:
        self.run_dir = '_'.join([
            'temp',
            time.strftime('%Y%m%d-%H%M%S', time.gmtime()),
            str(random.randint(0, 1000000))
        ])
        self.nthreads = 2  # should load from config file

    def convert_trait_to_numerical_date(self, trait, dateFormat):
        for name, seq in self.seqs.iteritems():
            try:
                date_struc = parse_date(seq.attributes[trait], dateFormat)
                seq.attributes[trait] = date_struc[1]
            except KeyError:
                self.log.warn(
                    "Attribute {} not found for sequence {}. Ignoring".format(
                        trait, seq.name))

    def codon_align(self):
        self.log.fatal("Codon align not yet implemented")

    def align(self, fname, debug=False):
        '''
        align sequences using mafft

        side-effects:
            self.aln {MultipleSeqAlignment} reference not present if not in self.seqs
            self.reference_aln {SeqRecord} always set, even if the reference is subsequently discarded
            self.sequence_lookup {dict} map linking seq.id to the alignment, potentially without the reference
            saves the alignment (always including reference) to fname
        '''
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        if self.reference_in_dataset:
            out_seqs = self.seqs.values()
        else:
            self.log.notify("Adding reference for alignment step")
            out_seqs = self.seqs.values() + [self.reference_seq]

        SeqIO.write(out_seqs, "temp_in.fasta", "fasta")
        self.log.notify("Running alignment")
        os.system("mafft --anysymbol --thread " + str(self.nthreads) +
                  " temp_in.fasta 1> temp_out.fasta 2>mafft_stderr")
        self.aln = AlignIO.read('temp_out.fasta', 'fasta')
        os.chdir("..")
        os.rename(os.path.join(self.run_dir, "temp_out.fasta"), fname)
        if not debug: remove_dir(self.run_dir)

        self.set_reference_alignment()
        if not self.reference_in_dataset:
            self.remove_reference_from_alignment()
        self.set_sequence_lookup()
        self.add_attributes_to_aln()

    def remove_reference_from_alignment(self):
        count = len(self.aln)
        self.aln = MultipleSeqAlignment(
            [s for s in self.aln if s.name != self.reference_seq.name])
        assert (count == (len(self.aln) + 1))

    def set_reference_alignment(self):
        self.reference_aln = [
            x for x in list(self.aln) if x.name == self.reference_seq.name
        ][0]

    def set_sequence_lookup(self):
        self.sequence_lookup = {seq.id: seq for seq in self.aln}

    def add_attributes_to_aln(self):
        for seqid, seq in self.seqs.iteritems():
            self.sequence_lookup[seqid].attributes = seq.attributes

    def try_restore_align_from_disk(self, fname):
        try:
            self.aln = AlignIO.read(fname, "fasta")
        except IOError:
            return
        except Exception as e:
            self.log.notify("Error restoring from alignment... re-doing")
            print(e)
            return

        try:
            self.set_reference_alignment()
        except IndexError:
            self.log.notify("Reference not found in alignment... re-doing")
            del self.aln
            return

        if not self.reference_in_dataset:
            self.remove_reference_from_alignment()

        if len({x.id for x in self.aln} ^ set(self.seqs.keys())) != 0:
            self.log.notify(
                "Alignment on disk had different sequnces... re-doing")
            del self.aln
            del self.reference_aln
            return

        # at this stage we are happy with the alignment
        self.set_sequence_lookup()
        self.add_attributes_to_aln()
        self.log.notify("Alignment restored from disk")

    def strip_non_reference(self):
        ungapped = np.array(self.reference_aln) != '-'
        for seq in self.aln:
            seq.seq = Seq("".join(np.array(seq)[ungapped]))

    def remove_terminal_gaps(self):
        for seq in self.aln:
            seq_array = np.array(seq)
            seq_string = str(seq.seq)
            if (seq_array == '-').sum():
                left_gaps = len(seq_string) - len(seq_string.lstrip('-'))
                seq_array[:left_gaps] = 'N'
            if (seq_array == '-').sum():
                right_gaps = len(seq_string) - len(seq_string.rstrip('-'))
                if right_gaps:
                    seq_array[-right_gaps:] = 'N'
            seq.seq = Seq("".join(seq_array))

    def translate(self):
        '''
        make alignment of translations
        '''
        self.translations = {}
        if not hasattr(self,
                       "proteins"):  # ensure dictionary to hold annotation
            self.proteins = {}

        # add a default translation of the entire sequence unless otherwise specified
        if len(self.proteins) == 0:
            self.proteins.update({
                'cds':
                FeatureLocation(start=0,
                                end=self.aln.get_alignment_length(),
                                strand=1)
            })

        for prot in self.proteins:
            aa_seqs = []
            for seq in self.aln:
                try:
                    # soon not needed as future biopython version will translate --- into -
                    tmpseq = self.proteins[prot].extract(seq)
                    tmpseq.attributes = seq.attributes
                    internal_gap = np.unique(
                        np.where(np.array(tmpseq) == '-')[0] // 3)
                    aa_seq = np.array(
                        Seq(str(tmpseq.seq).replace('---', 'NNN')).translate())
                    aa_seq[internal_gap] = '-'
                    tmpseq.seq = Seq("".join(aa_seq))
                except:
                    tmpseq.seq = Seq("".join([
                        x if x in 'ACGT' else 'N' for x in str(tmpseq.seq)
                    ])).translate()
                    print("Trouble translating", seq.id)
                aa_seqs.append(tmpseq)
            self.translations[prot] = MultipleSeqAlignment(aa_seqs)

    def clock_filter(self, root_seq=None, n_iqd=3, max_gaps=1.0, plot=False):
        '''
        remove sequences from the set that evolve much faster or slower
        than the majority. Regions with predominantly gaps can be excluded since
        they can skew the evolutionary rates.
        '''
        if root_seq is None:  # use consensus
            af = calc_af(self.aln, nuc_alpha)
            root_seq = np.fromstring(nuc_alpha, 'S1')[af.argmax(axis=0)]
        if type(root_seq) == str and root_seq in self.sequence_lookup:
            root_seq = np.array(self.sequence_lookup[root_seq])
        if max_gaps < 1.0:
            af = calc_af(self.aln, nuc_alpha)
            good_pos = af[nuc_alpha.index('-')] < max_gaps
        else:
            good_pos = np.ones(self.aln.get_alignment_length(), dtype=bool)
        date_vs_distance = {}
        # self.reference_aln = None already set at alignment step
        for seq in self.aln:
            date_vs_distance[seq.id] = (seq.attributes['num_date'],
                                        np.mean(
                                            (np.array(seq) !=
                                             root_seq)[(np.array(seq) != '-')
                                                       & (root_seq != '-')
                                                       & good_pos]))
            # if seq.id==self.reference.id:
            #     self.reference_aln = seq
        date_vs_distance_array = np.array(date_vs_distance.values())
        from scipy.stats import linregress, scoreatpercentile
        slope, intercept, rval, pval, stderr = linregress(
            date_vs_distance_array[:, 0], date_vs_distance_array[:, 1])
        print("distance vs time regression:", slope)
        residuals = (intercept + slope * date_vs_distance_array[:, 0]
                     ) - date_vs_distance_array[:, 1]
        IQD = scoreatpercentile(residuals, 75) - scoreatpercentile(
            residuals, 25)
        if plot:
            import matplotlib.pyplot as plt
            plt.ion()
            plt.scatter(date_vs_distance_array[:, 0],
                        date_vs_distance_array[:, 1],
                        c='g')
            bad_points = abs(intercept + slope * date_vs_distance_array[:, 0] -
                             date_vs_distance_array[:, 1]) > n_iqd * IQD
            plt.scatter(date_vs_distance_array[bad_points, 0],
                        date_vs_distance_array[bad_points, 1],
                        c='r')

        print("before clock filter:", len(self.aln))
        tmp = {
            seq.id: seq
            for seq in self.aln
            if abs(intercept + slope * date_vs_distance[seq.id][0] -
                   date_vs_distance[seq.id][1]) < n_iqd * IQD
        }
        if self.reference_seq.id not in tmp and self.reference_in_dataset:
            self.log.notify('adding reference again after clock filter')
            tmp[self.reference.id] = self.reference_aln
        self.aln = MultipleSeqAlignment(tmp.values())
        print("after clock filter:", len(self.aln))

    def diversity_statistics(self):
        ''' calculate alignment entropy of nucleotide and optionally protein alignments '''
        if not hasattr(self, "aln"):
            self.log.fatal(
                "Diversity statistics calculated before alignment generated.")
            return
        aln_array = np.array(self.aln)
        self.af = {'nuc': calc_af(self.aln, nuc_alpha)}
        tmp_af = self.af['nuc'][:-2] / self.af['nuc'][:-2].sum(axis=0)
        self.entropy = {'nuc': -(tmp_af * np.log(tmp_af + TINY)).sum(axis=0)}

        if hasattr(self, "translations"):
            for prot, aln in self.translations.iteritems():
                self.af[prot] = calc_af(aln, aa_alpha)
                tmp_af = self.af[prot][:-2] / self.af[prot][:-2].sum(axis=0)
                self.entropy[prot] = -(tmp_af *
                                       np.log(tmp_af + TINY)).sum(axis=0)

    def export_diversity(self, fname='entropy.json', indent=None):
        '''
        write the alignment entropy of each alignment (nucleotide and translations) to file
        '''
        if not hasattr(self, "entropy"):
            self.diversity_statistics()
        entropy_json = {}
        for feat in self.entropy:
            S = [max(0, round(x, 4)) for x in self.entropy[feat]]
            n = len(S)
            if feat == 'nuc':
                entropy_json[feat] = {
                    'pos': range(0, n),
                    'codon': [x // 3 for x in range(0, n)],
                    'val': S
                }
            else:
                entropy_json[feat] = {
                    'pos': [x for x in self.proteins[feat]][::3],
                    'codon': [(x - self.proteins[feat].start) // 3
                              for x in self.proteins[feat]][::3],
                    'val':
                    S
                }
        write_json(entropy_json, fname, indent=indent)
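
The translate() method above preserves codon-aligned gaps by temporarily rewriting '---' as 'NNN', translating, and then writing '-' back at the fully gapped codons. A self-contained sketch of that trick on a plain string (translate_gapped is a hypothetical helper name, not part of the class above):

import numpy as np
from Bio.Seq import Seq

def translate_gapped(nuc_string):
    # Codon indices where any position of the codon is a gap character.
    gapped_codons = np.unique(np.where(np.array(list(nuc_string)) == '-')[0] // 3)
    # Translate with gapped codons masked as NNN (which translates to 'X') ...
    aa = np.array(list(str(Seq(nuc_string.replace('---', 'NNN')).translate())))
    # ... then restore '-' at the codons that were gapped in the input.
    aa[gapped_codons] = '-'
    return "".join(aa)

print(translate_gapped("ATG---AAATTT"))   # -> M-KF
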
Example #29
0
class DisplayedAlignment(object):
    """
    Provides tools for displaying and manipulating an alignment and storing all previous versions
    """

    def __init__(self, alignment):
        self.displayedColumn = 0
        self.alignment = alignment
        self.alignmentHistory = [alignment[:,:]]
        self.changed = False
        self.translated = False
        self.translationTable = 1

    def ParseIndex(self, text):
        """
        Parses a text string specifying a range of rows (taxa) and columns.  Expects the text to be in the format used to specify a range from a Bio.Align.MultipleSeqAlignment.  Returns indices for the start and stop taxon and the start and stop columns
        """
        taxonStart = 0
        taxonStop = len(self.alignment) - 1
        columnStart = 0
        columnStop = self.alignment.get_alignment_length() - 1
        if (',' not in text):
            self.AlertMessage('Invalid index format.  (taxa or columns missing)', 'high')
            return (-1,-1,-1,-1)
        else:
            text = text.strip()
            indices = text.split(',')
            if (len(indices) > 2):
                self.AlertMessage('Invalid index format.  (too many fields)', 'high')
                return (-1,-1,-1,-1)
            else:
                if (':' in indices[0]): #there is a range specified in the taxon index
                    taxonIndices = indices[0].split(':')
                    if (taxonIndices[0]): #a start taxon is specified
                        try:
                            taxonStart = int(taxonIndices[0].strip())
                        except:
                            self.AlertMessage('Invalid index format. (taxon start index not an integer)', 'high')
                            return (-1, -1, -1, -1)
                    if (taxonIndices[1]): #a stop taxon is specified
                        try:
                            taxonStop = int(taxonIndices[1].strip())
                        except:
                            self.AlertMessage('Invalid index format. (taxon stop index not an integer)', 'high')
                            return (-1, -1, -1, -1)
                elif (indices[0]): #a single taxon is specified
                    try:
                        taxonStart = int(indices[0].strip())
                        taxonStop = int(indices[0].strip())
                    except:
                        self.AlertMessage('Invalid index format. (taxon start or stop index not an integer)', 'high')
                        return (-1, -1, -1, -1)
                if (':' in indices[1]): #there is a range specified in the column index
                    columnIndices = indices[1].split(':')
                    if (columnIndices[0]): #a start column is specified
                        try:
                            columnStart = int(columnIndices[0].strip())
                        except:
                            self.AlertMessage('Invalid index format. (column start index not an integer)', 'high')
                            return (-1, -1, -1, -1)
                    if (columnIndices[1]): #a stop column is specified
                        try:
                            columnStop = int(columnIndices[1].strip())
                        except:
                            self.AlertMessage('Invalid index format. (column stop index not an integer)', 'high')
                            return (-1, -1, -1, -1)
                elif (indices[1]): #a single column is specified
                    try:
                        columnStart = int(indices[1].strip())
                        columnStop = int(indices[1].strip())
                    except:
                        self.AlertMessage('Invalid index format. (column start or stop index not an integer)', 'high')
                        return (-1, -1, -1, -1)
                if ((0 <= taxonStart <= taxonStop) & (0 <= columnStart <= columnStop)):
                    return (taxonStart, taxonStop, columnStart, columnStop)
                else:
                    self.AlertMessage('Invalid index range. (start > stop or index < 0)', 'high')
                    return (-1,-1,-1,-1)

    def ColorizeDNA(self, text):
        """
        Colorizes output based on nucleotide
        """
        if (text == 'A'):
            escape = '\033[92m' # Green
        elif (text == 'G'):
            escape = '\033[93m' # Yellow
        elif (text == 'T'):
            escape = '\033[91m' # Red
        elif (text == 'C'):
            escape = '\033[96m' # Blue
        else:
            return text
        return escape + text + '\033[0m'

    def ColorizeAA(self, text):
        """
        Colorize output based on amino acid polarity or nonpolarity
        """
        if (text in ['A', 'F', 'H', 'I', 'K', 'L', 'M', 'P', 'R', 'V', 'W']):
            escape = '\033[91m' # Red
        elif (text in ['C', 'G', 'N', 'Q', 'S', 'T', 'Y', 'B', 'Z']):
            escape = '\033[96m' # Blue
        elif (text in ['D', 'E']):
            escape = '\033[92m' # Green
        elif (text in ['X', '*']):
            escape = '\033[93m' # Yellow
        else:
            return text
        return escape + text + '\033[0m'

    def AlertMessage(self, text, severity='low'):
        """
        Display an alert message with a tag and color corresponding to the severity of the alert ('low', 'medium', 'high')
        """
        if (severity == 'high'):
            escape = '\033[91m' # Red
            tag = '!!!'
        elif (severity == 'medium'):
            escape = '\033[93m' # Yellow
            tag = '***'
        else:
            escape = '\033[92m' # Green
            tag = '   '
        print escape + tag, text, tag + '\033[0m'

    def Show(self, column=0):
        """
        Displays 100 columns of the alignment beginning at 'column'
        """
        if column < 0:
            column = 0
        row = 0
        marker = '|    :    ' * 10
        spacer = ' ' * 15
        markerRow = spacer + marker
        if (self.translated == False):
            indexRow = spacer
            for index in range(column, column + 100, 10):
                indexRow = indexRow + str(index).ljust(10)
            print indexRow
            print markerRow
            for sequence in self.alignment[:,column:column + 100]:
                print '%2d) %10s' % (row, sequence.id),
                dnaSequence = ''
                for nucleotide in str(sequence.seq):
                    dnaSequence += self.ColorizeDNA(nucleotide)
                print dnaSequence,
                if (column + 100 < self.alignment.get_alignment_length()):
                    print '...'
                else:
                    print
                row += 1
            print markerRow
            print indexRow
        else:
            indexRow = spacer
            for index in range(column / 3, (column / 3) + 100, 10):
                indexRow = indexRow + str(index).ljust(10)
            print indexRow
            print markerRow
            for sequence in self.alignment[:, column:column + 300]:
                proteinSequence = ''
                for codonPosition in range(0, len(sequence), 3):
                    codon = sequence.seq[codonPosition:codonPosition + 3]
                    if (str(codon) == '---'):
                        proteinSequence += '-'
                    elif ('-' in codon):
                        proteinSequence += '?'
                    else:
                        proteinSequence += self.ColorizeAA(str(codon.translate(table = self.translationTable)))
                print '%2d) %10s %s' % (row, sequence.id, proteinSequence),
                if (column + 300 < self.alignment.get_alignment_length()):
                    print '...'
                else:
                    print
                row += 1
            print markerRow
            print indexRow
        self.displayedColumn = column

    def BackupAlignment(self):
        """
        Stores the current alignment state to the alignment change history
        """
        self.alignmentHistory.append(self.alignment[:,:])

    def UndoChanges(self):
        """
        Reverts to the previous state in the alignment change history.  Does not affect which column index is displayed or whether the sequence is displayed as translated or not, since those are not changes to the alignment.
        """
        if (len(self.alignmentHistory) > 1):
            self.alignmentHistory.pop()
            self.alignment = self.alignmentHistory[-1][:,:]
            self.Show(self.displayedColumn)
        else:
            self.AlertMessage('Nothing to undo.', 'low')

    def DeleteRange(self, rangeText, silent=False):
        """
        Removes a row and column range from the alignment
        """
        startTaxon, stopTaxon, startColumn, stopColumn = self.ParseIndex(rangeText)
        if (self.translated == True):
            startColumn = startColumn * 3
            stopColumn = (stopColumn * 3) + 2
        if (startTaxon >= 0): #Make sure we had a valid range
            changeLength = 0
            deleteTaxon = False
            if ((startColumn == 0) & (stopColumn == len(self.alignment[0]) - 1)):
                deleteTaxon = True
            if ((startTaxon > 0) | (stopTaxon < len(self.alignment) - 1)):
                changeLength = (stopColumn - startColumn) + 1
            taxon = 0
            newSequences = []
            for Sequence in self.alignment:
                if (taxon in range(startTaxon, stopTaxon + 1)):
                    if (not deleteTaxon):
                        if (startColumn > 0):
                            Sequence.seq = Sequence.seq[:startColumn] + Sequence.seq[stopColumn + 1:]
                        else:
                            Sequence.seq = Sequence.seq[stopColumn + 1:]
                        if (changeLength):
                            Sequence.seq = Sequence.seq + Seq('-' * changeLength)
                        newSequences.append(Sequence)
                else:
                    newSequences.append(Sequence)
                taxon += 1
            self.alignment = MultipleSeqAlignment(newSequences)
            if (not silent):
                self.Show(self.displayedColumn)
                self.BackupAlignment()

    def ModifyRange(self, rangeText, nucleotide='-'):
        """
        Changes the nucleotides in a row and column range to a specified nucleotide.  Has no effect when the alignment is translated since the corresponding change to the underlying nucleotide alignment would be ambiguous at best.
        """
        nucleotide = nucleotide.upper()
        if (self.translated == True):
            self.AlertMessage("Can't modify protein sequences.", 'medium')
        elif (nucleotide not in ['A', 'G', 'C', 'T', 'R', 'K', 'S', 'W', 'M', 'Y', 'D', 'V', 'B', 'H', 'N', '-']):
            self.AlertMessage('Invalid nucleotide.  (only AGTC- and IUB nucleotide codes are permitted)', 'high')
        else:
            startTaxon, stopTaxon, startColumn, stopColumn = self.ParseIndex(rangeText)
            if (startTaxon >= 0): #Make sure we have a valid range
                taxon = 0
                newSequences = []
                modificationLength = (stopColumn - startColumn) + 1
                for Sequence in self.alignment:
                    if (taxon in range(startTaxon, stopTaxon + 1)):
                        if (startColumn > 0):
                            Sequence.seq = Sequence.seq[:startColumn] + Seq(nucleotide * modificationLength) + Sequence.seq[stopColumn + 1:]
                        else:
                            Sequence.seq = Seq(nucleotide * modificationLength) + Sequence.seq[stopColumn + 1:]
                    newSequences.append(Sequence)
                    taxon += 1
                self.alignment = MultipleSeqAlignment(newSequences)
                self.Show(self.displayedColumn)
                self.BackupAlignment()

    def InsertRange(self, rangeText):
        """
        Inserts a row and column range into the alignment and fills it with gaps ('-' for nucleotides or '---' for translated alignments)
        """
        startTaxon, stopTaxon, startColumn, stopColumn = self.ParseIndex(rangeText)
        if (self.translated == True):
            startColumn = startColumn * 3
            stopColumn = (stopColumn * 3) + 2
        if (startTaxon >= 0): #Make sure we had a valid range
            changeLength = (stopColumn - startColumn) + 1
            taxon = 0
            newSequences = []
            for Sequence in self.alignment:
                if (taxon in range(startTaxon, stopTaxon + 1)):
                    if (startColumn > 0):
                        Sequence.seq = Sequence.seq[:startColumn] + Seq('-' * changeLength) + Sequence.seq[startColumn:]
                    else:
                        Sequence.seq = Seq('-' * changeLength) + Sequence.seq[:]
                else:
                    Sequence.seq = Sequence.seq + Seq('-' * changeLength)
                newSequences.append(Sequence)
                taxon +=1
            self.alignment = MultipleSeqAlignment(newSequences)
            self.Show(self.displayedColumn)
            self.BackupAlignment()

    def Jump(self, column):
        """
        Moves the displayed column to a specified column index
        """
        if (self.translated == True):
            column = column * 3
        self.Show(column)

    def ScrollRight(self, offset=100):
        """
        Scroll the display 'offset' columns to the right
        """
        if (self.translated == True):
            offset = offset * 3
        self.Show(self.displayedColumn + offset)

    def ScrollLeft(self, offset=100):
        """
        Scroll the display 'offset' columns to the left
        """
        if (self.translated == True):
            offset = offset * 3
        self.Show(self.displayedColumn - offset)

    def Reverse(self):
        """
        Reverses the order of the columns in the alignment.  Has no effect on translated sequences.
        """
        if (self.translated == False):
            self.alignment = self.alignment[:,::-1]
            self.Show(self.displayedColumn)
            self.BackupAlignment()
        else:
            self.AlertMessage("Can't reverse protein sequences.", 'medium')

    def Complement(self):
        """
        Give the complement of the alignment.  Has no effect on translated sequences.
        """
        if (self.translated == False):
            for i in range(len(self.alignment)):
                self.alignment[i].seq = self.alignment[i].seq.complement()
            self.Show(self.displayedColumn)
            self.BackupAlignment()
        else:
            self.AlertMessage("Can't complement protein sequences.", 'medium')

    def ReverseComplement(self):
        """
        Reverse and complement the alignment.  Has no effect on translated sequences.
        """
        if (self.translated == False):
            for i in range(len(self.alignment)):
                self.alignment[i].seq = self.alignment[i].seq.reverse_complement()
            self.Show(self.displayedColumn)
            self.BackupAlignment()
        else:
            self.AlertMessage("Can't reverse-complement protein sequences.", 'medium')

    def Translate(self, translationTable=11):
        """
        Switch to displaying and manipulating the sequence as a protein sequence.
        Still works on translated sequences if a different translation table is specified; otherwise it back-translates the already translated sequence.
        """
        if ((self.translated == False) | ((self.translated == True) & (self.translationTable != translationTable))):
            self.translated = True
            self.translationTable = translationTable
            self.displayedColumn = self.displayedColumn - (self.displayedColumn % 3)
            self.Show(self.displayedColumn)
        else:
            self.BackTranslate()

    def BackTranslate(self):
        """
        Revert to displaying and manipulating the sequence as a DNA sequence.  Has no effect if the sequence is already DNA.
        """
        if (self.translated == True):
            self.translated = False
            self.Show(self.displayedColumn)
        else:
            self.AlertMessage("Can't back-translate.  Alignment contains DNA sequences", 'medium')

    def Save(self, fileName='alignment.phy', alignmentFormat='phylip'):
        """
        Write alignment to disk
        """
        AlignIO.write(self.alignment, fileName, alignmentFormat)
        self.AlertMessage('Saved alignment to ' + fileName + ' in ' + alignmentFormat + ' format.', 'low')

    def CleanUp(self):
        """
        Condense the alignment by removing any columns that contain only gaps ('-') across all taxa.
        """
        blankColumnPattern = re.compile('^-*$')
        blankColumns = []
        for columnIndex in range(self.alignment.get_alignment_length()):
            columnValues = self.alignment[:,columnIndex]
            match = blankColumnPattern.search(columnValues)
            if (match):
                blankColumns.append(str(columnIndex))
        for column in blankColumns[::-1]:
            self.DeleteRange(',' + str(column), True)
        self.Show(self.displayedColumn)
        self.BackupAlignment()
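
The column-range deletion above boils down to slicing the alignment around the unwanted columns and joining the pieces. Below is a minimal standalone sketch of that idea (the helper name drop_columns and the toy records are illustrative, not from the class above; it assumes a Biopython version whose alignments support column slicing and '+' concatenation):

from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def drop_columns(aln, start, stop):
    # remove columns start..stop (inclusive) by slicing around them and
    # concatenating the two halves column-wise
    return aln[:, :start] + aln[:, stop + 1:]

toy = MultipleSeqAlignment([
    SeqRecord(Seq("ACGT--ACGT"), id="taxon1"),
    SeqRecord(Seq("ACGTAAACGT"), id="taxon2"),
])
trimmed = drop_columns(toy, 4, 5)
assert trimmed.get_alignment_length() == 8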
Example #30
class GenericAlign(object):
    """docstring for Align"""
    def __init__(self, input):
        self.input = input
        self.alignment = None
        self.trimmed_alignment = None
        self.perfect_trimmed_alignment = None

    def _clean(self, outtemp):
        if type(outtemp) is list:
            for f in outtemp:
                os.remove(f)
        else:
            os.remove(outtemp)
        # cleanup temp file
        try:
            os.remove(self.input)
        except:
            pass

    def _find_ends(self, forward=True):
        """determine the first (or last) position where all reads in an 
        alignment start/stop matching"""
        if forward:
            theRange = xrange(self.alignment.get_alignment_length())
        else:
            theRange = reversed(xrange(self.alignment.get_alignment_length()))
        for col in theRange:
            if '-' in self.alignment.get_column(col):
                pass
            else:
                break
        return col

    def _base_checker(self, bases, sequence, loc):
        """ensure that any trimming that occurs does not start beyong the
        end of the sequence being trimmed"""
        # deal with the case where we just want to measure out from the
        # middle of a particular sequence
        if len(loc) == 1:
            loc = (loc, loc)
        if not bases > len(sequence.seq[:loc[0]]) and \
            not bases > len(sequence.seq[loc[1]:]):
            return True

    def _record_formatter(self, temp):
        """return a string formatted as a biopython sequence record"""
        temp_record = SeqRecord(temp)
        return temp_record

    def _alignment_summary(self, alignment):
        """return summary data for an alignment object using the AlignInfo
        class from BioPython"""
        summary = AlignInfo.SummaryInfo(alignment)
        consensus = summary.dumb_consensus()
        return summary, consensus

    def _read(self, format):
        """read an alignment from the CLI - largely for testing purposes"""
        self.alignment = AlignIO.read(open(self.input, 'rU'), format)

    def get_probe_location(self):
        '''Pull the probe sequence from an alignment object and determine its position
        within the read'''
        # probe at bottom => reverse order
        for record in self.alignment[::-1]:
            if record.id == 'probe':
                start = re.search('^-*', str(record.seq))
                end = re.search('-*$', str(record.seq))
                # should be first record
                break
        # ooh, this seems so very backwards
        self.ploc = (
            start.end(),
            end.start(),
        )

    def running_average(self,
                        window_size,
                        threshold,
                        proportion=0.3,
                        k=None,
                        running_probe=False):
        # iterate across the columns of the alignment and determine presence
        # or absence of base-identity in the column
        differences = []
        members = len(self.alignment)
        if not running_probe:
            for column in xrange(self.alignment.get_alignment_length()):
                column_values = self.alignment[:, column]
                # get the count of different bases in a column (converting
                # it to a set gets only the unique values)
                column_list = list(column_values)
                # use proportional removal of gaps
                if column_list.count('-') <= int(round(proportion * members,
                                                       0)):
                    column_list = [i for i in column_list if i != '-']
                #pdb.set_trace()
                if len(set(column_list)) > 1:
                    differences.append(0)
                else:
                    differences.append(1)
        else:
            for column in xrange(self.alignment.get_alignment_length()):
                column_values = list(self.alignment[:, column])
                # drop the index of the probe from the column_values
                del column_values[k]
                # get the count of different bases in a column (converting
                # it to a set gets only the unique values).
                #
                # no need to convert to a list here because it is already one
                if len(set(column_values)) > 1:
                    differences.append(0)
                else:
                    differences.append(1)
        differences = numpy.array(differences)
        weight = numpy.repeat(1.0, window_size) / window_size
        running_average = numpy.convolve(
            differences, weight)[window_size - 1:-(window_size - 1)]
        good = numpy.where(running_average >= threshold)[0]
        # remember to add window size onto end of trim
        try:
            start_clip, end_clip = good[0], good[-1] + window_size
        except IndexError:
            start_clip, end_clip = None, None
        return start_clip, end_clip

    def trim_alignment(self,
                       method='edges',
                       remove_probe=None,
                       bases=None,
                       consensus=True,
                       window_size=20,
                       threshold=0.5,
                       proportion=0.3):
        """Trim the alignment"""
        if method == 'edges':
            # find edges of the alignment
            start = self._find_ends(forward=True)
            end = self._find_ends(forward=False)
        elif method == 'running':
            start, end = self.running_average(window_size,
                                              threshold,
                                              proportion=proportion)
        elif method == 'running-probe':
            # get position of probe
            for k, v in enumerate(self.alignment):
                if v.name == 'probe':
                    break
                else:
                    pass
            start, end = self.running_average(window_size, threshold,
                                              proportion, k, True)
        #pdb.set_trace()
        if method == 'notrim':
            self.trimmed_alignment = self.alignment
        else:
            # create a new alignment object to hold our alignment
            self.trimmed_alignment = MultipleSeqAlignment(
                [], Gapped(IUPAC.ambiguous_dna, "-"))
            for sequence in self.alignment:
                # ignore the probe sequence we added
                if (method == 'edges' or method == 'running'
                        or method == 'running-probe') and not remove_probe:
                    # Biopython's public API here only lets us pass the Alignment object
                    # a name and str(sequence), so elsewhere in this method we fall back
                    # on its private _records list.
                    if start >= 0 and end:
                        self.trimmed_alignment.append(sequence[start:end])
                    else:
                        self.trimmed_alignment = None
                        break
                elif method == 'static' and not remove_probe and bases:
                    # get middle of alignment and trim out from that - there's a
                    # weakness here in that we are not actually locating the probe
                    # region, we're just locating the middle of the alignment
                    mid_point = len(sequence) / 2
                    if self._base_checker(bases, sequence, mid_point):
                        self.trimmed_alignment._records.append(
                            sequence[mid_point - bases:mid_point + bases])
                    else:
                        self.trimmed_alignment = None
                elif method == 'static' and not remove_probe and bases and self.ploc:
                    # get middle of alignment and trim out from that - there's a
                    # weakness here in that we are not actually locating the probe
                    # region, we're just locating the middle of the alignment
                    if self._base_checker(bases, sequence, self.ploc):
                        self.trimmed_alignment._records.append(
                            sequence[self.ploc[0] - bases:self.ploc[1] +
                                     bases])
                    else:
                        self.trimmed_alignment = None
                elif remove_probe and self.ploc:
                    # we have to drop to sequence level to add sequence slices
                    # where we basically slice around the probes location
                    temp = sequence.seq[:self.ploc[0]] + sequence.seq[self.
                                                                      ploc[1]:]
                    self.trimmed_alignment._records.append( \
                            self._record_formatter(temp)
                        )
                elif method == 'static' and remove_probe and bases and self.ploc:
                    if self._base_checker(bases, sequence, self.ploc):
                        temp = sequence.seq[self.ploc[0] - bases:self.ploc[0]] + \
                                sequence.seq[self.ploc[1]:self.ploc[1] + bases]
                        self.trimmed_alignment._records.append( \
                                self._record_formatter(temp)
                            )
                    else:
                        self.trimmed_alignment = None
        # build a dumb consensus
        if consensus and self.trimmed_alignment:
            self.trimmed_alignment_summary, self.trimmed_alignment_consensus = \
                self._alignment_summary(self.trimmed_alignment)
        if not self.trimmed_alignment:
            print "\tAlignment {0} dropped due to trimming".format(
                self.alignment._records[0].description.split('|')[1])

    def trim_ambiguous_bases(self):
        """snip ambiguous bases from a trimmed_alignment"""
        ambiguous_bases = []
        # do this by finding all ambiguous bases and then snipping the largest
        # chunk with no ambiguous bases from the entire alignment
        if not self.trimmed_alignment:
            self.perfect_trimmed_alignment = self.trimmed_alignment
        else:
            for column in xrange(
                    0, self.trimmed_alignment.get_alignment_length()):
                if 'N' in self.trimmed_alignment[:, column]:
                    ambiguous_bases.append(column)
            maximum = 0
            maximum_pos = None
            #pdb.set_trace()
            if not ambiguous_bases:
                self.perfect_trimmed_alignment = self.trimmed_alignment
            if ambiguous_bases:
                # prepend and append the start and end of the sequence so that the
                # chunks outside the first and last ambiguous-base runs are also considered
                ambiguous_bases.insert(0, 0)
                ambiguous_bases.append(
                    self.trimmed_alignment.get_alignment_length() - 1)
                # create a new alignment object to hold our alignment
                self.perfect_trimmed_alignment = \
                    MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
                for pos in xrange(len(ambiguous_bases)):
                    if pos + 1 < len(ambiguous_bases):
                        difference = ambiguous_bases[pos + 1] - \
                            ambiguous_bases[pos]
                        if difference > maximum:
                            maximum = difference
                            maximum_pos = (pos, pos + 1)
                    else:
                        pass
                # make sure we catch cases where there is no best block
                if maximum_pos:
                    for sequence in self.trimmed_alignment:
                        self.perfect_trimmed_alignment.append(
                            sequence[ambiguous_bases[maximum_pos[0]] +
                                     1:ambiguous_bases[maximum_pos[1]]])
                else:
                    self.perfect_trimmed_alignment = None
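
# A standalone sketch (illustrative names, not part of the class above) of the
# running-average idea in GenericAlign.running_average: score each column 1 if its
# bases agree and 0 otherwise, smooth the 0/1 series with a uniform window via
# numpy.convolve, and keep the span where the smoothed identity clears a threshold.
import numpy

def smoothed_identity_clip(identity, window_size=3, threshold=0.9):
    identity = numpy.array(identity, dtype=float)
    weight = numpy.repeat(1.0, window_size) / window_size
    running = numpy.convolve(identity, weight)[window_size - 1:-(window_size - 1)]
    good = numpy.where(running >= threshold)[0]
    if len(good) == 0:
        return None, None
    return int(good[0]), int(good[-1] + window_size)

# columns 2..7 agree, so the clipped region is (2, 8)
print(smoothed_identity_clip([0, 0, 1, 1, 1, 1, 1, 1, 0, 0]))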
def find_hypermutants(aln, thres=-2):
    '''
    custom routine to find hypermutated sequences. It splits the alignment into sub-alignments:
    RNA, good sequences, hypermutated, and suspicious. In addition, it returns the subset of
    sequences that translate without a stop codon (assuming the p17 amplicon).
    '''
    isRNA = np.array([seq.id[:4]=="days" for seq in aln], dtype=bool)
    RNAaln = MultipleSeqAlignment([aln[i].upper() for i in np.where(isRNA)[0]])
    DNAaln = MultipleSeqAlignment([aln[i].upper() for i in np.where(~isRNA)[0]])

    # load the RNA SNP frequencies to determine positions variable at the RNA level
    # those are disregarded for the hypermutation classification
    RNAaf = np.zeros((len(alpha), RNAaln.get_alignment_length()))
    for seq in RNAaln:
        nucs = np.fromstring(str(seq.seq).upper(), 'S1')
        freq = float(seq.description.split('frequency_')[1].split('%')[0])*0.01
        for ni,nuc in enumerate(alpha):
            RNAaf[ni, nucs==nuc]+=freq
    RNAaf/=RNAaf.sum(axis=0)

    # if the maximal allele frequency is above 0.99, positions are considered conserved
    conserved_pos = RNAaf[:4].max(axis=0)>0.99
    consensus = np.array([alpha[ai] for ai in RNAaf.argmax(axis=0)])
    mut_hist = {'good':[], 'hyper':[], 'suspicious':[]}
    DNAaln_array = np.array(DNAaln)
    good_seqs = []
    hyper_muts = []
    suspicious = []
    nostop = []
    mut_dict = {}
    ii=0
    for a in alpha:
        for b in alpha:
            if a!=b:
                mut_dict[a+'->'+b] = ii
                ii+=1

    for si,seq in enumerate(DNAaln):
        muts = (consensus!=DNAaln[si])&conserved_pos&(DNAaln_array[si]!='-')
        tmp = defaultdict(int)
        #print(seq.name, np.where(muts)[0])
        total = muts.sum()
        mut_counts = np.zeros(30)
        for mi in np.where(muts)[0]:
            mut = consensus[mi]+'->'+DNAaln[si,mi]
            tmp[mut]+=1
            mut_counts[mut_dict[mut]]+=1
        if total<10 and (total<4 or tmp['G->A']<0.5*total):
            good_seqs.append(seq)
            mut_hist['good'].append(mut_counts)
        elif tmp['G->A']>=0.5*total:
            hyper_muts.append(seq)
            mut_hist['hyper'].append(mut_counts)
        else:
            suspicious.append(seq)
            mut_hist['suspicious'].append(mut_counts)
        if total<20 and seq.seq.ungap('-')[20:].translate().count('*')==0:
            nostop.append(seq)
    for k in mut_hist:
        mut_hist[k] = np.array(mut_hist[k])
    return RNAaln, MultipleSeqAlignment(good_seqs), MultipleSeqAlignment(hyper_muts),\
            MultipleSeqAlignment(suspicious), MultipleSeqAlignment(nostop), mut_hist
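
A simplified, hypothetical illustration of the classification rule above (toy sequences, not real data): differences to the consensus are counted, and a sequence is flagged as hypermutated when G->A changes account for at least half of them; the full routine additionally restricts the comparison to conserved, non-gap positions and treats low-divergence sequences as good.

import numpy as np

consensus = np.array(list("GGGAGGTGCA"))
query = np.array(list("AGAAGATGCA"))

diff_positions = np.where(consensus != query)[0]
g_to_a = sum(1 for i in diff_positions if consensus[i] == 'G' and query[i] == 'A')
total = len(diff_positions)
is_hyper = total > 0 and g_to_a >= 0.5 * total
print(total, g_to_a, is_hyper)  # 3 3 True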
Example #32
    #print length
    for m in missed:
        if partnum == "-prot":
            temp.append(SeqRecord(Seq("X"*length, alphabet = generic_protein), id=m)) #add dummies
        else:
            temp.append(SeqRecord(Seq("?"*(length), Gapped(IUPAC.ambiguous_dna)), id=m)) #add dummies
    counter = 0
    if partnum == "-prot":
        temp2 = MultipleSeqAlignment([], alphabet = generic_protein)
    else:
        temp2 = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna))
    for aliseq in align:
        for tempseq in temp:
            if aliseq.id == tempseq.id:
                temp2.append(aliseq + tempseq)
    start = align.get_alignment_length()+1
    end = align.get_alignment_length()+length
    prog = "working on partition "+str(fn)+": starts "+str(start)+", ends "+str(end)
    sys.stdout.write(prog+"\r")
    sys.stdout.flush()
    align = temp2
    counter += align.get_alignment_length()
    if pf2opt == "-pf2y":
        if partnum == "-3":
            print >> pf2cfg, fn[:-4]+"_1 = "+str(start)+" - "+str(end)+"\\3;"
            print >> pf2cfg, fn[:-4]+"_2 = "+str(start+1)+" - "+str(end)+"\\3;"
            print >> pf2cfg, fn[:-4]+"_3 = "+str(start+2)+" - "+str(end)+"\\3;"
        elif partnum == "-1":
            print >> pf2cfg, fn[:-4]+" = "+str(start)+" - "+str(end)+";"
        elif partnum == "-prot":
            print >> pf2cfg, fn[:-4]+" = "+str(start)+" - "+str(end)+";"
Example #33
# standard library
import os

# biopython
from Bio import Alphabet
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.Align import AlignInfo
from Bio import AlignIO
from Bio.SubsMat import FreqTable
from Bio.Align import MultipleSeqAlignment

#Very simple tests on an empty alignment
alignment = MultipleSeqAlignment([], Alphabet.generic_alphabet)
assert alignment.get_alignment_length() == 0
assert len(alignment) == 0
del alignment

#Basic tests on simple three string alignment
alignment = MultipleSeqAlignment([], Alphabet.generic_alphabet)
letters = "AbcDefGhiJklMnoPqrStuVwxYz"
alignment.append(SeqRecord(Seq(letters), id="mixed"))
alignment.append(SeqRecord(Seq(letters.lower()), id="lower"))
alignment.append(SeqRecord(Seq(letters.upper()), id="upper"))
assert alignment.get_alignment_length() == 26
assert len(alignment) == 3
assert str(alignment[0].seq) == letters
assert str(alignment[1].seq) == letters.lower()
assert str(alignment[2].seq) == letters.upper()
assert alignment[0].id == "mixed"
Example #34
toRemove = open(args.taxa)

TaxatoRemove = []

for line in toRemove:
    taxon = line.strip("\n")
    TaxatoRemove.append(taxon)

keepers = []

for item in allTaxa:
    if item not in TaxatoRemove:
        keepers.append(seq_dict[item])

# clean out gaps created by removing sequences if sequences are part of a multiple sequence alignment
if 'True' in args.alignment:
    rawAlignment = MultipleSeqAlignment(keepers)
    goodColumns = []
    for x in range(0, rawAlignment.get_alignment_length()):
        column = rawAlignment[:, x]
        if column.count("-") < (len(rawAlignment) - 2):
            slice = rawAlignment[:, x:x + 1]
            goodColumns.append(slice)
    goodColumnsAlignment = rawAlignment[:, 0:0]
    for column in goodColumns:
        goodColumnsAlignment = goodColumnsAlignment + column
    AlignIO.write(goodColumnsAlignment, args.output, "fasta")
# otherwise, just write the sequences
else:
    SeqIO.write(keepers, args.output, "fasta")
Example #35
max_gaps = gapPercent * Query_len

#Select the sequences
selected_sequences = []
for r in alignment:
    if r.seq.count('-') < max_gaps:
        selected_sequences += [r]

print 'Selected sequences: ', len(selected_sequences)
new_alignment = MultipleSeqAlignment(selected_sequences)

if Remove_gap_only_columns:
    l = len(selected_sequences)
    ee = 0
    final_alignment = []
    for i in range(new_alignment.get_alignment_length()):
        s = new_alignment[:, i]
        if s.count('.') == l:
            ee += 1
        else:
            if type(final_alignment) == list:
                final_alignment = new_alignment[:, i:i + 1]
            else:
                final_alignment += new_alignment[:, i:i + 1]
    print 'Removed gap only columns: ', ee
    print AlignIO.write(final_alignment, out_file, 'fasta')
    print final_alignment
elif Remove_all_insertions:
    import re
    handle = open(out_file, 'w+')
    for a in new_alignment:
Example #36
        alignmentStart = AlignIO.read(open(filein, "r"), "fasta")
        # create a new alignment containing only the desired strains:
        keepListRecord = []

        for record in alignmentStart:
            if record.id not in listKeepSouche and args.listKeepFile == "ALL":
                listKeepSouche.append(record.id)
            #print(record.id)
            if record.id in listKeepSouche:
                keepListRecord.append(record)
                tableauSoucheName.append(record.id)
                if record.id not in dicoSeqSNP.keys():
                    dicoSeqSNP[record.id] = ""
        alignment = MultipleSeqAlignment(keepListRecord)
        lenAlignement = int(alignment.get_alignment_length())
        #print(alignment)
        #print(tableauSoucheName)
        #print(len(tableauSoucheName))

        for indice in range(0, lenAlignement):
            tab = list(alignment[:, indice])
            #print(tab)
            nbO = tab.count(tab[0])
            nbA = tab.count("A")
            nbC = tab.count("C")
            nbT = tab.count("T")
            nbG = tab.count("G")
            nbN = tab.count("N") + tab.count("n")
            nbGap = tab.count("-")
            sommeACTG = nbA + nbC + nbT + nbG
Example #37
class sequence_set(object):
    """sequence_set subsamples a set of sequences, aligns them and exports variability statistics"""
    def __init__(self, fname=None, reference_seq=None, **kwarks):
        super(sequence_set, self).__init__()
        self.kwarks = kwarks
        self.nthreads = 2
        if fname is not None and os.path.isfile(fname):
            with myopen(fname) as seq_file:
                self.all_seqs = {
                    x.name: x
                    for x in SeqIO.parse(seq_file, 'fasta')
                }
        elif 'virus' in kwarks:
            self.from_vdb(kwarks['virus'])
        else:
            print('no input sequences found -- empty sequence set')
            return

        if 'run_dir' not in kwarks:
            import random
            self.run_dir = '_'.join([
                'temp',
                time.strftime('%Y%m%d-%H%M%S', time.gmtime()),
                str(random.randint(0, 1000000))
            ])
        else:
            self.run_dir = kwarks['run_dir']

        if reference_seq is not None:
            if type(reference_seq) is str and reference_seq in self.all_seqs:
                self.reference_seq = self.all_seqs[reference_seq]
            else:
                self.reference_seq = reference_seq
        else:
            self.reference_seq = None

    def parse(self, fields, sep='|', strip='_'):
        '''
        split the sequence description and add annotations to sequences
        '''
        for seq in self.all_seqs.values():
            if not hasattr(seq, "attributes"): seq.attributes = {}
            words = map(lambda x: x.strip(strip),
                        seq.description.replace(">", "").split(sep))
            for ii, val in enumerate(words):
                if ii in fields:
                    if val not in ["", "-"]:
                        seq.attributes[fields[ii]] = val
                    else:
                        seq.attributes[fields[ii]] = ""
        if 'strain' in fields.values():
            self.all_seqs = {
                seq.attributes['strain']: seq
                for seq in self.all_seqs.values()
            }
            for seq in self.all_seqs.values():
                seq.id = seq.attributes['strain']
                seq.name = seq.attributes['strain']

    def ungap(self):
        '''
        remove previously existing gaps and make sure all sequences are upper case
        '''
        for seq in self.all_seqs.values():
            seq.seq = seq.seq.ungap('-').upper()

    def parse_date(self, fmts, prune=True):
        if not hasattr(self.all_seqs.values()[0], "attributes"):
            print("parse meta info first")
            return
        from datetime import datetime
        for seq in self.all_seqs.values():
            if 'date' in seq.attributes and seq.attributes['date'] != '':
                for fmt in fmts:
                    try:
                        if 'XX' in seq.attributes['date']:
                            min_date, max_date = ambiguous_date_to_date_range(
                                seq.attributes['date'], fmt)
                            seq.attributes['raw_date'] = seq.attributes['date']
                            seq.attributes['num_date'] = np.array(
                                (num_date(min_date), num_date(max_date)))
                            seq.attributes['date'] = min_date
                        else:
                            if callable(fmt):
                                tmp = fmt(seq.attributes['date'])
                            else:
                                try:
                                    tmp = datetime.strptime(
                                        seq.attributes['date'], fmt).date()
                                except:
                                    tmp = seq.attributes['date']
                            seq.attributes['raw_date'] = seq.attributes['date']
                            seq.attributes['num_date'] = num_date(tmp)
                            seq.attributes['date'] = tmp
                            break
                    except:
                        continue

        if prune:
            self.filter(func=lambda x: 'date' in x.attributes and type(
                x.attributes['date']) != str)

    def filter(self, func):
        self.all_seqs = {
            key: seq
            for key, seq in self.all_seqs.iteritems() if func(seq)
        }

    def clock_filter(self, root_seq=None, n_iqd=3, max_gaps=1.0, plot=False):
        '''
        remove sequences from the set that evolve much faster or slower than
        the majority. Regions with predominantly gaps can be removed, since
        they can skew the evolutionary rates.
        '''
        from Bio.Align import MultipleSeqAlignment
        if root_seq is None:  # use consensus
            af = calc_af(self.aln, nuc_alpha)
            root_seq = np.fromstring(nuc_alpha, 'S1')[af.argmax(axis=0)]
        if type(root_seq) == str and root_seq in self.sequence_lookup:
            root_seq = np.array(self.sequence_lookup[root_seq])
        if max_gaps < 1.0:
            af = calc_af(self.aln, nuc_alpha)
            good_pos = af[nuc_alpha.index('-')] < max_gaps
        else:
            good_pos = np.ones(self.aln.get_alignment_length(), dtype=bool)
        date_vs_distance = {}
        self.reference_aln = None
        for seq in self.aln:
            date_vs_distance[seq.id] = (seq.attributes['num_date'],
                                        np.mean(
                                            (np.array(seq) !=
                                             root_seq)[(np.array(seq) != '-')
                                                       & (root_seq != '-')
                                                       & good_pos]))
            if seq.id == self.reference.id:
                self.reference_aln = seq
        date_vs_distance_array = np.array(date_vs_distance.values())
        from scipy.stats import linregress, scoreatpercentile
        slope, intercept, rval, pval, stderr = linregress(
            date_vs_distance_array[:, 0], date_vs_distance_array[:, 1])
        print("distance vs time regression:", slope)
        residuals = (intercept + slope * date_vs_distance_array[:, 0]
                     ) - date_vs_distance_array[:, 1]
        IQD = scoreatpercentile(residuals, 75) - scoreatpercentile(
            residuals, 25)
        if plot:
            import matplotlib.pyplot as plt
            plt.ion()
            plt.scatter(date_vs_distance_array[:, 0],
                        date_vs_distance_array[:, 1],
                        c='g')
            bad_points = abs(intercept + slope * date_vs_distance_array[:, 0] -
                             date_vs_distance_array[:, 1]) > n_iqd * IQD
            plt.scatter(date_vs_distance_array[bad_points, 0],
                        date_vs_distance_array[bad_points, 1],
                        c='r')

        print("before clock filter:", len(self.aln))
        tmp = {
            seq.id: seq
            for seq in self.aln
            if abs(intercept + slope * date_vs_distance[seq.id][0] -
                   date_vs_distance[seq.id][1]) < n_iqd * IQD
        }
        if self.reference.id not in tmp and self.reference_aln is not None:
            print('adding reference again after clock filter')
            tmp[self.reference.id] = self.reference_aln
        self.aln = MultipleSeqAlignment(tmp.values())
        print("after clock filter:", len(self.aln))

    def subsample(self,
                  category=None,
                  priority=None,
                  threshold=None,
                  repeated=False,
                  forced_strains=[]):
        '''
        produce a useful set of sequences from the raw input.
        arguments:
        category  -- callable that assigns each sequence to a category for subsampling
        priority  -- callable that assigns each sequence a priority to be included in
                     the final sample. this is applied independently in each category
        threshold -- callable that determines the number of sequences from each category
                     that are included in the final set. Takes arguments cat and seq;
                     alternatively, it can be an int.
        forced_strains -- list of strain names that should always be included (set to high priority)
        '''
        # define filter criteria if not specified
        if category is None:
            category = lambda x: (x.attributes['date'].year, x.attributes[
                'date'].month)
        if priority is None:
            priority = lambda x: np.random.random()
        if threshold is None:
            threshold = lambda x: 5
        elif type(threshold) is int:
            print("using threshold:", threshold)
            tmp = threshold
            threshold = lambda x: tmp

        # if we do repeated subsampling, subsamples seqs, otherwise all_seqs
        self.sequence_categories = defaultdict(list)
        if repeated:
            seqs_to_subsample = self.seqs.values()
        else:
            seqs_to_subsample = self.all_seqs.values()

        # sort sequences into categories and assign priority score
        for seq in seqs_to_subsample:
            seq._priority = priority(seq)
            if seq.id in forced_strains:
                seq._priority = 1.0
            self.sequence_categories[category(seq)].append(seq)

        # sample and record the degree to which a category is under_sampled
        self.seqs = {}
        for cat, seqs in self.sequence_categories.iteritems():
            under_sampling = min(1.00, 1.0 * len(seqs) / threshold(cat))
            for s in seqs:
                s.under_sampling = under_sampling
            seqs.sort(key=lambda x: x._priority, reverse=True)
            self.seqs.update(
                {seq.id: seq
                 for seq in seqs[:threshold((cat, seqs))]})

    def align(self):
        '''
        align sequences using mafft
        '''
        from Bio import AlignIO
        from Bio.Align import MultipleSeqAlignment
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        ref_in_set = self.reference_seq.name in self.seqs
        if ref_in_set:
            out_seqs = self.seqs.values()
        else:
            out_seqs = self.seqs.values() + [self.reference_seq]
        print("align: reference in set", ref_in_set)
        SeqIO.write(out_seqs, "temp_in.fasta", "fasta")
        os.system("mafft --anysymbol --thread " + str(self.nthreads) +
                  " temp_in.fasta > temp_out.fasta")

        tmp_aln = AlignIO.read('temp_out.fasta', 'fasta')
        self.sequence_lookup = {seq.id: seq for seq in tmp_aln}
        # add attributes to alignment
        for seqid, seq in self.seqs.iteritems():
            self.sequence_lookup[seqid].attributes = seq.attributes
        self.aln = MultipleSeqAlignment([
            s for s in tmp_aln
            if s.name != self.reference_seq.name or ref_in_set
        ])
        os.chdir('..')
        remove_dir(self.run_dir)

    def codon_align(self, alignment_tool="mafft", prune=True, verbose=0):
        ''' takes a nucleotide alignment, translates it, aligns the amino acids, and pads the gaps.
        Note that this suppresses any compensated frameshift mutations.

        Parameters:
        - alignment_tool: ['mafft', 'muscle'] the commandline tool to use
        '''
        from Bio import AlignIO, SeqIO
        from Bio.SeqRecord import SeqRecord
        make_dir(self.run_dir)
        os.chdir(self.run_dir)

        # translate
        aa_seqs = {}
        bad_seq = 0
        for seq in self.seqs.values():
            tempseq = seq.seq.translate()
            # use only sequences that translate without trouble
            if '*' not in str(tempseq)[:-1] or prune == False:
                aa_seqs[seq.id] = SeqRecord(tempseq, id=seq.id)
                aa_seqs[seq.id].attributes = seq.attributes
            else:
                if verbose: print(seq.id, "has premature stops, discarding")
            bad_seq += '*' in str(tempseq)[:-1]

        print('Number of sequences with stops:', bad_seq, 'out of total',
              len(self.seqs))
        tmpfname = 'temp_in.fasta'
        SeqIO.write(aa_seqs.values(), tmpfname, 'fasta')

        if alignment_tool == 'muscle':
            from Bio.Align.Applications import MuscleCommandline
            cline = MuscleCommandline(input=tmpfname,
                                      out=tmpfname[:-5] + 'aligned.fasta')
            cline()
            aln_aa = AlignIO.read(tmpfname[:-5] + 'aligned.fasta', "fasta")
        elif alignment_tool == 'mafft':
            from Bio.Align.Applications import MafftCommandline
            from StringIO import StringIO
            mafft_cline = MafftCommandline(input=tmpfname)
            stdout, stderr = mafft_cline()
            aln_aa = AlignIO.read(StringIO(stdout), "fasta")
        else:
            print('Alignment tool not supported:', alignment_tool)
            return

        #generate nucleotide alignment
        self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
        self.sequence_lookup = {seq.id: seq for seq in self.aln}
        # add attributes to alignment
        for seq in self.seqs.values():
            if seq.id in self.sequence_lookup:
                self.sequence_lookup[seq.id].attributes = seq.attributes
        os.chdir('..')
        remove_dir(self.run_dir)

    def strip_non_reference(self):
        ungapped = np.array(
            self.sequence_lookup[self.reference_seq.name]) != '-'
        from Bio.Seq import Seq
        for seq in self.aln:
            seq.seq = Seq("".join(np.array(seq)[ungapped]))

    def diversity_statistics(self):
        ''' calculate alignment entropy of nucleotide and optionally protein alignments '''
        if not hasattr(self, "aln"):
            print("calculate alignment first")
            return
        aln_array = np.array(self.aln)
        self.af = {'nuc': calc_af(self.aln, nuc_alpha)}
        tmp_af = self.af['nuc'][:-2] / self.af['nuc'][:-2].sum(axis=0)
        self.entropy = {'nuc': -(tmp_af * np.log(tmp_af + TINY)).sum(axis=0)}

        if hasattr(self, "translations"):
            for prot, aln in self.translations.iteritems():
                self.af[prot] = calc_af(aln, aa_alpha)
                tmp_af = self.af[prot][:-2] / self.af[prot][:-2].sum(axis=0)
                self.entropy[prot] = -(tmp_af *
                                       np.log(tmp_af + TINY)).sum(axis=0)

    def translate(self, proteins=None):
        '''
        make alignment of translations
        '''
        from Bio.SeqFeature import FeatureLocation
        from Bio.Seq import Seq
        from Bio.Align import MultipleSeqAlignment
        if not hasattr(
                self, "proteins"
        ):  # generate dictionaries to hold annotation and translation
            self.translations = {}
            self.proteins = {}

        # add a default translation of the entire sequence unless otherwise specified
        if proteins is None and len(self.proteins) == 0:
            self.proteins.update({
                'cds':
                FeatureLocation(start=0,
                                end=self.aln.get_alignment_length(),
                                strand=1)
            })
        else:
            self.proteins.update(proteins)

        for prot in self.proteins:
            aa_seqs = []
            for seq in self.aln:
                try:
                    # soon not needed as future biopython version will translate --- into -
                    tmpseq = self.proteins[prot].extract(seq)
                    tmpseq.attributes = seq.attributes
                    tmpseq.seq = Seq(
                        str(
                            Seq(str(tmpseq.seq).replace(
                                '---', 'NNN')).translate()).replace('X', '-'))
                except:
                    tmpseq.seq = Seq(
                        str(
                            Seq("".join([
                                x if x in 'ACGT' else 'N'
                                for x in str(tmpseq.seq)
                            ])).translate()).replace('X', '-'))
                    print("Trouble translating", seq.id)
                aa_seqs.append(tmpseq)
            self.translations[prot] = MultipleSeqAlignment(aa_seqs)

    def export_diversity(self, fname='entropy.json'):
        '''
        write the alignment entropy of each alignment (nucleotide and translations) to file
        '''
        if not hasattr(self, "entropy"):
            self.diversity_statistics()
        entropy_json = {}
        for feat in self.entropy:
            S = [max(0, round(x, 4)) for x in self.entropy[feat]]
            n = len(S)
            if feat == 'nuc':
                entropy_json[feat] = {
                    'pos': range(0, n),
                    'codon': [x // 3 for x in range(0, n)],
                    'val': S
                }
            else:
                entropy_json[feat] = {
                    'pos': [x for x in self.proteins[feat]][::3],
                    'codon': [(x - self.proteins[feat].start) // 3
                              for x in self.proteins[feat]][::3],
                    'val':
                    S
                }
        write_json(entropy_json, fname, indent=None)
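
The per-column entropy computed by diversity_statistics can be sketched without the helpers (calc_af, nuc_alpha, TINY) that this example does not show; a minimal, assumed reimplementation with plain numpy:

import numpy as np

alphabet = np.array(list("ACGT-"))
rows = ["ACGTA", "ACGAA", "ACTTA"]
aln = np.array([list(r) for r in rows])

# per-column frequency of each character, shape (len(alphabet), n_columns)
af = np.array([(aln == c).mean(axis=0) for c in alphabet])

tiny = 1e-12  # avoids log(0), analogous to the TINY constant above
entropy = -(af * np.log(af + tiny)).sum(axis=0)
entropy = np.maximum(entropy, 0.0)  # clip tiny negative rounding error
print(np.round(entropy, 4))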
Пример #38
0
class GenericAlign(object):
    """docstring for Align"""
    def __init__(self, input):
        self.input = input
        self.alignment = None
        self.trimmed_alignment = None
        self.perfect_trimmed_alignment = None

    def _clean(self, outtemp):
        if type(outtemp) is list:
            for f in outtemp:
                os.remove(f)
        else:
            os.remove(outtemp)
        # cleanup temp file
        try:
            os.remove(self.input)
        except:
            pass

    def _find_ends(self, forward=True):
        """determine the first (or last) position where all reads in an 
        alignment start/stop matching"""
        if forward:
            theRange = xrange(self.alignment.get_alignment_length())
        else:
            theRange = reversed(xrange(self.alignment.get_alignment_length()))
        for col in theRange:
            if '-' in self.alignment.get_column(col):
                pass
            else:
                break
        return col

    def _base_checker(self, bases, sequence, loc):
        """ensure that any trimming that occurs does not start beyong the
        end of the sequence being trimmed"""
        # deal with the case where we just want to measure out from the
        # middle of a particular sequence
        if len(loc) == 1:
            loc = (loc, loc)
        if not bases > len(sequence.seq[:loc[0]]) and \
            not bases > len(sequence.seq[loc[1]:]):
            return True

    def _record_formatter(self, temp):
        """return a string formatted as a biopython sequence record"""
        temp_record = SeqRecord(temp)
        return temp_record

    def _alignment_summary(self, alignment):
        """return summary data for an alignment object using the AlignInfo
        class from BioPython"""
        summary = AlignInfo.SummaryInfo(alignment)
        consensus = summary.dumb_consensus()
        return summary, consensus

    def _read(self, format):
        """read an alignment from the CLI - largely for testing purposes"""
        self.alignment = AlignIO.read(open(self.input, 'rU'), format)

    def get_probe_location(self):
        '''Pull the probe sequence from an alignment object and determine its position
        within the read'''
        # probe at bottom => reverse order
        for record in self.alignment[::-1]:
            if record.id == 'probe':
                start = re.search('^-*', str(record.seq))
                end = re.search('-*$', str(record.seq))
                # should be first record
                break
        # ooh, this seems so very backwards
        self.ploc = (start.end(), end.start(),)

    def running_average(self, window_size, threshold, proportion=0.3, k=None, running_probe=False):
        # iterate across the columns of the alignment and determine presence
        # or absence of base-identity in the column
        differences = []
        members = len(self.alignment)
        if not running_probe:
            for column in xrange(self.alignment.get_alignment_length()):
                column_values = self.alignment[:, column]
                # get the count of different bases in a column (converting
                # it to a set gets only the unique values)
                column_list = list(column_values)
                # use proportional removal of gaps
                if column_list.count('-') <= int(round(proportion * members, 0)):
                    column_list = [i for i in column_list if i != '-']
                #pdb.set_trace()
                if len(set(column_list)) > 1:
                    differences.append(0)
                else:
                    differences.append(1)
        else:
            for column in xrange(self.alignment.get_alignment_length()):
                column_values = list(self.alignment[:, column])
                # drop the index of the probe from the column_values
                del column_values[k]
                # get the count of different bases in a column (converting
                # it to a set gets only the unique values).
                #
                # no need to convert to a list here because it is already one
                if len(set(column_values)) > 1:
                    differences.append(0)
                else:
                    differences.append(1)
        differences = numpy.array(differences)
        weight = numpy.repeat(1.0, window_size) / window_size
        running_average = numpy.convolve(differences, weight)[window_size - 1:-(window_size - 1)]
        good = numpy.where(running_average >= threshold)[0]
        # remember to add window size onto end of trim
        try:
            start_clip, end_clip = good[0], good[-1] + window_size
        except IndexError:
            start_clip, end_clip = None, None
        return start_clip, end_clip

    def trim_alignment(self, method='edges', remove_probe=None, bases=None, consensus=True, window_size=20, threshold=0.5, proportion=0.3):
        """Trim the alignment"""
        if method == 'edges':
            # find edges of the alignment
            start = self._find_ends(forward=True)
            end = self._find_ends(forward=False)
        elif method == 'running':
            start, end = self.running_average(window_size, threshold, proportion=proportion)
        elif method == 'running-probe':
            # get position of probe
            for k, v in enumerate(self.alignment):
                if v.name == 'probe':
                    break
                else:
                    pass
            start, end = self.running_average(window_size, threshold, proportion, k, True)
        #pdb.set_trace()
        if method == 'notrim':
            self.trimmed_alignment = self.alignment
        else:
            # create a new alignment object to hold our alignment
            self.trimmed_alignment = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-"))
            for sequence in self.alignment:
                # ignore the probe sequence we added
                if (method == 'edges' or method == 'running' or method == 'running-probe') and not remove_probe:
                    # it is totally retarded that biopython only gives us the option to
                    # pass the Alignment object a name and str(sequence).  Given this
                    # level of retardation, we'll fudge and use their private method
                    if start >= 0 and end:
                        self.trimmed_alignment.append(sequence[start:end])
                    else:
                        self.trimmed_alignment = None
                        break
                elif method == 'static' and not remove_probe and bases:
                    # get middle of alignment and trim out from that - there's a
                    # weakness here in that we are not actually locating the probe
                    # region, we're just locating the middle of the alignment
                    mid_point = len(sequence) / 2
                    if self._base_checker(bases, sequence, mid_point):
                        self.trimmed_alignment._records.append(
                                sequence[mid_point - bases:mid_point + bases]
                            )
                    else:
                        self.trimmed_alignment = None
                elif method == 'static' and not remove_probe and bases and self.ploc:
                    # get middle of alignment and trim out from that - there's a
                    # weakness here in that we are not actually locating the probe
                    # region, we're just locating the middle of the alignment
                    if self._base_checker(bases, sequence, self.ploc):
                        self.trimmed_alignment._records.append(
                                sequence[self.ploc[0] - bases:self.ploc[1] + bases]
                            )
                    else:
                        self.trimmed_alignment = None
                elif remove_probe and self.ploc:
                    # we have to drop to sequence level to add sequence slices
                    # where we basically slice around the probes location
                    temp = sequence.seq[:self.ploc[0]] + sequence.seq[self.ploc[1]:]
                    self.trimmed_alignment._records.append( \
                            self._record_formatter(temp)
                        )
                elif method == 'static' and remove_probe and bases and self.ploc:
                    if self._base_checker(bases, sequence, self.ploc):
                        temp = sequence.seq[self.ploc[0] - bases:self.ploc[0]] + \
                                sequence.seq[self.ploc[1]:self.ploc[1] + bases]
                        self.trimmed_alignment._records.append( \
                                self._record_formatter(temp)
                            )
                    else:
                        self.trimmed_alignment = None
        # build a dumb consensus
        if consensus and self.trimmed_alignment:
            self.trimmed_alignment_summary, self.trimmed_alignment_consensus = \
                self._alignment_summary(self.trimmed_alignment)
        if not self.trimmed_alignment:
            print "\tAlignment {0} dropped due to trimming".format(self.alignment._records[0].description.split('|')[1])

    def trim_ambiguous_bases(self):
        """snip ambiguous bases from a trimmed_alignment"""
        ambiguous_bases = []
        # do this by finding all ambiguous bases and then snipping the largest
        # chunk with no ambiguous bases from the entire alignment
        if not self.trimmed_alignment:
            self.perfect_trimmed_alignment = self.trimmed_alignment
        else:
            for column in xrange(0, self.trimmed_alignment.get_alignment_length()):
                if 'N' in self.trimmed_alignment[:,column]:
                    ambiguous_bases.append(column)
            maximum = 0
            maximum_pos = None
            #pdb.set_trace()
            if not ambiguous_bases:
                self.perfect_trimmed_alignment = self.trimmed_alignment
            if ambiguous_bases:
                # prepend and append the start and end of the sequence so consider
                # those chunks outside the stop and start of ambiguous base runs.
                ambiguous_bases.insert(0, 0)
                ambiguous_bases.append(self.trimmed_alignment.get_alignment_length() - 1)
                # create a new alignment object to hold our alignment
                self.perfect_trimmed_alignment = \
                    MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
                for pos in range(len(ambiguous_bases)):
                    if pos + 1 < len(ambiguous_bases):
                        difference = ambiguous_bases[pos + 1] - \
                            ambiguous_bases[pos]
                        if difference > maximum:
                            maximum = difference
                            maximum_pos = (pos, pos + 1)
                    else:
                        pass
                # make sure we catch cases where there is no best block
                if maximum_pos:
                    for sequence in self.trimmed_alignment:
                        self.perfect_trimmed_alignment.append(
                                sequence[ambiguous_bases[maximum_pos[0]] + 1:ambiguous_bases[maximum_pos[1]]]
                            )
                else:
                    self.perfect_trimmed_alignment = None
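
# --- A minimal standalone sketch of the "largest N-free block" idea above ---
# Illustration only, written against the modern Bio.Align API (no Gapped/IUPAC
# alphabets); the helper name largest_unambiguous_block is made up here.
from Bio.Align import MultipleSeqAlignment

def largest_unambiguous_block(aln):
    """Return the widest column slice of `aln` that contains no 'N', or None."""
    length = aln.get_alignment_length()
    # columns that contain at least one ambiguous base
    n_cols = [c for c in range(length) if 'N' in aln[:, c]]
    if not n_cols:
        return aln
    # treat the alignment edges as boundaries, then pick the widest gap between
    # consecutive ambiguous columns
    bounds = [-1] + n_cols + [length]
    start, end = max(zip(bounds, bounds[1:]), key=lambda p: p[1] - p[0])
    if end - start < 2:
        return None
    return aln[:, start + 1:end]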
Example #39
0
def rm_wrong_polymorphism_sites(seq_directory, outgroup_path, window_size=20, Max_p_sites=4):
    ### define iupac
    iupac_bases = ['m', 'r', 'w', 's', 'y', 'k', 'M', 'R', 'W', 'S', 'Y', 'K', "v", "h", "d", "b", "V", "H", "D", "B"]

    ### input files are from s6
    genes_result_s6 = seq_directory.replace("s1_Gene/", "s6_trimal/")

    ### mkdir output directory for s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")

    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)

    output_directory = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)

    ### iterate each gene
    for file in os.listdir(genes_result_s6):
        if file != ".DS_Store":
            output_directory_file = output_directory + file
            fasta_name = genes_result_s6 + file

            sequences = glob(fasta_name)
            ### read each alignment sequences
            for sequence in sequences:
                print("sequence: " +sequence)

                alignment = AlignIO.read(sequence, 'fasta')
                # print(alignment)

                ### generate a new alignment sequences without outgroups.
                align = MultipleSeqAlignment([])

                for record in alignment:
                    if record.id not in outgroups:
                        # print(record.id)
                        # print(record.seq)
                        temp_seq = SeqRecord(Seq(str(record.seq)), id=str(record.id))
                        # print(temp_seq)
                        align.extend([temp_seq])


                print(align)
                # print(align.get_alignment_length())


                total_wrong_poly_sites = []
                ### convert the alignment to a 2-D numpy array of characters
                align_array = np.array([list(rec) for rec in align])
                # print(align_array)

                ### calculate the whole length of the alignment
                total_length = align.get_alignment_length()



                ### using 20bp-long sliding windows.
                for each in window(range(total_length), window_size):
                    # print(list(each))
                    poly_site_no_iupac = 0
                    poly_site_number = 0

                    column_position = []

                    ### for each block calculate the polymorphism sites number.
                    for column in each:
                        ### calculate each site (each column).
                        counter = Counter(align_array[:, column])

                        ### sorted by frequency
                        sorted_bases = counter.most_common()

                        # print(counter)
                        # print(sorted_bases)
                        # print(len(counter))

                        ### count the sites with different situations.
                        gap_yes = 0

                        if len(counter) == 1:
                            poly_site_number = poly_site_number + 0
                            poly_site_no_iupac = poly_site_no_iupac + 0


                        elif len(counter) == 2:
                            # check whether any of the observed characters in this column is a gap
                            gap_yes = 1 if any(i[0] == "-" for i in sorted_bases) else 0
                            # print("gap is 1 or 0:" + str(gap_yes))

                            if gap_yes == 1:
                                # print counter
                                poly_site_number = poly_site_number + 0
                                poly_site_no_iupac = poly_site_no_iupac + 0

                            else:
                                iupac_in_alignment = [ele for ele in sorted_bases if (ele[0] in iupac_bases)]
                                # print(iupac_in_alignment)
                                if len(iupac_in_alignment) == 1:
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 0

                                if len(iupac_in_alignment) == 0:
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 1
                                    # print(column)
                                    column_position.append(column)

                        elif len(counter) == 3:
                            # check whether any of the observed characters in this column is a gap
                            gap_yes = 1 if any(i[0] == "-" for i in sorted_bases) else 0
                            # print("gap is 1 or 0:" + str(gap_yes))

                            if gap_yes == 1:
                                # print counter

                                iupac_in_alignment = [ele for ele in sorted_bases if (ele[0] in iupac_bases)]
                                # print(iupac_in_alignment)
                                if len(iupac_in_alignment) == 1:
                                    # poly_site_no_iupac = poly_site_no_iupac + 1
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 0

                                else:
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 1
                                    # print(column)
                                    column_position.append(column)

                            else:
                                poly_site_number = poly_site_number + 1
                                poly_site_no_iupac = poly_site_no_iupac + 1
                                # print(column)
                                column_position.append(column)


                        else:
                            poly_site_number = poly_site_number + 1
                            poly_site_no_iupac = poly_site_no_iupac + 1
                            # print(column)
                            column_position.append(column)


                    # print("column_position: " + str(column_position))
                    # print(len(column_position))

                    ### if the window holds more than Max_p_sites polymorphic sites, record their positions.
                    if len(column_position) > float(Max_p_sites):
                        print(column_position)
                        total_wrong_poly_sites = total_wrong_poly_sites + column_position

                #print(total_wrong_poly_sites)

                ### generate the unique positions; also flag the first and last 10 columns of the alignment

                total_wrong_poly_sites = total_wrong_poly_sites + list(range(10))
                total_wrong_poly_sites = total_wrong_poly_sites + list(range(total_length - 10, total_length))
                ### extracting just the polymorphic sites from the alignment might be useful for deleting the first 2 species
                unique_wrong_sites = list(np.unique(total_wrong_poly_sites))
                print(len(unique_wrong_sites))
                # sum2 = alignment[:, total_length:total_length + 1]
                # for i in unique_wrong_sites:
                #     sum2 = sum2 + alignment[:, i:i+1]
                # print(sum2)
                # SeqIO.write(sum2, "/Users/zhouwenbin/Downloads/result/M40_total.phy", "phylip")


                ### if any window held more than Max_p_sites polymorphic sites, use trimal to remove the flagged columns;
                ### otherwise, copy the gene to the new folder unchanged.
                if len(unique_wrong_sites) > 0:
                    ### format the column list as trimal's -selectcols argument, e.g. \{ 1,2,3 \}
                    ### (the backslashes keep the braces literal when the command passes through the shell)
                    cmd_selected_col = str(unique_wrong_sites).replace(" ", "").replace("[", "\\{ ").replace("]", " \\}")
                    print(cmd_selected_col)

                    cmd = "trimal -in " + fasta_name + " -out " + output_directory_file + " -selectcols " + cmd_selected_col
                    print(cmd)
                    os.system(cmd)

                else:
                    cmd_2 = "cp " + fasta_name + " " + output_directory_file
                    print(cmd_2)
                    os.system(cmd_2)
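
# --- Sketch of the sliding-window helper assumed above ---
# window() is used but not defined in this snippet; a common implementation is
# the classic itertools sliding-window recipe below, which would satisfy the
# call window(range(total_length), window_size) by yielding tuples of
# consecutive column indices. This is an assumption about the helper, not its
# actual code.
from itertools import islice

def window(seq, n=20):
    """Yield overlapping tuples of n consecutive items from seq."""
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for item in it:
        result = result[1:] + (item,)
        yield result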
Example #40
0
                % (repr(given_alpha), t_filename)
        except ValueError:
            # Good - should fail
            pass
        h.close()
    del good, bad, given_alpha, base_alpha

    if t_alignment:
        print("Testing reading %s format file %s as an alignment"
              % (t_format, t_filename))

        alignment = MultipleSeqAlignment(SeqIO.parse(
            handle=t_filename, format=t_format))
        assert len(alignment) == t_count

        alignment_len = alignment.get_alignment_length()

        # Check the record order agrees, and double check the
        # sequence lengths all agree too.
        for i in range(t_count):
            assert compare_record(records[i], alignment[i])
            assert len(records[i].seq) == alignment_len

        print(alignment_summary(alignment))

    # Some alignment file formats have magic characters which mean
    # use the letter in this position in the first sequence.
    # They should all have been converted by the parser, but if
    # not reversing the record order might expose an error.  Maybe.
    records.reverse()
    check_simple_write_read(records)
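
# --- A minimal sketch of the same round trip using AlignIO directly ---
# The file name and format below are placeholders, not values from the test
# above; the point is only that MultipleSeqAlignment(SeqIO.parse(...)) and
# AlignIO.read(...) should agree on an alignment file.
from Bio import AlignIO, SeqIO
from Bio.Align import MultipleSeqAlignment

filename, fmt = "example.aln", "clustal"   # placeholders
via_seqio = MultipleSeqAlignment(SeqIO.parse(filename, fmt))
via_alignio = AlignIO.read(filename, fmt)
assert len(via_seqio) == len(via_alignio)
assert via_seqio.get_alignment_length() == via_alignio.get_alignment_length()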
Example #41
0
def complete_from_consensus(true_seq_aln,
                            cod_align,
                            edited_pos,
                            gcode={},  # maps sequence name -> NCBI genetic code id (defaults to 1)
                            only_ed_G=False):
    true_seq_aln = MultipleSeqAlignment(list(true_seq_aln))
    summary_align = AlignInfo.SummaryInfo(true_seq_aln)
    consensus = summary_align.dumb_consensus(threshold=0.5)
    for pos in range(true_seq_aln.get_alignment_length()):
        cons_aa = consensus[pos]
        # offset (0-2) within this codon of a known editing site, or None
        known_ed_col = at_least_one_ed(edited_pos, pos)
        if known_ed_col is not None:
            for seqrec in true_seq_aln:
                ctable = CodonTable.unambiguous_dna_by_id[gcode.get(
                    seqrec.name, 1)]
                seq_aa = seqrec[pos]
                ed_allowed = (only_ed_G and len(edited_pos.get(
                    seqrec.name, [])) > 0) or not only_ed_G
                nuc_pos = len(str(seqrec[:pos + 1].seq).replace("-", ""))
                in_pos = [
                    x[-1] for x in edited_pos[seqrec.name] if x[-1] // 3 == pos
                ]
                if in_pos:
                    wg_pos = in_pos[0]
                    # Here we attempt to slightly correct a wrong position
                    wg_cod = list(str(cod_align[seqrec.name][pos * 3:pos * 3 + 3].seq))
                    if not wg_cod[wg_pos % 3] == 'C':
                        edited_pos[seqrec.name] = [
                            x for x in edited_pos[seqrec.name] if x != wg_pos
                        ]
                    else:
                        wg_cod[wg_pos % 3] = 'T'
                        wg_aa = ctable.forward_table.get("".join(wg_cod), 'X')
                        if wg_aa != 'X' and wg_aa != cons_aa:
                            edited_pos[seqrec.name] = [
                                x for x in edited_pos[seqrec.name]
                                if x != wg_pos
                            ]

                    #print(seqrec.description, pos, pos*3, ed_allowed, known_ed_col, cons_aa, seq_aa, str(cod_align[seqrec.name][pos*3: pos*3+3].seq))

                if ed_allowed and not in_pos:
                    codon = list(str(cod_align[seqrec.name][pos * 3:pos * 3 + 3].seq))
                    cmut = list(codon)
                    cmut[known_ed_col] = 'T'
                    if codon[known_ed_col] == 'C' and ctable.forward_table.get(
                            "".join(cmut), 'X') == cons_aa and cons_aa != 'X':
                        edited_pos[seqrec.name].append(
                            (nuc_pos * 3 + known_ed_col,
                             pos * 3 + known_ed_col))
                    else:
                        editerator = editing_yielder(codon)
                        while True:
                            try:
                                codmut = next(editerator)
                                if codmut and ctable.forward_table.get(
                                        codmut, 'X') == cons_aa:
                                    edited_pos[seqrec.name].extend([
                                        (npos + nuc_pos * 3, pos * 3 + npos)
                                        for npos, mnuc in enumerate(codmut)
                                        if codon[npos] != mnuc
                                    ])
                                    break
                            except StopIteration:
                                break

    for k, v in edited_pos.items():
        edited_pos[k] = list(sorted(set(v)))
    return edited_pos
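
# --- A minimal sketch of the two Biopython pieces the function above relies on ---
# A dumb consensus from AlignInfo and codon translation via CodonTable.
# The sequences and genetic-code id below are illustrative only.
from Bio.Align import MultipleSeqAlignment, AlignInfo
from Bio.Data import CodonTable
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

aln = MultipleSeqAlignment([
    SeqRecord(Seq("MKV"), id="a"),
    SeqRecord(Seq("MKV"), id="b"),
    SeqRecord(Seq("MRV"), id="c"),
])
consensus = AlignInfo.SummaryInfo(aln).dumb_consensus(threshold=0.5)
print(consensus)                                 # MKV; columns without a clear majority become 'X'

table = CodonTable.unambiguous_dna_by_id[1]      # standard genetic code
print(table.forward_table.get("ATG", "X"))       # 'M'
print(table.forward_table.get("TAA", "X"))       # stop codons are absent from forward_table -> 'X'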