Exemplo n.º 1
0
    def test_compute_score_and_traceback_matrices(self):
        """Compare computed score/traceback matrices with manual results.

        Three scenarios are exercised with gap open penalty 5, gap extend
        penalty 2 and an identity substitution matrix (match 2,
        mismatch -1): 'ACG' vs 'ACGT', 'ACC' vs 'ACGT', and the latter
        again with two sequences per alignment.
        """
        # All expected matrices below were computed manually.
        score_acg = [[0, -5, -7, -9],
                     [-5, 2, -3, -5],
                     [-7, -3, 4, -1],
                     [-9, -5, -1, 6],
                     [-11, -7, -3, 1]]
        tback_acg = [[0, 3, 3, 3],
                     [2, 1, 3, 3],
                     [2, 2, 1, 3],
                     [2, 2, 2, 1],
                     [2, 2, 2, 2]]
        score_acc = [[0, -5, -7, -9],
                     [-5, 2, -3, -5],
                     [-7, -3, 4, -1],
                     [-9, -5, -1, 3],
                     [-11, -7, -3, -2]]
        tback_acc = [[0, 3, 3, 3],
                     [2, 1, 3, 3],
                     [2, 2, 1, 3],
                     [2, 2, 2, 1],
                     [2, 2, 2, 1]]

        # Each scenario: (sequence, id) pairs for the first alignment,
        # (sequence, id) pairs for the second, expected score matrix,
        # expected traceback matrix.
        scenarios = [
            # single matching-prefix sequences
            ([('ACG', 'id')], [('ACGT', 'id')], score_acg, tback_acg),
            # different sequences
            ([('ACC', 'id')], [('ACGT', 'id')], score_acc, tback_acc),
            # four sequences provided in two alignments
            ([('ACC', 's1'), ('ACC', 's2')],
             [('ACGT', 's3'), ('ACGT', 's4')], score_acc, tback_acc),
        ]

        for first, second, want_score, want_tback in scenarios:
            subs = make_identity_substitution_matrix(2, -1)
            aln1 = TabularMSA([DNA(seq, metadata={'id': seq_id})
                               for seq, seq_id in first])
            aln2 = TabularMSA([DNA(seq, metadata={'id': seq_id})
                               for seq, seq_id in second])
            got_score, got_tback = _compute_score_and_traceback_matrices(
                aln1, aln2, 5, 2, subs)
            np.testing.assert_array_equal(got_score, want_score)
            np.testing.assert_array_equal(got_tback, want_tback)
Exemplo n.º 2
0
    def test_make_identity_substitution_matrix(self):
        """Default-alphabet identity matrices carry the requested scores.

        For each (match, mismatch) pair the expected result is a dict of
        dicts over 'ACGTU' holding ``match`` on the diagonal and
        ``mismatch`` everywhere else.
        """
        letters = 'ACGTU'
        for match, mismatch in ((1, -2), (5, -4)):
            expected = {}
            for row in letters:
                expected[row] = {col: (match if col == row else mismatch)
                                 for col in letters}
            observed = make_identity_substitution_matrix(match, mismatch)
            self.assertEqual(observed, expected)
Exemplo n.º 3
0
    def test_nucleotide_aligners_use_substitution_matrices(self):
        """A custom substitution matrix must change the alignment result.

        Both the local and the global nucleotide aligner are run twice on
        the same pair of sequences: once with the default scoring built
        from match_score/mismatch_score and once with an explicit
        identity matrix (match 10, mismatch -10). The aligned sequences
        and the score are expected to differ in each case.
        """
        alt_sub = make_identity_substitution_matrix(10, -10)
        # alternate substitution matrix yields different alignment (the
        # aligned sequences and the scores are different) with local alignment
        actual_no_sub = local_pairwise_align_nucleotide(
            "GACCTTGACCAGGTACC", "GAACTTTGACGTAAC", gap_open_penalty=10.,
            gap_extend_penalty=5., match_score=5, mismatch_score=-4)
        actual_alt_sub = local_pairwise_align_nucleotide(
            "GACCTTGACCAGGTACC", "GAACTTTGACGTAAC", gap_open_penalty=10.,
            gap_extend_penalty=5., match_score=5, mismatch_score=-4,
            substitution_matrix=alt_sub)
        self.assertNotEqual(str(actual_no_sub[0]), str(actual_alt_sub[0]))
        self.assertNotEqual(str(actual_no_sub[1]), str(actual_alt_sub[1]))
        self.assertNotEqual(actual_no_sub.score(),
                            actual_alt_sub.score())

        # alternate substitution matrix yields different alignment (the
        # aligned sequences and the scores are different) with global alignment
        # The baseline must come from the *global* aligner as well;
        # comparing a local baseline against a global result would differ
        # regardless of the substitution matrix and prove nothing.
        actual_no_sub = global_pairwise_align_nucleotide(
            "GACCTTGACCAGGTACC", "GAACTTTGACGTAAC", gap_open_penalty=10.,
            gap_extend_penalty=5., match_score=5, mismatch_score=-4)
        actual_alt_sub = global_pairwise_align_nucleotide(
            "GACCTTGACCAGGTACC", "GAACTTTGACGTAAC", gap_open_penalty=10.,
            gap_extend_penalty=5., match_score=5, mismatch_score=-4,
            substitution_matrix=alt_sub)
        self.assertNotEqual(str(actual_no_sub[0]), str(actual_alt_sub[0]))
        self.assertNotEqual(str(actual_no_sub[1]), str(actual_alt_sub[1]))
        self.assertNotEqual(actual_no_sub.score(),
                            actual_alt_sub.score())
Exemplo n.º 4
0
 def test_compute_score_and_traceback_matrices_invalid(self):
     """A character missing from the substitution matrix raises ValueError."""
     # 'W' is not a key of the identity matrix built below, so the
     # computation is expected to fail with an informative error.
     subs = make_identity_substitution_matrix(2, -1)
     # Build the inputs outside the assertion so that only the matrix
     # computation itself is checked for the ValueError.
     aln_unknown_char = Alignment([DNA('AWG')])
     aln_plain = Alignment([DNA('ACGT')])
     with self.assertRaises(ValueError):
         _compute_score_and_traceback_matrices(
             aln_unknown_char, aln_plain, 5, 2, subs)
Exemplo n.º 5
0
def getStartPosMapper(seq, subst=None):
    """Build a function that aligns peptides against a fixed sequence.

    The returned callable is suitable as the mapping function for a
    peptide column of a DataFrame, aligning each entry to the reference.

    Parameters
    ----------
    seq : str
        Reference AA sequence; used as the query of the aligner.
    subst : dict of dicts, optional
        Scores for each pair of AAs in peptide and sequence. When
        omitted, an identity matrix over AALPHABET is used (match 1,
        mismatch -1).

    Returns
    -------
    findPos : function
        Takes one peptide sequence and returns, as an int, the
        alignment's 'query_begin' minus its 'target_begin'."""
    scoring = subst
    if scoring is None:
        scoring = make_identity_substitution_matrix(1, -1, alphabet=AALPHABET)

    # One aligner is built up front and shared by every call of the closure.
    aligner = StripedSmithWaterman(query_sequence=seq,
                                   protein=True,
                                   substitution_matrix=scoring)

    def findPos(pep):
        hit = aligner(pep)
        offset = hit['query_begin'] - hit['target_begin']
        return int(offset)

    return findPos
Exemplo n.º 6
0
    def test_compute_substitution_score(self):
        """Check substitution scores of column pairs against manual values."""
        # Each case: (first column, second column, fourth positional
        # argument, expected score). The fourth argument is presumably the
        # score used for gap characters -- confirm against the helper.
        # These results were computed manually.
        default_cases = [
            (['A'], ['A'], 0, 5.0),
            (['A', 'A'], ['A'], 0, 5.0),
            (['A', 'C'], ['A'], 0, 0.5),
            (['A', 'C'], ['A', 'C'], 0, 0.5),
            (['A', 'A'], ['A', '-'], 0, 2.5),
            (['A', 'A'], ['A', '-'], 1, 3),
        ]
        subs_m = make_identity_substitution_matrix(5, -4)
        for col1, col2, gap_score, expected in default_cases:
            observed = _compute_substitution_score(
                col1, col2, subs_m, gap_score)
            self.assertEqual(observed, expected)

        # Same check with an alternative substitution matrix.
        subs_m = make_identity_substitution_matrix(1, -2)
        observed = _compute_substitution_score(
            ['A', 'A'], ['A', '-'], subs_m, 0)
        self.assertEqual(observed, 0.5)
Exemplo n.º 7
0
    def test_global_pairwise_align_custom_alphabet_nondegenerate_chars(self):
        """Globally align CustomSequence objects using nondegenerate_chars."""
        subs = make_identity_substitution_matrix(
            1, -1, alphabet=CustomSequence.nondegenerate_chars)
        seq1 = CustomSequence("WXYZ")
        seq2 = CustomSequence("WXYYZZ")

        result = global_pairwise_align(seq1, seq2, 10.0, 5.0, subs)
        observed_msa, observed_score, observed_start_end = result

        # The expected values come from running an equivalent alignment
        # with the DNA alphabet under this mapping:
        #
        #     W X Y Z
        #     | | | |
        #     A C G T
        #
        expected_msa = TabularMSA([CustomSequence('WXYZ^^'),
                                   CustomSequence('WXYYZZ')])
        self.assertEqual(observed_msa, expected_msa)
        self.assertEqual(observed_score, 2.0)
        self.assertEqual(observed_start_end, [(0, 3), (0, 5)])
Exemplo n.º 8
0
    def test_local_pairwise_align_custom_alphabet(self):
        """Locally align CustomSequence objects using definite_chars."""
        subs = make_identity_substitution_matrix(
            5, -4, alphabet=CustomSequence.definite_chars)
        seq1 = CustomSequence("YWXXZZYWXXWYYZWXX")
        seq2 = CustomSequence("YWWXZZZYWXYZWWX")

        result = local_pairwise_align(seq1, seq2, 5.0, 0.5, subs)
        observed_msa, observed_score, observed_start_end = result

        # The expected values come from running an equivalent alignment
        # with the DNA alphabet under this mapping:
        #
        #     W X Y Z
        #     | | | |
        #     A C G T
        #
        expected_msa = TabularMSA([CustomSequence('WXXZZYWXXWYYZWXX'),
                                   CustomSequence('WXZZZYWX^^^YZWWX')])
        self.assertEqual(observed_msa, expected_msa)
        self.assertEqual(observed_score, 41.0)
        self.assertEqual(observed_start_end, [(1, 16), (2, 14)])
Exemplo n.º 9
0
    def test_nucleotide_aligners_use_substitution_matrices(self):
        """An explicit substitution matrix changes the aligners' output.

        Each nucleotide aligner runs twice on the same DNA pair: once
        with scoring derived from match_score/mismatch_score and once
        with an identity matrix (match 10, mismatch -10).
        """
        alt_sub = make_identity_substitution_matrix(10, -10)

        def align(aligner, **extra):
            # Fresh DNA objects and identical scoring keywords per call.
            options = dict(gap_open_penalty=10., gap_extend_penalty=5.,
                           match_score=5, mismatch_score=-4)
            options.update(extra)
            return aligner(DNA("GACCTTGACCAGGTACC"),
                           DNA("GAACTTTGACGTAAC"), **options)

        # Local alignment: the alternate matrix yields a different
        # alignment, score and start/end positions.
        plain = align(local_pairwise_align_nucleotide)
        custom = align(local_pairwise_align_nucleotide,
                       substitution_matrix=alt_sub)
        msa_no_sub, score_no_sub, start_end_no_sub = plain
        msa_alt_sub, score_alt_sub, start_end_alt_sub = custom

        self.assertNotEqual(msa_no_sub, msa_alt_sub)
        self.assertNotEqual(score_no_sub, score_alt_sub)
        self.assertNotEqual(start_end_no_sub, start_end_alt_sub)

        # Global alignment: alignment and score differ, while the
        # start/end positions are expected to be equal.
        plain = align(global_pairwise_align_nucleotide)
        custom = align(global_pairwise_align_nucleotide,
                       substitution_matrix=alt_sub)
        msa_no_sub, score_no_sub, start_end_no_sub = plain
        msa_alt_sub, score_alt_sub, start_end_alt_sub = custom

        self.assertNotEqual(msa_no_sub, msa_alt_sub)
        self.assertNotEqual(score_no_sub, score_alt_sub)
        self.assertEqual(start_end_no_sub, start_end_alt_sub)
Exemplo n.º 10
0
    def test_make_identity_substitution_matrix(self):
        """Identity matrices over the default alphabet match literal grids."""
        symbols = ('A', 'C', 'G', 'T', 'U')

        def as_nested_dict(grid):
            # Turn a 5x5 score grid (rows and columns ordered as
            # ``symbols``) into the dict-of-dicts form being compared.
            return {row_symbol: dict(zip(symbols, row_scores))
                    for row_symbol, row_scores in zip(symbols, grid)}

        expected = as_nested_dict([
            [1, -2, -2, -2, -2],
            [-2, 1, -2, -2, -2],
            [-2, -2, 1, -2, -2],
            [-2, -2, -2, 1, -2],
            [-2, -2, -2, -2, 1],
        ])
        self.assertEqual(make_identity_substitution_matrix(1, -2), expected)

        expected = as_nested_dict([
            [5, -4, -4, -4, -4],
            [-4, 5, -4, -4, -4],
            [-4, -4, 5, -4, -4],
            [-4, -4, -4, 5, -4],
            [-4, -4, -4, -4, 5],
        ])
        self.assertEqual(make_identity_substitution_matrix(5, -4), expected)
Exemplo n.º 11
0
import pandas as pd
import argparse
import re
import skbio
from copy import deepcopy
import skbio
from skbio.alignment import local_pairwise_align_ssw, make_identity_substitution_matrix
from skbio.sequence import Protein
# Module-level identity substitution matrix over the skbio Protein
# alphabet: a match scores 1 and a mismatch scores 0.
ident = make_identity_substitution_matrix(match_score=1, mismatch_score=0, alphabet=skbio.sequence.Protein.alphabet)

def assembleOverlappingPeptides(pepArr,overlap=11):
    """Greedily merge overlapping peptides into a single sequence.

    This is a work in progress; the idea is to rebuild a sequence from a
    set of overlapping peptides (e.g. 15mers). Pairs of fragments are
    globally aligned and merged whenever the alignment score reaches
    ``overlap - 8``; merging repeats until one fragment is left or no
    remaining pair can be merged.

    Parameters
    ----------
    pepArr : iterable of str
        Peptide sequences to assemble. The input is not modified.
    overlap : int
        Expected overlap between neighbouring peptides; a pair is merged
        when its alignment score is at least ``overlap - 8``.

    Returns
    -------
    str
        The first fragment left after merging. If some fragments could
        not be merged, this is only a partial assembly.

    Raises
    ------
    IndexError
        If pepArr is empty.

    Notes
    -----
    Relies on a module-level ``pairwise2`` (presumably Biopython's
    ``Bio.pairwise2`` -- confirm it is imported by the enclosing module)."""
    # Imported locally because the module's import block does not provide it.
    import itertools

    assembled = list(pepArr)
    while len(assembled) > 1:
        for pepi1, pepi2 in itertools.combinations(range(len(assembled)), 2):
            pep1, pep2 = assembled[pepi1], assembled[pepi2]
            # First alignment only; res[0]/res[1] are the aligned strings
            # and res[2] is read as the alignment score.
            res = pairwise2.align.globalxs(pep2, pep1, -4, 0)[0]
            if res[2] >= overlap - 8:
                # pepi1 < pepi2, so removing pepi2 leaves pepi1 valid.
                assembled.pop(pepi2)
                # Fill each gap in one aligned string from the other one.
                assembled[pepi1] = ''.join(
                    aa2 if aa1 == '-' else aa1
                    for aa1, aa2 in zip(res[0], res[1]))
                # Indices are stale after the merge: restart the pair scan.
                break
        else:
            # No pair reached the threshold. Without this exit the outer
            # loop would rescan the unchanged list forever.
            break
    return assembled[0]
Exemplo n.º 12
0
        string = str(t) + str(pc)
        pc += 1
        hash_val = hashlib.sha256(string.encode())
        digest = bytearray(hash_val.digest())
        #   k - 1 dummy ops
        tmp = []
        for i in range(0, k - 1):
            tmp.append((digest[i] % 4) + 1)
        idx = digest[20] % k
        # tmp.insert(idx, t[0])
        tmp.insert(idx, t)
        x.extend(tmp)
    return x


# Build an identity substitution matrix over the digit characters
# '0'-'9' (match score 2, mismatch score 1) and hand it to calibrate().
matrix = make_identity_substitution_matrix(2, 1, '0123456789')
calibrate(matrix)

# Register every element of x in ops (x and ops are defined outside this
# section -- presumably x is the operation list built above; confirm).
for i in x:
    # ops.add(i[0])
    ops.add(i)

# Print the ops_dic entry of each element of x on one line, each followed
# by ', ', then finish the line.
print("Orig:")
for op in x:
    print(ops_dic[op], end=', ')
print("")


def list_to_str(a):
    """Concatenate the ``str()`` form of every item of ``a``, no separator."""
    pieces = [str(item) for item in a]
    return ''.join(pieces)