def test_run_blossum(self):
    """Check BLOSUM62 scores (gap penalty 6, similarity mode) for every sequence pair.

    Each result returned by ``pairwise_alignments`` is looked up by its pair
    of sequence strings and its score is compared with the expected value.
    """
    seq_a = 'ILDMDVVEGSAARFDCKVEGYPDPEVMWFKDDNPVKESRHFQIDYDEEGN'
    seq_b = 'RDPVKTHEGWGVMLPCNPPAHYPGLSYRWLLNEFPNFIPTDGRHFVSQTT'
    seq_c = 'ISDTEADIGSNLRWGCAAAGKPRPMVRWLRNGEPLASQNRVEVLA'
    seq_d = 'RRLIPAARGGEISILCQPRAAPKATILWSKGTEILGNSTRVTVTSD'
    # Expected score keyed by (first sequence, second sequence).
    expected_scores = {
        (seq_a, seq_b): 4,
        (seq_a, seq_c): 37,
        (seq_a, seq_d): -4,
        (seq_b, seq_c): 3,
        (seq_b, seq_d): 9,
        (seq_c, seq_d): 24,
    }

    fasta_path = '../data/guideline_tests/needlemanwunsch.fa'
    records = parse_fasta_files([fasta_path])
    # Configure the aligner.
    aligner = NeedlemanWunsch(
        ScoringSettings(substitution_matrix=MatrixInfo.blosum62,
                        gap_penalty=6,
                        similarity=True),
        complete_traceback=False,
        verbose=False)
    outcomes = aligner.pairwise_alignments(records)
    for outcome in outcomes:
        pair_key = (str(outcome.seq1.seq), str(outcome.seq2.seq))
        self.assertEqual(outcome.score, expected_scores[pair_key])
Exemplo n.º 2
0
 def run(self, sequences):
     """
     Feng-Doolittle entry point: pairwise alignments -> distances -> guide tree -> MSA.
     :param sequences: a list of SeqRecords
     :return: MultiAlignment object returned by compute_msa
     """
     # 1. Pairwise alignments, highest score first.
     aligner = NeedlemanWunsch(settings=ScoringSettings(), verbose=self.verbose)
     pairwise = aligner.pairwise_alignments(sequences)
     pairwise = sorted(pairwise, key=lambda item: item.score, reverse=True)
     LOGGER.info("Needleman Wunsch Alignments:\n%s" % "\n".join([str(item) for item in pairwise]))
     # Overwrite each score in place with its distance-like counterpart.
     converting_methods = (SimilarityScoringMethod.SCORE2DISTANCE,
                           SimilarityScoringMethod.SCORE2DISTANCE_EXTENDED)
     for item in pairwise:
         if self.similarity_scoring_method in converting_methods:
             item.score = self.convert_to_evolutionary_distances(item, self.similarity_scoring_method,
                                                                 self.nw_settings)
         elif self.similarity_scoring_method == SimilarityScoringMethod.PURE_ALIGNMENT:
             # Plain negation of the alignment score.
             item.score *= -1
         else:
             raise NotImplementedError(
                     f'similarity_scoring_method {self.similarity_scoring_method} not supported/implemented.')
     # 2. Guide tree from the converted alignments.
     guide_tree = Xpgma(clustering_method=self.clustering_method).run(pairwise)
     # 3. Multiple sequence alignment computed from the tree.
     msa = self.compute_msa(guide_tree)
     res_str = f'Tree: {guide_tree}\n' + "\n".join([entry.seq for entry in msa.sequences])
     LOGGER.info(f'Tree: {guide_tree}')
     LOGGER.info("GENERATED MSA:\nSCORE:%f\nMSA:\n\n%s" % (msa.score, res_str))
     return msa
Exemplo n.º 3
0
 def compute_best_alignment_many_to_many(self, alignment1: MultiAlignment, alignment2: MultiAlignment):
     """
     Align every sequence of alignment1 against every sequence of alignment2 and keep the best pair.
     :param alignment1: MultiAlignment object
     :param alignment2: MultiAlignment object
     :return: best_alignment (as a two-element list), index in alignment1, index in alignment2,
              best_score, overall_score (sum of all pairwise scores)
     """
     left_sequences = alignment1.sequences
     right_sequences = alignment2.sequences
     aligner = NeedlemanWunsch(settings=self.nw_settings)

     top_alignment = None
     top_score = None
     top_left = None
     top_right = None
     score_sum = 0
     for left_pos, left in enumerate(left_sequences):
         for right_pos, right in enumerate(right_sequences):
             outcome = aligner.run(left, right)
             # Strictly greater: on ties the earliest pair wins.
             is_better = top_score is None or outcome.score > top_score
             if is_better:
                 top_score = outcome.score
                 top_alignment = outcome.alignments[0]
                 top_left, top_right = left_pos, right_pos
             # Every pair contributes to the overall score.
             score_sum += outcome.score
     aligned_pair = [top_alignment.sequence1, top_alignment.sequence2]
     return aligned_pair, top_left, top_right, top_score, score_sum
 def test_init(self):
     """After init_scoring_matrix only the first row and column hold multiples of -6."""
     aligner = NeedlemanWunsch()
     aligner.init_scoring_matrix("AAAC", "AAAC")
     # Border values 0, -6, -12, -18, -24; every inner cell is 0.
     border = np.arange(0, -30, -6, dtype=float)
     expected = np.zeros((5, 5))
     expected[0, :] = border
     expected[:, 0] = border
     assert np.array_equal(aligner.scoring_matrix, expected)
 def test_calculate_guide_tree(self):
     """The guide tree built from xpgma1.fa must match the expected string forms."""
     aligner = NeedlemanWunsch()
     records = utils.parse_fasta_files(["../data/xpgma/xpgma1.fa"])
     pairwise = aligner.pairwise_alignments(records)
     clusterer = Xpgma()
     clusterer.create_distance_matrix(pairwise)
     tree = clusterer.calculate_guide_tree()
     # Compare the string renderings of the tree and of its node mapping.
     self.assertEqual(str(tree), '((A:2.00,B:2.00):0.00,C:2.00)')
     self.assertEqual(
         str(tree.nodes),
         "{'A': A:2.00, 'B': B:2.00, 'C': C:2.00, 'AB': (A:2.00,B:2.00):0.00, 'ABC': ABC:NONE}")
    def __init_2d_needleman_tables(self):
        """
        Fill the three pairwise tables (xy, xz, yz) stored on AlignmentOutputData.

        Each table comes from a fresh NeedlemanWunsch instance's ``get_new_table``,
        called with the cost function, the gap cost and one pair of the three sequences.
        """
        data = self._data

        # sequence_c against sequence_b
        AlignmentOutputData.table_values_xy = NeedlemanWunsch().get_new_table(
            data.cost_function, data.gap_cost, data.sequence_c, data.sequence_b)

        # sequence_c against sequence_a
        AlignmentOutputData.table_values_xz = NeedlemanWunsch().get_new_table(
            data.cost_function, data.gap_cost, data.sequence_c, data.sequence_a)

        # sequence_b against sequence_a
        AlignmentOutputData.table_values_yz = NeedlemanWunsch().get_new_table(
            data.cost_function, data.gap_cost, data.sequence_b, data.sequence_a)
Exemplo n.º 7
0
def run_xpgma():
    """Run pairwise alignments on the parsed input and build the XPGMA guide tree.

    Reads ``input``, ``file_filter``, ``verbose`` and ``mode`` from the
    module-level ``args`` object.
    """
    records = parse_input(args.input, args.file_filter)
    # Pairwise alignments of all input sequences.
    pairwise = NeedlemanWunsch(verbose=args.verbose).pairwise_alignments(records)
    rendered = "\n".join([str(item) for item in pairwise])
    LOGGER.info("Needleman Wunsch Alignments:\n%s" % rendered)
    # Cluster: distance matrix first, then the guide tree.
    clusterer = Xpgma(clustering_method=args.mode)
    clusterer.create_distance_matrix(pairwise)
    clusterer.calculate_guide_tree()
 def test_convert_to_evolutionary_distances(self):
     """The first pairwise alignment of conversion.fa converts to the expected distance."""
     aligner = NeedlemanWunsch()
     records = utils.parse_fasta_files(["../data/feng_test/conversion.fa"])
     pairwise = aligner.pairwise_alignments(records)
     converter = FengDoolittle()
     # Only the first alignment is converted and checked.
     alignment = pairwise[0]
     print(f'Alignment: {alignment} ')
     alignment.score = converter.convert_to_evolutionary_distances(alignment)
     print(f'Score: {alignment.score} ')
     self.assertAlmostEqual(2.70805020110221, alignment.score)
def test_guideline_blosum():
    """Check the guideline (04.02.2019) cases pair by pair.
    """
    aligner = NeedlemanWunsch()

    result, info = aligner.run("data/xpgma_guideline.fasta",
                               "data/xpgma_guideline.fasta",
                               "data/blosum62.txt", False, 6, True)

    # The result is indexed as an upper-triangle n x n matrix. Each row below is
    # (row, column, expected score, expected number of alignments, expected first alignment).
    expectations = [
        (0, 1, 4, 8,
         ('ILDMDVVEGSAARFDCKVEG_YPDPEVMWFKDDNP__V_KESRHFQIDYDEEGN',
          'RDPVKTHEGWGVMLPCNPPAHYPGLSYRWLLNEFPNFIPTDGRHF_V__SQT_T')),
        (0, 2, 37, 4,
         ('ILDMDVVEGSAARFDCKVEGYPDPEVMWFKDDNPVKESRHFQIDYDEEGN',
          'ISDTEADIGSNLRWGCAAAGKPRPMVRWLRNGEPL_ASQN_RVEV__LA_')),
        (0, 3, -4, 1,
         ('ILDMDVVEGSAARFDCKVEGYPDPEVMWFKDDNPVKESRHFQIDYDEEGN',
          'RRLIPAARGGEISILCQPRAAPKATILWSKGTEILGNSTRVTVTSD____')),
        (1, 2, 3, 1,
         ('RDPVKTHEGWGVMLPCNPPAHYPGLSYRWLLNEFPNFIPTDGRHFVSQTT',
          'ISDTEADIGSNLRWGC_AAAGKPRPMVRWLRNGEP__LASQNR__VEVLA')),
        (1, 3, 9, 2,
         ('RDPVKTHEGWGVMLPCNPPAHYPGLSYRWLLNEFPNFIPTDGRHFVSQTT',
          'RRLIPAARGGEISILCQPRA_APKATILW__SKGTEILGNSTRVTVT_SD')),
        (2, 3, 24, 1,
         ('ISDTEADIGSNLRWGCAAAGKPRPMVRWLRNGEPLASQNRVEVLA_',
          'RRLIPAARGGEISILCQPRAAPKATILWSKGTEILGNSTRVTVTSD')),
    ]

    for row, col, score, n_alignments, first_alignment in expectations:
        cell = result[row][col]
        assert cell[3] == score
        assert len(cell[2]) == n_alignments
        assert cell[2][0] == first_alignment
Exemplo n.º 10
0
def test_example_invalid_characters_fail():
    """Negative test: the run must abort with exit code 12.

    The reason for the failure is non-amino-acid characters in the
    ``Invalid_characters.fasta`` input.
    """

    nw = NeedlemanWunsch()
    seq_fasta_1 = os.path.join('data', 'sequences', 'seq1.fasta')
    seq_fasta_2 = os.path.join('data', 'sequences', 'Invalid_characters.fasta')
    # Only the raising call belongs inside the context manager: any statement
    # placed after it in the with-body is skipped once the exception fires.
    with pytest.raises(SystemExit) as exc_info:
        nw.run(seq_fasta_2, seq_fasta_1, 'pam250', -8, False)

    # The exit status lives on the exception instance (exc_info.value),
    # not on the ExceptionInfo wrapper.
    assert exc_info.type is SystemExit
    assert exc_info.value.code == 12
Exemplo n.º 11
0
def test_example_invalid_format_fail():
    """Negative test: the run must abort with exit code 1.

    The reason for the failure is an invalid file format: the first line of
    ``Invalid_format.fasta`` does not start with ``>``.
    """

    nw = NeedlemanWunsch()
    seq_fasta_1 = os.path.join('data', 'sequences', 'seq1.fasta')
    seq_fasta_2 = os.path.join('data', 'sequences', 'Invalid_format.fasta')
    # Only the raising call belongs inside the context manager: any statement
    # placed after it in the with-body is skipped once the exception fires.
    with pytest.raises(SystemExit) as exc_info:
        nw.run(seq_fasta_1, seq_fasta_2, 'pam250', -8, False)

    # The exit status lives on the exception instance (exc_info.value),
    # not on the ExceptionInfo wrapper.
    assert exc_info.type is SystemExit
    assert exc_info.value.code == 1
def test_example():
    """Check the six values returned by the dummy implementation's run()."""

    aligner = NeedlemanWunsch()
    id_seq1, seq1, id_seq2, seq2, score, alignments = aligner.run(
        "data/sequence1.fa", "data/sequence2.fa", "data/blosum62.txt", 5, False)

    # Identifiers
    assert id_seq1 == "idA"
    assert id_seq2 == "idB"
    # Raw sequences
    assert seq1 == "FancySequenceA"
    assert seq2 == "FancysequenceB"
    # Score and first alignment
    assert score == 1000
    expected_first = ("Fancy_SequenceA_", "Fancys_equence_B")
    assert alignments[0] == expected_first
def test_example_distance():
    """Check ids, sequences, score and alignment using the distance scoring file."""

    aligner = NeedlemanWunsch()
    result, info = aligner.run("data/sequence1.fasta", "data/sequence2.fasta",
                               "data/test_scoring_distance.txt", True, 1, True)

    # Entry layout as used below: [0]/[1] records, [2] alignments, [3] score.
    entry = result[0][0]
    first_record, second_record = entry[0], entry[1]
    assert first_record.id == "idA"
    assert second_record.id == "idB"
    assert str(first_record.seq) == "TCCGA"
    assert str(second_record.seq) == "TACGCAGA"
    assert entry[3] == -2
    assert len(entry[2]) == 1
    assert entry[2][0] == ("T_C_C_GA", "TACGCAGA")
Exemplo n.º 14
0
 def pairwiseAlignment(self, s1, s2, subsMat, gapOpenCost):
     """Align s1 and s2 with Needleman-Wunsch and return their distance.

     One of the optimal alignments is picked at random and passed, together
     with the optimal score, to ``similarityToDistance``.
     """
     aligner = NeedlemanWunsch()
     traceback, optimalScore = aligner.buildMatrices(s1, s2, subsMat,
                                                     gapOpenCost)
     candidates = aligner.getAlignmentsFromTracebacks(s1, s2, traceback)
     # Uniform random pick among the optimal alignments;
     # per the original comment, each candidate is a list of three strings.
     picked_index = random.randint(0, len(candidates) - 1)
     picked = candidates[picked_index]
     return self.similarityToDistance(optimalScore, s1, s2, aligner,
                                      picked, subsMat, gapOpenCost)
def test_example_similarity():
    """Check ids, sequences, score and alignments using the similarity scoring file.
    """

    aligner = NeedlemanWunsch()

    result, info = aligner.run("data/sequence1.fasta", "data/sequence2.fasta",
                               "data/test_scoring_similarity.txt", True, 1, True)

    # Entry layout as used below: [0]/[1] records, [2] alignments, [3] score.
    entry = result[0][0]
    first_record, second_record = entry[0], entry[1]
    assert first_record.id == "idA"
    assert second_record.id == "idB"
    assert str(first_record.seq) == "TCCGA"
    assert str(second_record.seq) == "TACGCAGA"
    assert entry[3] == 4
    assert len(entry[2]) == 6
    assert entry[2][0] == ("__TCCGA_", "TACGCAGA")
Exemplo n.º 16
0
def test_too_few_arguments():
    """Negative test: calling run() with one argument too few must raise TypeError."""

    nw = NeedlemanWunsch()
    seq_fasta_1 = os.path.join('data', 'sequences', 'seq1.fasta')
    seq_fasta_2 = os.path.join('data', 'sequences', 'seq2.fasta')
    # Compared with the other calls in this test suite, one positional
    # argument is missing here, so Python itself rejects the call.
    with pytest.raises(TypeError):
        nw.run(seq_fasta_1, seq_fasta_2, 'pam250', False)
Exemplo n.º 17
0
 def compute_best_alignment_one_to_many(self, leaf: Node, alignment: MultiAlignment):
     """
     Align the sequence of a leaf against every sequence of an alignment and keep the best result.
     :param leaf: Node object which is a leaf
     :param alignment: MultiAlignment object
     :return: alignment (as a two-element list), index of best alignment, alignment score.
     """
     assert leaf.is_leaf()
     query = leaf.sequence
     targets = alignment.sequences
     aligner = NeedlemanWunsch(settings=self.nw_settings)

     top_alignment = None
     top_index = None
     top_score = None
     for position, target in enumerate(targets):
         outcome = aligner.run(query, target)
         # Strictly greater: on ties the earliest target wins.
         improved = top_score is None or outcome.score > top_score
         if improved:
             top_score, top_index = outcome.score, position
             top_alignment = outcome.alignments[0]
     aligned_pair = [top_alignment.sequence1, top_alignment.sequence2]
     return aligned_pair, top_index, top_score
 def test_create_distance_matrix(self):
     """Every pair of distinct sequences A, B, C from xpgma1.fa must have distance 4.0."""
     aligner = NeedlemanWunsch()
     records = utils.parse_fasta_files(["../data/xpgma/xpgma1.fa"])
     pairwise = aligner.pairwise_alignments(records)
     clusterer = Xpgma()
     clusterer.create_distance_matrix(pairwise)
     # Nested mapping label -> other label -> 4.0, without self-distances.
     labels = ('A', 'B', 'C')
     expected = {
         source: {target: 4.0 for target in labels if target != source}
         for source in labels
     }
     self.assertDictEqual(clusterer.distances, expected)
Exemplo n.º 19
0
    def convert_to_evolutionary_distances(pairwise_alignment_result: Result, similarity_scoring_method,
                                          nw_settings) -> float:
        r"""Convert the similarity score of a pairwise alignment into a distance score.

        Approximation used:

        D(a,b) = - log(S_{a,b}^{eff})
        S_{a,b}^{eff} = (S(a,b) - S_{rand}) / (S_{a,b}^{max} - S_{rand})
        S_{rand} = (1/|A|) * (sum_{x,y in \Sigma \times \Sigma} S(x,y) * N_a(x) * N_b(y)) + gaps(A) * S(-,*)
        S_{a,b}^{max} = (S(a,a) + S(b,b)) / 2

        :param pairwise_alignment_result: result whose first alignment supplies the two sequences
        :param similarity_scoring_method: SCORE2DISTANCE estimates S_rand by aligning shuffled
               copies of both sequences; SCORE2DISTANCE_EXTENDED uses the closed formula above
        :param nw_settings: settings passed to the NeedlemanWunsch instance used for rescoring
        :return: -log(S_eff), or 1 when S_eff is not positive
        :raises NotImplementedError: for any other similarity_scoring_method
        """
        alignment = pairwise_alignment_result.alignments[0]
        LOGGER.info("Converting similarity to evolutionary distances.")
        LOGGER.info("Alignment: %s" % alignment)

        # Work on gap-free copies so the aligned sequences themselves stay untouched.
        seq1 = copy.deepcopy(alignment.sequence1)
        seq1.seq = seq1.seq.replace("-", "")

        seq2 = copy.deepcopy(alignment.sequence2)
        seq2.seq = seq2.seq.replace("-", "")

        nw = NeedlemanWunsch(settings=nw_settings)
        s_ab = nw.run(seq1, seq2)
        s_aa = nw.run(seq1, seq1)
        s_bb = nw.run(seq2, seq2)

        s_max = (s_aa.score + s_bb.score) / 2

        if similarity_scoring_method == SimilarityScoringMethod.SCORE2DISTANCE_EXTENDED:
            letters = nw.alphabet.letters
            # Substitution score of every ordered letter pair, weighted by how often
            # each letter occurs in the respective gap-free sequence.
            weighted_pair_scores = sum(
                nw.score(x, y)
                * count_occurences_symbol_in_word(seq1.seq, x)
                * count_occurences_symbol_in_word(seq2.seq, y)
                for x in letters for y in letters)
            s_rand = (1 / len(alignment.sequence1)) * weighted_pair_scores \
                     + count_gaps_in_pairwise_alignment(alignment) * nw.gap_penalty
        elif similarity_scoring_method == SimilarityScoringMethod.SCORE2DISTANCE:
            # Copy the sequences so the shuffle does not permanently change them.
            seq1_shuffled = copy.deepcopy(seq1)
            seq2_shuffled = copy.deepcopy(seq2)
            # Shuffle the letters and align the shuffled copies.
            seq1_shuffled.seq = ''.join(random.sample(seq1.seq, len(seq1)))
            seq2_shuffled.seq = ''.join(random.sample(seq2.seq, len(seq2)))
            s_rand = (nw.run(seq1_shuffled, seq2_shuffled)).score
        else:
            raise NotImplementedError(
                    f'similarity_scoring_method {similarity_scoring_method} not supported/implemented.')
        # Prevent division by zero.
        if s_max == s_rand:
            s_rand = s_rand - 0.0001

        s_eff = (s_ab.score - s_rand) / (s_max - s_rand)

        # The logarithm is undefined for non-positive values; fall back to 1.
        if s_eff <= 0.0:
            score = 1
        else:
            score = - math.log(s_eff)
        LOGGER.info("New score: %.5f" % score)
        return score
Exemplo n.º 20
0
def test_example_success():
    """Positive test: run() on seq1/seq2 with 'pam250' returns the expected seven values."""

    aligner = NeedlemanWunsch()
    first_fasta = os.path.join('data', 'sequences', 'seq1.fasta')
    second_fasta = os.path.join('data', 'sequences', 'seq2.fasta')
    (id_seq1, seq1, id_seq2, seq2, score, alignments,
     num_alignments) = aligner.run(first_fasta, second_fasta, 'pam250', -8, True)
    print(alignments)

    # One alignment: two gapped sequences plus a per-column marker line.
    expected_alignments = [[
        'ILDMDVVEGSAARFDCKVEG-YPDPEVMWFKDDNPVKESRHFQIDYDEEGN',
        'RDPVKTHEGWGVMLPCNPPAHYPGLSYRWLLNEFPNFIPTD-GRHFVSQTT',
        ':::::::**::::::*:::: **:::::*:::::*:::::: :::::::::'
    ]]

    assert id_seq1 == "ID1"
    assert id_seq2 == "ID2"
    assert seq1 == "ILDMDVVEGSAARFDCKVEGYPDPEVMWFKDDNPVKESRHFQIDYDEEGN"
    assert seq2 == "RDPVKTHEGWGVMLPCNPPAHYPGLSYRWLLNEFPNFIPTDGRHFVSQTT"
    assert score == 31
    assert alignments == expected_alignments
    assert num_alignments == 1
Exemplo n.º 21
0
def test_similarity_to_distance_ext():
    """Check the gap count and the extended similarity-to-distance conversion of one alignment."""
    matrix = ScoringMatrix("data/test_scoring_similarity.txt",
                           is_distance_fn=False,
                           cost_gap_open=1)
    aligner = NeedlemanWunsch()
    fd = FengDoolittle()  # constructed as in the original; not used further
    aligned_pair = ("__TCCGA_", "TACGCAGA")
    distance = similarity_to_distance_ext(aligner, aligned_pair, matrix)

    gap_count = count_gaps_in_pairwise_alignment(aligned_pair)
    assert gap_count == 3

    # The right-hand side was computed by hand and is -log(S_eff).
    s_rand = -14 / 8
    expected = -math.log((2 - s_rand) / (6.5 - s_rand))
    assert distance == expected
Exemplo n.º 22
0
    def operation1(self, leaf1: Node, leaf2: Node) -> MultiAlignment:
        """
        Align the sequences of two leaves and wrap the first alignment in a MultiAlignment,
        after passing its sequences through replace_with_neutral_symbol
        (gap symbols become X, see the example below).
        :param leaf1: Node object
        :param leaf2: Node object
        :return: MultiAlignment object

        >>> from Bio.SeqRecord import SeqRecord
        >>> feng = FengDoolittle()
        >>> res = feng.operation1(leaf1=Node(sequence=SeqRecord("AAACGA"),name=None, cost=None),\
                            leaf2=Node(sequence=SeqRecord("AAA"), name=None,cost=None))
        >>> res.sequences[0].seq
        'AAACGA'
        >>> res.sequences[1].seq
        'XAAXXA'
        """
        assert leaf1.is_leaf() and leaf2.is_leaf()
        aligner = NeedlemanWunsch(settings=self.nw_settings)
        outcome = aligner.run(leaf1.sequence, leaf2.sequence)
        # Only the first alignment of the result is kept.
        first = outcome.alignments[0]
        merged = MultiAlignment(sequences=[first.sequence1, first.sequence2],
                                score=outcome.score)
        merged.sequences = replace_with_neutral_symbol(merged.sequences)
        return merged
Exemplo n.º 23
0
    def alignAndCombineGroups(self, group1, group2):
        """Align two groups (each a list of sequences) and merge them into one.

        Every sequence of ``group1`` is aligned against every sequence of
        ``group2`` with ``self.pairwiseAlignment``. The pair with the smallest
        distance is written back into the groups in aligned form, its gaps are
        propagated to the remaining members of each group via
        ``self.addEquivalentGaps``, and ``group2`` is appended to ``group1``.

        Both lists are modified in place.

        :param group1: list of sequences; extended with group2 and returned
        :param group2: list of sequences
        :return: group1, now holding the sequences of both groups
        :raises ValueError: if no pair with a comparable distance exists
            (for example when one of the groups is empty)
        """
        # Start from infinity so that the closest pair is always found,
        # whatever the magnitude of the distances.
        minDist = float("inf")
        minDistAlignment = None
        minIndex1 = None
        minIndex2 = None
        # Pairwise alignment between every pair of sequences; the similarity
        # score is converted to a distance inside pairwiseAlignment.
        for id1, seq1 in enumerate(group1):
            for id2, seq2 in enumerate(group2):
                dist, alignment = self.pairwiseAlignment(
                    seq1, seq2, self.subsMat, self.gapOpenCost)
                # The third entry of the alignment (match/mismatch markers)
                # is not needed.
                del alignment[2]
                if dist < minDist:
                    minDist = dist
                    # List of the two aligned sequences with minimum distance.
                    minDistAlignment = alignment
                    minIndex1 = id1
                    minIndex2 = id2
        if minDistAlignment is None:
            raise ValueError(
                "alignAndCombineGroups found no sequence pair to align: "
                "both groups must be non-empty")
        # Put the aligned sequences back at their original positions.
        group1[minIndex1] = minDistAlignment[0]
        group2[minIndex2] = minDistAlignment[1]
        # Propagate the gaps of the aligned sequence to the other members of
        # the same group, first for group1 ...
        for id1, seq1 in enumerate(group1):
            if id1 != minIndex1:
                group1[id1] = self.addEquivalentGaps(group1[minIndex1], seq1)
        # ... and then for group2.
        for id2, seq2 in enumerate(group2):
            if id2 != minIndex2:
                group2[id2] = self.addEquivalentGaps(group2[minIndex2], seq2)

        # Combine all sequences into a single group (extend group 1).
        group1.extend(group2)
        return group1
    def test_scoring_blossum(self):
        """Scoring matrix calculation with Blossum62 and a gap penalty of 6."""

        print("######### Testing calculation of scoring matrix. ###########")
        aligner = NeedlemanWunsch(
            ScoringSettings(substitution_matrix=MatrixInfo.blosum62,
                            gap_penalty=6))

        # Cases 1 and 2: tiny inputs whose complete matrix is compared.
        small_cases = (
            ("############# Case 1 ##############", "A", "A",
             [[0., -6.], [-6., 4.]]),
            ("############# Case 2 ##############", "A", "AT",
             [[0., -6., -12.], [-6., 4., -2.]]),
        )
        for banner, seq1, seq2, expected_rows in small_cases:
            print(banner)
            print("############# START ##############")
            aligner.calculate_scoring_matrix(seq1, seq2)
            print("SEQ1: %s" % seq1)
            print("SEQ2: %s" % seq2)
            print("RESULT:\n %s" % aligner.scoring_matrix)
            np.testing.assert_array_equal(aligner.scoring_matrix,
                                          np.array(expected_rows))
            print("############# FINISH ##############")

        # Case 3: long sequences, only the bottom-right cell is checked.
        print("############# Case 3 ##############")
        print("############# START ##############")
        seq1 = "ILDMDVVEGSAARFDCKVEGYPDPEVMWFKDDNPVKESRHFQIDYDEEGN"
        seq2 = "RDPVKTHEGWGVMLPCNPPAHYPGLSYRWLLNEFPNFIPTDGRHFVSQTT"
        aligner.calculate_scoring_matrix(seq1, seq2)
        print("SEQ1: %s" % seq1)
        print("SEQ2: %s" % seq2)
        print("RESULT:\n %s" % pprint.pformat(aligner.scoring_matrix))
        self.assertAlmostEqual(aligner.scoring_matrix[-1][-1], 4.0)
        print("############# FINISH ##############")
    def run(self,
            seq_fasta_fn,
            subst_matrix_fn,
            is_distance_fn,
            cost_gap_open,
            metrict_conversion_type,
            clustering):
        """
        Compute a multiple sequence alignment and its sum-of-pairs score.

        The guide tree comes from XPGMA.run, the alignment from
        self.compute_msa and the score from self.compute_sum_of_pairs_score.

        Args:
            seq_fasta_fn: path to fasta file containing sequences
            subst_matrix_fn: path to substitution matrix
            is_distance_fn (bool): If True, handle scoring matrix as distance measure, else similarity measure
            cost_gap_open: cost to open a gap
            metrict_conversion_type: forwarded unchanged to XPGMA.run and self.compute_msa
            clustering: clustering algorithm selector, forwarded to XPGMA.run

        Returns:
            tuple of
            (msa: the value returned by self.compute_msa,
             sum_of_pairs: the value returned by self.compute_sum_of_pairs_score for that msa)
        """

        # Only the first element returned by XPGMA.run is used.
        guide_tree, _ = XPGMA().run(seq_fasta_fn, subst_matrix_fn, is_distance_fn,
                                    cost_gap_open, metrict_conversion_type, clustering)

        aligner = NeedlemanWunsch()
        matrix = ScoringMatrix(subst_matrix_fn, is_distance_fn, cost_gap_open)

        msa = self.compute_msa(guide_tree, aligner, matrix, metrict_conversion_type)
        return msa, self.compute_sum_of_pairs_score(matrix, msa)
def _read_alpha_sequence(source):
    """Return the sequence denoted by *source*.

    If *source* is the path of an existing file, the file is read and only its
    alphabetic characters are kept; otherwise *source* is returned unchanged.
    """
    if not os.path.isfile(source):
        return source
    with open(source, 'r') as f:
        return ''.join(c for c in f.read() if c.isalpha())


def main(args):
    """Align two sequences with NeedlemanWunsch and report score, alignment, time and memory.

    Attributes read from *args*:
        path, file_name: optional output directory and the output file name;
            without a path the result is printed instead of written.
        alignment1, alignment2: the sequences, given literally or as file paths.
        delta: optional argument for ``read_delta``, which supplies delta and keys.
        match, mismatch, gap: scores, used only when no delta is given.
        keys: comma-separated symbols, required when no delta is given.

    Exits with status 1 when a sequence or the symbol list is missing.
    """
    import sys  # local import: only needed for the error exits below

    path = args.path
    if path is not None:
        # Also creates missing parent directories.
        os.makedirs(path, exist_ok=True)

    file_name = args.file_name

    if args.alignment1 is None or args.alignment2 is None:
        print("Missing one or more sequences to align with!")
        sys.exit(1)
    alignment1 = _read_alpha_sequence(args.alignment1)
    alignment2 = _read_alpha_sequence(args.alignment2)

    delta = None
    scoring = None
    keys = None
    if args.delta is not None:
        delta, keys = read_delta(args.delta)
    else:
        if args.match is not None and args.mismatch is not None and args.gap is not None:
            scoring = {
                'match': args.match,
                'mismatch': args.mismatch,
                'gap': args.gap
            }
        if args.keys is None:
            print("Symbols will be used by the sequences are missing!")
            sys.exit(1)
        keys = args.keys.split(',')
        # The gap symbol must always be part of the alphabet.
        if '-' not in keys:
            keys.append('-')

    hs = NeedlemanWunsch(scoring, keys, delta)
    tracemalloc.start()
    start_time = time.perf_counter()
    best_score, alignments = hs.align(alignment1, alignment2)
    current, peak = tracemalloc.get_traced_memory()
    end_time = time.perf_counter()
    print(
        f"Current memory usage is {current / 10 ** 6}MB; Peak was {peak / 10 ** 6}MB"
    )
    tracemalloc.stop()
    elapsed_time = end_time - start_time
    if path is None:
        print("Best Alignment Score:", best_score)
        print("Sequence 1: ", alignments[0])
        print("Sequence 2: ", alignments[1])
        print("Alignment is done in %.4f seconds!" % elapsed_time)
    else:
        output_file = os.path.join(path, file_name)
        with open(output_file, 'w+') as f:
            f.write("Best Alignment Score: %s \n" % str(best_score))
            f.write("Sequence 1: %s \n" % alignments[0])
            f.write("Sequence 2: %s \n" % alignments[1])
        print("Alignment is done in %.4f seconds!" % elapsed_time)
        # Report the path the file was actually written to.
        print("Result saved at %s" % output_file)
Exemplo n.º 27
0
    def run(self, seq_fasta_fn, subst_matrix_fn, is_distance_fn, cost_gap_open,
            metrict_conversion_type, clustering):
        """
            Computes a XPGMA

            Args:
              seq_fasta_fn (str): The relative path to a fasta file
              subst_matrix_fn (str): The relative path to a scoring matrix file
              is_distance_fn (bool): If True, handle scoring matrix as distance measure, else similarity measure
              cost_gap_open (int): gap cost open
              metrict_conversion_type (int): only consulted when the scoring matrix is
                  not a distance metric; 0 negates the score, 1 uses
                  similarity_to_distance, 2 uses similarity_to_distance_ext
              clustering (str): either "upgma" or "wpgma"

            Returns:
                The value returned by generate_wpgma / generate_upgma
                (presumably the root node and the index-to-Node mapping,
                TODO confirm); None for any other clustering value.
            """
        scoring_matrix = ScoringMatrix(subst_matrix_fn, is_distance_fn,
                                       cost_gap_open)
        seq_records = parse_fasta(seq_fasta_fn)
        num_seqs = len(seq_records)

        # cluster distance matrix, containing pairwise distance information;
        # additional num_seqs - 1 rows/columns are reserved for merged clusters
        m_size = 2 * num_seqs - 1
        m = [[0 for _ in range(m_size)] for _ in range(m_size)]

        # iteration list, containing the matrix row/col indices of the current
        # clusters; this avoids having to clean the matrix after a merge
        l = list(range(num_seqs))  # initially only singleton clusters

        # cluster distance matrix index to Node mapping
        n = dict(enumerate(Node(record) for record in seq_records))

        # compute pairwise distances using NW
        # Note: no check if matrix is distance matrix
        nw = NeedlemanWunsch()

        result, info = nw.run(seq_fasta_fn, seq_fasta_fn, subst_matrix_fn,
                              is_distance_fn, cost_gap_open, False)

        # initialize cluster distance matrix with computed distances;
        # result[i][j][3] is used as the score and result[i][j][2][0] as the
        # alignment handed to the conversion helpers
        for i in range(num_seqs):
            for j in range(i + 1, num_seqs):
                if scoring_matrix.metric_type == MetricType.DISTANCE:
                    m[i][j] = result[i][j][3]
                elif metrict_conversion_type == 0:
                    m[i][j] = -result[i][j][3]
                elif metrict_conversion_type == 1:
                    m[i][j] = similarity_to_distance(nw, result[i][j][2][0],
                                                     scoring_matrix)
                elif metrict_conversion_type == 2:
                    # Assignment, not comparison: store the converted distance.
                    m[i][j] = similarity_to_distance_ext(
                        nw, result[i][j][2][0], scoring_matrix)

        if clustering == "wpgma":
            return self.generate_wpgma(m, l, n)
        elif clustering == "upgma":
            return self.generate_upgma(m, l, n)
def test_instance():
    """NeedlemanWunsch must derive from NeedlemanWunschBase, as a class and as an instance."""
    assert issubclass(NeedlemanWunsch, NeedlemanWunschBase)
    instance = NeedlemanWunsch()
    assert isinstance(instance, NeedlemanWunschBase)