예제 #1
0
    def convert_to_evolutionary_distances(pairwise_alignment_result: Result, similarity_scoring_method,
                                          nw_settings) -> float:
        r"""Convert the similarity score of a pairwise alignment into a distance score.

        The approximation used is

            D(a,b) = - log(S_{a,b}^{eff})
            S_{a,b}^{eff} = (S(a,b) - S_{rand}) / (S_{a,b}^{max} - S_{rand})
            S_{rand} = (1/|A|) * (sum_{x,y in \Sigma \times \Sigma} S(x,y) * N_a(x) * N_b(y)) + gaps(A) * S(-,*)
            S_{a,b}^{max} = (S(a,a) + S(b,b)) / 2

        Only the first alignment of ``pairwise_alignment_result`` is used. Gap
        symbols ("-") are stripped from copies of both aligned sequences, and the
        copies are re-aligned (against each other and against themselves) with a
        fresh ``NeedlemanWunsch`` instance built from ``nw_settings``.

        :param pairwise_alignment_result: result object whose ``alignments[0]``
            provides ``sequence1`` and ``sequence2``
        :param similarity_scoring_method: ``SCORE2DISTANCE_EXTENDED`` computes
            S_rand with the closed formula above; ``SCORE2DISTANCE`` estimates it
            by aligning randomly shuffled copies of both sequences
        :param nw_settings: settings handed to ``NeedlemanWunsch``
        :return: ``-log(S_eff)``, or 1 when ``S_eff`` is not positive
        :raises NotImplementedError: for any other ``similarity_scoring_method``
        """
        alignment = pairwise_alignment_result.alignments[0]
        LOGGER.info("Converting similarity to evolutionary distances.")
        # Pass the argument separately so the message is only formatted when emitted.
        LOGGER.info("Alignment: %s", alignment)

        # Work on copies so the sequences stored in the alignment keep their gaps.
        seq1 = copy.deepcopy(alignment.sequence1)
        seq1.seq = seq1.seq.replace("-", "")

        seq2 = copy.deepcopy(alignment.sequence2)
        seq2.seq = seq2.seq.replace("-", "")

        nw = NeedlemanWunsch(settings=nw_settings)
        s_ab = nw.run(seq1, seq2)
        s_aa = nw.run(seq1, seq1)
        s_bb = nw.run(seq2, seq2)

        s_max = (s_aa.score + s_bb.score) / 2

        if similarity_scoring_method == SimilarityScoringMethod.SCORE2DISTANCE_EXTENDED:
            letters = nw.alphabet.letters
            # The per-symbol counts do not depend on the other symbol of the
            # pair, so compute them once instead of once per (x, y) combination.
            counts1 = [count_occurences_symbol_in_word(seq1.seq, letter) for letter in letters]
            counts2 = [count_occurences_symbol_in_word(seq2.seq, letter) for letter in letters]
            expected_pair_score = sum(nw.score(x, y) * n_x * n_y
                                      for x, n_x in zip(letters, counts1)
                                      for y, n_y in zip(letters, counts2))
            # |A| is the length of the (gapped) aligned sequence.
            s_rand = (1 / len(alignment.sequence1)) * expected_pair_score \
                + count_gaps_in_pairwise_alignment(alignment) * nw.gap_penalty
        elif similarity_scoring_method == SimilarityScoringMethod.SCORE2DISTANCE:
            # Copy the sequences so the shuffling does not permanently change them.
            seq1_shuffled = copy.deepcopy(seq1)
            seq2_shuffled = copy.deepcopy(seq2)
            # Shuffle the letters; the result is not deterministic.
            seq1_shuffled.seq = ''.join(random.sample(seq1.seq, len(seq1)))
            seq2_shuffled.seq = ''.join(random.sample(seq2.seq, len(seq2)))
            s_rand = (nw.run(seq1_shuffled, seq2_shuffled)).score
        else:
            raise NotImplementedError(
                    f'similarity_scoring_method {similarity_scoring_method} not supported/implemented.')

        # Prevent a division by zero by nudging s_rand away from s_max.
        if s_max == s_rand:
            s_rand = s_rand - 0.0001

        s_eff = (s_ab.score - s_rand) / (s_max - s_rand)

        # The logarithm is undefined for non-positive values; fall back to 1.
        if s_eff <= 0.0:
            score = 1
        else:
            score = - math.log(s_eff)
        LOGGER.info("New score: %.5f", score)
        return score
예제 #2
0
 def compute_best_alignment_many_to_many(self, alignment1: MultiAlignment, alignment2: MultiAlignment):
     """
     Align every sequence of one multi-alignment against every sequence of another
     and report the highest scoring pair.

     Ties are resolved in favour of the pair that was encountered first.

     :param alignment1: MultiAlignment object
     :param alignment2: MultiAlignment object
     :return: [aligned sequence 1, aligned sequence 2] of the best pair, its index in
              alignment1, its index in alignment2, the best score, and the sum of all
              pairwise scores
     """
     top_alignment = None
     top_row = None
     top_col = None
     top_score = None
     total_score = 0
     left_sequences = alignment1.sequences
     right_sequences = alignment2.sequences
     aligner = NeedlemanWunsch(settings=self.nw_settings)
     for row, left in enumerate(left_sequences):
         for col, right in enumerate(right_sequences):
             outcome = aligner.run(left, right)
             # Remember this pair only when it strictly beats the best one so far.
             if top_score is None or outcome.score > top_score:
                 top_score = outcome.score
                 top_alignment = outcome.alignments[0]
                 top_row, top_col = row, col
             # The overall score accumulates every pairwise score.
             total_score += outcome.score
     best_pair = [top_alignment.sequence1, top_alignment.sequence2]
     return best_pair, top_row, top_col, top_score, total_score
def test_guideline_blosum():
    """Check the all-against-all alignments of the guideline sequences (04.02.2019)."""
    aligner = NeedlemanWunsch()

    result, _info = aligner.run("data/xpgma_guideline.fasta",
                                "data/xpgma_guideline.fasta", "data/blosum62.txt",
                                False, 6, True)

    # The result is indexed as an upper triangle matrix of shape n x n. Each row
    # below is: (i, j), expected score, expected number of alignments, and the
    # expected first alignment.
    expectations = [
        ((0, 1), 4, 8, (
            'ILDMDVVEGSAARFDCKVEG_YPDPEVMWFKDDNP__V_KESRHFQIDYDEEGN',
            'RDPVKTHEGWGVMLPCNPPAHYPGLSYRWLLNEFPNFIPTDGRHF_V__SQT_T')),
        ((0, 2), 37, 4, (
            'ILDMDVVEGSAARFDCKVEGYPDPEVMWFKDDNPVKESRHFQIDYDEEGN',
            'ISDTEADIGSNLRWGCAAAGKPRPMVRWLRNGEPL_ASQN_RVEV__LA_')),
        ((0, 3), -4, 1, (
            'ILDMDVVEGSAARFDCKVEGYPDPEVMWFKDDNPVKESRHFQIDYDEEGN',
            'RRLIPAARGGEISILCQPRAAPKATILWSKGTEILGNSTRVTVTSD____')),
        ((1, 2), 3, 1, (
            'RDPVKTHEGWGVMLPCNPPAHYPGLSYRWLLNEFPNFIPTDGRHFVSQTT',
            'ISDTEADIGSNLRWGC_AAAGKPRPMVRWLRNGEP__LASQNR__VEVLA')),
        ((1, 3), 9, 2, (
            'RDPVKTHEGWGVMLPCNPPAHYPGLSYRWLLNEFPNFIPTDGRHFVSQTT',
            'RRLIPAARGGEISILCQPRA_APKATILW__SKGTEILGNSTRVTVT_SD')),
        ((2, 3), 24, 1, (
            'ISDTEADIGSNLRWGCAAAGKPRPMVRWLRNGEPLASQNRVEVLA_',
            'RRLIPAARGGEISILCQPRAAPKATILWSKGTEILGNSTRVTVTSD')),
    ]

    for (row, col), expected_score, expected_count, expected_first in expectations:
        entry = result[row][col]
        assert entry[3] == expected_score
        assert len(entry[2]) == expected_count
        assert entry[2][0] == expected_first
def test_example_distance():
    """Align the two example sequences with the distance scoring matrix."""
    aligner = NeedlemanWunsch()
    result, _info = aligner.run("data/sequence1.fasta", "data/sequence2.fasta",
                                "data/test_scoring_distance.txt", True, 1, True)

    # Only one pair of sequences is compared, so everything lives in cell (0, 0).
    entry = result[0][0]

    assert entry[0].id == "idA"
    assert entry[1].id == "idB"
    assert str(entry[0].seq) == "TCCGA"
    assert str(entry[1].seq) == "TACGCAGA"
    assert entry[3] == -2

    found_alignments = entry[2]
    assert len(found_alignments) == 1
    assert found_alignments[0] == ("T_C_C_GA", "TACGCAGA")
def test_example():
    """Exercise the dummy implementation with fixed example values."""
    aligner = NeedlemanWunsch()
    outcome = aligner.run("data/sequence1.fa", "data/sequence2.fa",
                          "data/blosum62.txt", 5, False)

    # The result is a 6-tuple; unpacking fails if its size ever changes.
    first_id, first_seq, second_id, second_seq, total, found = outcome

    assert first_id == "idA"
    assert second_id == "idB"
    assert first_seq == "FancySequenceA"
    assert second_seq == "FancysequenceB"
    assert total == 1000
    assert found[0] == ("Fancy_SequenceA_", "Fancys_equence_B")
예제 #6
0
def test_example_invalid_characters_fail():
    """Negative test: ``run`` must exit when an input holds invalid characters.

    ``Invalid_characters.fasta`` (passed as the first sequence file) contains
    non-amino acid characters; the program is expected to terminate with
    ``SystemExit`` and error code 12.
    """
    nw = NeedlemanWunsch()
    seq_fasta_1 = os.path.join('data', 'sequences', 'seq1.fasta')
    seq_fasta_2 = os.path.join('data', 'sequences', 'Invalid_characters.fasta')
    with pytest.raises(SystemExit) as exc_info:
        nw.run(seq_fasta_2, seq_fasta_1, 'pam250', -8, False)

    # These checks must live outside the ``with`` block: statements placed
    # after the raising call inside the block are never executed.
    assert exc_info.type == SystemExit
    # The exit status is stored on the exception instance, not on ExceptionInfo.
    assert exc_info.value.code == 12
예제 #7
0
def test_example_invalid_format_fail():
    """Negative test: ``run`` must exit when an input file is not valid FASTA.

    The first line of ``Invalid_format.fasta`` does not start with ``>``; the
    program is expected to terminate with ``SystemExit`` and error code 1.
    """
    nw = NeedlemanWunsch()
    seq_fasta_1 = os.path.join('data', 'sequences', 'seq1.fasta')
    seq_fasta_2 = os.path.join('data', 'sequences', 'Invalid_format.fasta')
    with pytest.raises(SystemExit) as exc_info:
        nw.run(seq_fasta_1, seq_fasta_2, 'pam250', -8, False)

    # These checks must live outside the ``with`` block: statements placed
    # after the raising call inside the block are never executed.
    assert exc_info.type == SystemExit
    # The exit status is stored on the exception instance, not on ExceptionInfo.
    assert exc_info.value.code == 1
def test_example_similarity():
    """Align the two example sequences with the similarity scoring matrix."""
    aligner = NeedlemanWunsch()

    result, _info = aligner.run("data/sequence1.fasta", "data/sequence2.fasta",
                                "data/test_scoring_similarity.txt", True, 1, True)

    # Only one pair of sequences is compared, so everything lives in cell (0, 0).
    entry = result[0][0]

    assert entry[0].id == "idA"
    assert entry[1].id == "idB"
    assert str(entry[0].seq) == "TCCGA"
    assert str(entry[1].seq) == "TACGCAGA"
    assert entry[3] == 4

    found_alignments = entry[2]
    assert len(found_alignments) == 6
    assert found_alignments[0] == ("__TCCGA_", "TACGCAGA")
예제 #9
0
def test_too_few_arguments():
    """Negative test: calling ``run`` with one argument missing must fail.

    Python raises ``TypeError`` when a required positional argument is not
    supplied, so the call must neither succeed nor end in ``SystemExit``.
    """
    nw = NeedlemanWunsch()
    seq_fasta_1 = os.path.join('data', 'sequences', 'seq1.fasta')
    seq_fasta_2 = os.path.join('data', 'sequences', 'seq2.fasta')
    # The gap cost argument is deliberately left out.
    with pytest.raises(TypeError):
        nw.run(seq_fasta_1, seq_fasta_2, 'pam250', False)
예제 #10
0
 def compute_best_alignment_one_to_many(self, leaf: Node, alignment: MultiAlignment):
     """
     Align the sequence of a leaf against every sequence of a multi-alignment and
     report the highest scoring pairing.

     Ties are resolved in favour of the sequence that was encountered first.

     :param leaf: Node object which is a leaf
     :param alignment: MultiAlignment object
     :return: [aligned leaf sequence, aligned partner sequence], index of the partner
              in the multi-alignment, alignment score.
     """
     assert leaf.is_leaf()
     top_alignment = None
     top_index = None
     top_score = None
     query = leaf.sequence
     candidates = alignment.sequences
     aligner = NeedlemanWunsch(settings=self.nw_settings)
     for position, candidate in enumerate(candidates):
         outcome = aligner.run(query, candidate)
         # Skip anything that does not strictly beat the best score so far.
         if top_score is not None and not outcome.score > top_score:
             continue
         top_score = outcome.score
         top_alignment = outcome.alignments[0]
         top_index = position
     best_pair = [top_alignment.sequence1, top_alignment.sequence2]
     return best_pair, top_index, top_score
예제 #11
0
def test_example_success():
    """Positive test: run Needleman-Wunsch on two valid files and check the result."""
    first_path = os.path.join('data', 'sequences', 'seq1.fasta')
    second_path = os.path.join('data', 'sequences', 'seq2.fasta')
    # Expected single alignment: both gapped sequences plus the match line.
    expected_alignments = [[
        'ILDMDVVEGSAARFDCKVEG-YPDPEVMWFKDDNPVKESRHFQIDYDEEGN',
        'RDPVKTHEGWGVMLPCNPPAHYPGLSYRWLLNEFPNFIPTD-GRHFVSQTT',
        ':::::::**::::::*:::: **:::::*:::::*:::::: :::::::::'
    ]]

    aligner = NeedlemanWunsch()
    outcome = aligner.run(first_path, second_path, 'pam250', -8, True)
    # The result is a 7-tuple; unpacking fails if its size ever changes.
    (first_id, first_seq, second_id, second_seq,
     total, alignments, alignment_count) = outcome
    print(alignments)

    assert first_id == "ID1"
    assert second_id == "ID2"
    assert first_seq == "ILDMDVVEGSAARFDCKVEGYPDPEVMWFKDDNPVKESRHFQIDYDEEGN"
    assert second_seq == "RDPVKTHEGWGVMLPCNPPAHYPGLSYRWLLNEFPNFIPTDGRHFVSQTT"
    assert total == 31
    assert alignments == expected_alignments
    assert alignment_count == 1
예제 #12
0
    def operation1(self, leaf1: Node, leaf2: Node) -> MultiAlignment:
        """
        Align the sequences of two leaves pairwise and wrap the first resulting
        alignment in a MultiAlignment whose sequences have been passed through
        replace_with_neutral_symbol (gap symbols become X, see the example).
        :param leaf1: Node object
        :param leaf2: Node object
        :return: MultiAlignment object

        >>> from Bio.SeqRecord import SeqRecord
        >>> feng = FengDoolittle()
        >>> res = feng.operation1(leaf1=Node(sequence=SeqRecord("AAACGA"),name=None, cost=None),\
                            leaf2=Node(sequence=SeqRecord("AAA"), name=None,cost=None))
        >>> res.sequences[0].seq
        'AAACGA'
        >>> res.sequences[1].seq
        'XAAXXA'
        """
        # Both inputs have to be leaves.
        assert leaf1.is_leaf() and leaf2.is_leaf()
        aligner = NeedlemanWunsch(settings=self.nw_settings)
        outcome = aligner.run(leaf1.sequence, leaf2.sequence)
        # Only the first alignment reported by the aligner is kept.
        first = outcome.alignments[0]
        aligned_pair = [first.sequence1, first.sequence2]
        wrapped = MultiAlignment(sequences=aligned_pair, score=outcome.score)
        wrapped.sequences = replace_with_neutral_symbol(wrapped.sequences)
        return wrapped
예제 #13
0
    def run(self, seq_fasta_fn, subst_matrix_fn, is_distance_fn, cost_gap_open,
            metrict_conversion_type, clustering):
        """
            Computes a XPGMA

            Pairwise Needleman-Wunsch scores of all sequences in the fasta file
            are turned into distances and handed to the requested clustering.

            Args:
              seq_fasta_fn (str): The relative path to a fasta file
              subst_matrix_fn (str): The relative path to a scoring matrix file
              is_distance_fn (bool): If True, handle scoring matrix as distance measure, else similarity measure
              cost_gap_open (int): gap cost open
              metrict_conversion_type (int): how a similarity score is converted
                into a distance when the scoring matrix is not a distance
                measure: 0 negates the score, 1 uses similarity_to_distance,
                2 uses similarity_to_distance_ext. For any other value the
                matrix entries stay 0.
              clustering (str): either "upgma" or "wpgma"

            Returns:
                Whatever generate_wpgma / generate_upgma returns (according to
                the original documentation the root Node of the XPGMA);
                None when ``clustering`` is neither "wpgma" nor "upgma".
            """
        scoring_matrix = ScoringMatrix(subst_matrix_fn, is_distance_fn,
                                       cost_gap_open)
        seq_records = parse_fasta(seq_fasta_fn)
        num_seqs = len(seq_records)

        # cluster distance matrix, containing pairwise distance information;
        # the additional num_seqs - 1 rows/cols are used when merging clusters
        m_size = 2 * num_seqs - 1
        m = [[0] * m_size for _ in range(m_size)]

        # iteration list, containing the matrix row/col indices of the current
        # clusters (initially only singleton clusters); this is used to avoid
        # having to clean the matrix after a merge
        active_indices = list(range(num_seqs))

        # cluster distance matrix index to Node mapping
        nodes = {index: Node(record) for index, record in enumerate(seq_records)}

        # compute pairwise distances using NW
        # Note: no check if matrix is distance matrix
        nw = NeedlemanWunsch()

        result, _info = nw.run(seq_fasta_fn, seq_fasta_fn, subst_matrix_fn,
                               is_distance_fn, cost_gap_open, False)

        # initialize the upper triangle of the cluster distance matrix
        for i in range(num_seqs):
            for j in range(i + 1, num_seqs):
                pairwise = result[i][j]
                if scoring_matrix.metric_type == MetricType.DISTANCE:
                    m[i][j] = pairwise[3]
                elif metrict_conversion_type == 0:
                    m[i][j] = -pairwise[3]
                elif metrict_conversion_type == 1:
                    m[i][j] = similarity_to_distance(nw, pairwise[2][0],
                                                     scoring_matrix)
                elif metrict_conversion_type == 2:
                    # Assignment, not comparison: the original `==` discarded
                    # the computed distance and left the entry at 0.
                    m[i][j] = similarity_to_distance_ext(
                        nw, pairwise[2][0], scoring_matrix)

        if clustering == "wpgma":
            return self.generate_wpgma(m, active_indices, nodes)
        elif clustering == "upgma":
            return self.generate_upgma(m, active_indices, nodes)