Exemplo n.º 1
0
def parse_matrix(matrix_str, alphabet):
    """
    Create a substitution matrix from a file path or a matrix name.

    Parameters
    ----------
    matrix_str : str
        Either the path of an existing file, whose content is parsed
        with ``SubstitutionMatrix.dict_from_str()``, or a matrix name.
        A name is converted to upper case before it is looked up in
        ``SubstitutionMatrix.list_db()``.
    alphabet
        The alphabet that is used for both axes of the matrix.

    Returns
    -------
    SubstitutionMatrix
        The matrix built from the file content or the database name.

    Raises
    ------
    InputError
        If `matrix_str` is neither an existing file nor a name listed
        in the matrix database.
    """
    if not isfile(matrix_str):
        # Not an existing file -> interpret the string as NCBI matrix name
        db_name = matrix_str.upper()
        if db_name in align.SubstitutionMatrix.list_db():
            return align.SubstitutionMatrix(alphabet, alphabet, db_name)
        raise InputError(
            f"'{matrix_str}' is neither a file "
            f"nor a valid NCBI substitution matrix"
        )

    with open(matrix_str) as matrix_file:
        file_content = matrix_file.read()
    parsed_scores = align.SubstitutionMatrix.dict_from_str(file_content)
    return align.SubstitutionMatrix(alphabet, alphabet, parsed_scores)
Exemplo n.º 2
0
def test_matrix_str():
    """
    Check the string representation of a small, hand-made substitution
    matrix: the symbols of the first alphabet label the rows and the
    symbols of the second alphabet label the columns.
    """
    row_alphabet = seq.Alphabet("abc")
    column_alphabet = seq.Alphabet("def")
    scores = np.arange(9).reshape((3, 3))
    matrix = align.SubstitutionMatrix(row_alphabet, column_alphabet, scores)
    expected = (
        "    d   e   f\n"
        "a   0   1   2\n"
        "b   3   4   5\n"
        "c   6   7   8"
    )
    assert str(matrix) == expected
Exemplo n.º 3
0
def test_custom_sequence_type(app_cls):
    """
    Align two sequences over a non-character alphabet with the given
    application class and compare the result with the expected
    sequences and alignment trace.
    """
    alph = seq.Alphabet(("foo", "bar", 42))
    symbol_lists = [
        ["foo", "bar", 42, "foo", "foo", 42, 42],
        ["foo", 42, "foo", "bar", "foo", 42, 42],
    ]
    sequences = [
        seq.GeneralSequence(alph, symbols) for symbols in symbol_lists
    ]
    # One row per alignment column, -1 marks a gap
    exp_trace = [
        [0, 0], [1, -1], [2, 1], [3, 2], [-1, 3], [4, 4], [5, 5], [6, 6],
    ]
    # Strong identity matrix: 1000 on the diagonal, -1000 elsewhere
    n_symbols = len(alph)
    score_matrix = np.full((n_symbols, n_symbols), -1000.0)
    np.fill_diagonal(score_matrix, 1000.0)
    matrix = align.SubstitutionMatrix(alph, alph, score_matrix)

    app = app_cls(sequences, matrix=matrix)
    app.start()
    app.join()
    result = app.get_alignment()
    assert result.sequences == sequences
    assert result.trace.tolist() == exp_trace
Exemplo n.º 4
0
def test_matrices(db_entry):
    """
    Test reading of matrix files.

    The test only requires that a matrix can be constructed from
    `db_entry` for the protein alphabet; no further property is checked.
    """
    protein_alphabet = seq.ProteinSequence.alphabet
    align.SubstitutionMatrix(protein_alphabet, protein_alphabet, db_entry)
Exemplo n.º 5
0
def matrices(name):
    """
    Load a protein substitution matrix by its name.

    A SubstitutionMatrix maps each possible pairing of a symbol of a
    first alphabet with a symbol of a second alphabet to a score (int).

    Parameters
    ----------
    name : str
        Name of the matrix.
        ``"BLOSUM62"`` is obtained via
        ``SubstitutionMatrix.std_protein_matrix()``.
        Any other name is passed unchanged to the ``SubstitutionMatrix``
        constructor together with the protein alphabet.
        This function does NOT fall back to BLOSUM62 for unknown
        names: whatever the constructor does for such a name (presumably
        raising an exception - confirm in the library documentation)
        is propagated to the caller.

    Returns
    -------
    SubstitutionMatrix
        The class uses a 2-D (m x n) ndarray, where each element stores
        the score for a symbol pairing, indexed by the symbol codes of
        the respective symbols in an m-length alphabet 1 and an
        n-length alphabet 2.
    """
    if name == "BLOSUM62":
        return seq_align.SubstitutionMatrix.std_protein_matrix()

    alph = seq.ProteinSequence.alphabet
    return seq_align.SubstitutionMatrix(alph, alph, name)
Exemplo n.º 6
0
def test_custom_substitution_matrix(sequences, app_cls):
    """
    Run the given application class with a custom substitution matrix
    and compare the string representation of the resulting alignment
    with the expected one.
    """
    alph = seq.ProteinSequence.alphabet
    # Strong identity matrix: 1000 on the diagonal, 0 elsewhere
    strong_identity = 1000 * np.eye(len(alph))
    matrix = align.SubstitutionMatrix(alph, alph, strong_identity)
    exp_ali = "\n".join(["BI-QTITE", "TITANITE", "BI-SMITE", "-I-QLITE"])

    app = app_cls(sequences, matrix=matrix)
    app.start()
    app.join()
    result = app.get_alignment()
    assert str(result) == exp_ali
Exemplo n.º 7
0
def parse_matrix(matrix_str, alphabet):
    """
    Create a substitution matrix from a file path or a matrix name.

    Parameters
    ----------
    matrix_str : str
        Either the path of an existing file, whose content is parsed
        with ``SubstitutionMatrix.dict_from_str()``, or the name of a
        matrix listed by ``SubstitutionMatrix.list_db()``.
        For user convenience the name is matched case-insensitively.
    alphabet
        The alphabet that is used for both axes of the matrix.

    Returns
    -------
    SubstitutionMatrix
        The matrix built from the file content or the database name.

    Raises
    ------
    InputError
        If `matrix_str` is neither an existing file nor matches a name
        in the matrix database.
    """
    if isfile(matrix_str):
        with open(matrix_str) as f:
            matrix_dict = align.SubstitutionMatrix.dict_from_str(f.read())
        return align.SubstitutionMatrix(alphabet, alphabet, matrix_dict)

    # String is a NCBI matrix name
    # For user convenience there is no case sensitivity
    # -> Find the database name with the same upper case spelling
    matrix_list = align.SubstitutionMatrix.list_db()
    upper_matrix_list = [m.upper() for m in matrix_list]
    upper_matrix_str = matrix_str.upper()
    try:
        index = upper_matrix_list.index(upper_matrix_str)
    except ValueError:
        # Only the failed lookup is translated into an InputError;
        # the 'not in list' context is of no use for the user
        raise InputError(
            f"'{matrix_str}' is neither a file "
            f"nor a valid NCBI substitution matrix"
        ) from None
    return align.SubstitutionMatrix(alphabet, alphabet, matrix_list[index])
Exemplo n.º 8
0
def test_invalid_scoring_scheme():
    """
    Check if `from_samples()` raises an exception when the expected
    similarity score between two random symbols is positive.
    """
    alph = seq.ProteinSequence.alphabet
    n_symbols = len(alph)
    # A matrix of ones scores every symbol pairing positively
    all_ones = np.ones((n_symbols, n_symbols), dtype=int)
    matrix = align.SubstitutionMatrix(alph, alph, all_ones)
    # Uniform background frequencies
    freq = np.ones(n_symbols)

    with pytest.raises(ValueError):
        EValueEstimator.from_samples(alph, matrix, -10, freq)
Exemplo n.º 9
0
def test_score_scaling(sequences):
    """
    Scaling the substitution scores and gap penalties by a constant
    factor should not influence the obtained E-values.
    Test this by aligning real sequences with a standard and scaled
    scoring scheme and comparing the calculated E-values of these
    alignments.
    """
    SCALING_FACTOR = 1000
    GAP_PENALTY = (-12, -1)
    SEQ_LENGTH = 300

    alphabet = seq.ProteinSequence.alphabet

    def log_evalues_for(score_matrix, gap_penalty):
        # The estimator is sampled first, then the nine pairs of
        # neighboring sequences are locally aligned with the same scheme
        estimator = align.EValueEstimator.from_samples(
            alphabet, score_matrix, gap_penalty, BACKGROUND
        )
        pair_scores = []
        for i in range(9):
            alignments = align.align_optimal(
                sequences[i],
                sequences[i + 1],
                score_matrix,
                gap_penalty,
                local=True,
                max_number=1,
            )
            pair_scores.append(alignments[0].score)
        return estimator.log_evalue(pair_scores, SEQ_LENGTH, SEQ_LENGTH)

    matrix = align.SubstitutionMatrix.std_protein_matrix()

    np.random.seed(0)
    std_log_evalues = log_evalues_for(matrix, GAP_PENALTY)

    scaled_matrix = align.SubstitutionMatrix(
        alphabet, alphabet, matrix.score_matrix() * SCALING_FACTOR
    )
    scaled_gap_penalty = tuple(
        penalty * SCALING_FACTOR for penalty in GAP_PENALTY
    )
    scaled_log_evalues = log_evalues_for(scaled_matrix, scaled_gap_penalty)

    # Due to relatively low sample size, expect rather large deviation
    expected = pytest.approx(scaled_log_evalues.tolist(), rel=0.2)
    assert std_log_evalues.tolist() == expected
Exemplo n.º 10
0
def test_matrix_str():
    """
    Test conversion of substitution matrix to string via a small
    constructed test case.
    """
    row_alphabet = seq.Alphabet("abc")
    column_alphabet = seq.Alphabet("def")
    scores = np.arange(9).reshape((3, 3))
    matrix = align.SubstitutionMatrix(row_alphabet, column_alphabet, scores)
    # Header line with the second alphabet, then one line per symbol
    # of the first alphabet
    expected = (
        "    d   e   f\n"
        "a   0   1   2\n"
        "b   3   4   5\n"
        "c   6   7   8"
    )
    assert str(matrix) == expected
Exemplo n.º 11
0
def test_custom_substitution_matrix(sequences, app_cls):
    """
    Run the given application class with a custom substitution matrix
    and compare the string representation of the resulting alignment
    with the expected one.

    The test is skipped if the program belonging to `app_cls` is not
    installed or if creating the application raises a `VersionError`.
    """
    bin_path = BIN_PATH[app_cls]
    if is_not_installed(bin_path):
        pytest.skip(f"'{bin_path}' is not installed")

    alph = seq.ProteinSequence.alphabet
    # Strong identity matrix: 1000 on the diagonal, 0 elsewhere
    strong_identity = 1000 * np.eye(len(alph))
    matrix = align.SubstitutionMatrix(alph, alph, strong_identity)
    exp_ali = "\n".join(["BI-QTITE", "TITANITE", "BI-SMITE", "-I-QLITE"])

    try:
        app = app_cls(sequences, matrix=matrix)
    except VersionError:
        pytest.skip("Invalid software version")
    app.start()
    app.join()
    result = app.get_alignment()
    assert str(result) == exp_ali
Exemplo n.º 12
0
def test_distribution_param(matrix_name, gap_penalty, ref_lam, ref_k):
    """
    Check if `EValueEstimator` estimates the extreme value distribution
    parameters correctly by comparing them to the parameters described
    in the original publication by Altschul *et al*.
    """
    SAMPLE_LENGTH = 500
    SAMPLE_SIZE = 1000

    alphabet = seq.ProteinSequence.alphabet
    matrix = align.SubstitutionMatrix(alphabet, alphabet, matrix_name)

    # Fixed seed, so that the sampling is reproducible
    np.random.seed(0)
    sampling_args = (
        alphabet, matrix, gap_penalty, BACKGROUND, SAMPLE_LENGTH, SAMPLE_SIZE
    )
    estimator = align.EValueEstimator.from_samples(*sampling_args)

    # Due to relatively low sample size, expect rather large deviation
    expected_lam = pytest.approx(ref_lam, rel=0.1)
    expected_k = pytest.approx(ref_k, rel=0.6)
    assert estimator.lam == expected_lam
    assert estimator.k == expected_k
Exemplo n.º 13
0
def _convert_to_uint16_code(seq1, seq2, matrix):
    """
    Adjust sequences, so that they use 'uint16' as dtype for the
    code.
    This is needed to test the 'uint16' code path, since 'uint8' uses a
    separate implementation.

    The sequences are moved to an alphabet of 500 integer symbols while
    keeping their symbol codes, and the substitution matrix is enlarged
    accordingly.
    Returns the two new sequences and the new matrix as tuple.
    """
    new_alph = seq.Alphabet(np.arange(500))

    def with_new_alphabet(sequence):
        # Same symbol codes, but attached to the larger alphabet
        converted = seq.GeneralSequence(new_alph)
        converted.code = sequence.code
        return converted

    # Adjust the substitution matrix as well,
    # so that it is compatible with the new alphabet:
    # the original scores fill the upper left corner, the rest is zero
    orig_scores = matrix.score_matrix()
    orig_len = len(orig_scores)
    n_symbols = len(new_alph)
    enlarged_scores = np.zeros((n_symbols, n_symbols), dtype=np.int32)
    enlarged_scores[:orig_len, :orig_len] = orig_scores

    return (
        with_new_alphabet(seq1),
        with_new_alphabet(seq2),
        align.SubstitutionMatrix(new_alph, new_alph, enlarged_scores),
    )
Exemplo n.º 14
0
def test_custom_sequence_type(app_cls):
    """
    Align two sequences over a non-character alphabet with the given
    application class and compare the result with the expected
    sequences and alignment trace.

    The test is skipped if the program belonging to `app_cls` is not
    installed or if creating the application raises a `VersionError`.
    """
    bin_path = BIN_PATH[app_cls]
    if is_not_installed(bin_path):
        pytest.skip(f"'{bin_path}' is not installed")

    alph = seq.Alphabet(("foo", "bar", 42))
    symbol_lists = [
        ["foo", "bar", 42, "foo", "foo", 42, 42],
        ["foo", 42, "foo", "bar", "foo", 42, 42],
    ]
    sequences = [
        seq.GeneralSequence(alph, symbols) for symbols in symbol_lists
    ]
    # One row per alignment column, -1 marks a gap
    exp_trace = [
        [0, 0], [1, -1], [2, 1], [3, 2], [-1, 3], [4, 4], [5, 5], [6, 6],
    ]
    # Strong identity matrix: 1000 on the diagonal, -1000 elsewhere
    n_symbols = len(alph)
    score_matrix = np.full((n_symbols, n_symbols), -1000.0)
    np.fill_diagonal(score_matrix, 1000.0)
    matrix = align.SubstitutionMatrix(alph, alph, score_matrix)

    try:
        app = app_cls(sequences, matrix=matrix)
    except VersionError:
        pytest.skip("Invalid software version")
    app.start()
    app.join()
    result = app.get_alignment()
    assert result.sequences == sequences
    assert result.trace.tolist() == exp_trace
Exemplo n.º 15
0
def test_matrices(db_entry):
    """
    Check that a substitution matrix for the protein alphabet can be
    constructed from `db_entry`; no further property is checked.
    """
    protein_alphabet = seq.ProteinSequence.alphabet
    align.SubstitutionMatrix(protein_alphabet, protein_alphabet, db_entry)
Exemplo n.º 16
0
import biotite.sequence.phylo as phylo
import biotite.sequence.graphics as graphics

# Obtain BLOSUM62
matrix = align.SubstitutionMatrix.std_protein_matrix()
print(matrix)

########################################################################
# The original *BLOSUM62* contains symbols for ambiguous amino acids and
# the stop signal.
# As these are not actual amino acids, a new substitution matrix is
# created, where these symbols are removed.

# Matrix should not contain ambiguous symbols or stop signal:
# drop the last four symbols of both alphabets and the corresponding
# rows and columns of the score matrix
matrix = align.SubstitutionMatrix(
    seq.Alphabet(matrix.get_alphabet1().get_symbols()[:-4]),
    seq.Alphabet(matrix.get_alphabet2().get_symbols()[:-4]),
    matrix.score_matrix()[:-4, :-4])
similarities = matrix.score_matrix()
print(matrix)


########################################################################
# Now a function must be defined, that converts the similarity depicted
# by a substitution matrix into a distance required by the UPGMA method.
# In this case, the distance is defined as the difference between the
# similarity of the two symbols and the average maximum similarity of
# the symbols to themselves.
#
# Finally the obtained (phylogenetic) tree is plotted as dendrogram.
def get_distance(similarities, i, j):
    s_max = (similarities[i, i] + similarities[j, j]) / 2
Exemplo n.º 17
0
# So much for theory.
# Let's start by showing different ways to construct a
# :class:`SubstitutionMatrix`, in our case for protein sequence
# alignments:

import biotite.sequence as seq
import biotite.sequence.align as align
import numpy as np

alph = seq.ProteinSequence.alphabet
# Load the standard protein substitution matrix, which is BLOSUM62
matrix = align.SubstitutionMatrix.std_protein_matrix()
print("\nBLOSUM62\n")
print(matrix)
# Load another matrix from the internal database
matrix = align.SubstitutionMatrix(alph, alph, "BLOSUM50")
# Load a matrix dictionary representation,
# modify it, and create the SubstitutionMatrix
# (Dictionary could be loaded from matrix string in NCBI format, too)
matrix_dict = align.SubstitutionMatrix.dict_from_db("BLOSUM62")
matrix_dict[("P", "Y")] = 100
matrix = align.SubstitutionMatrix(alph, alph, matrix_dict)
# And now create a matrix by directly providing the ndarray
# containing the similarity scores
# (identity matrix in our case)
scores = np.identity(len(alph), dtype=int)
matrix = align.SubstitutionMatrix(alph, alph, scores)
print("\n\nIdentity matrix\n")
print(matrix)

########################################################################
Exemplo n.º 18
0
def simple_matrix():
    """
    Create a substitution matrix over ``NucleotideSequence.alphabet_unamb``
    that uses a 4 x 4 score array with 1 on the diagonal and -1 for all
    other symbol pairings.
    """
    alph = seq.NucleotideSequence.alphabet_unamb
    # Start with the mismatch score everywhere, then set the diagonal
    scores = np.full((4, 4), -1)
    np.fill_diagonal(scores, 1)
    return align.SubstitutionMatrix(alph, alph, scores)
Exemplo n.º 19
0
def _identity_rule(alphabet):
    """
    Create a `ScoreThresholdRule` with a threshold of 0 for the given
    alphabet.

    The rule is based on a substitution matrix that scores pairings of
    symbols with equal symbol code with 0 and all other pairings with -1.
    """
    n_symbols = len(alphabet)
    # 0 on the diagonal, -1 everywhere else
    scores = np.identity(n_symbols, dtype=int) - 1
    identity_matrix = align.SubstitutionMatrix(alphabet, alphabet, scores)
    return align.ScoreThresholdRule(identity_matrix, 0)
Exemplo n.º 20
0
# finally the sequences are mapped back into the original sequence type.
# Let's show this on the example of a nonsense alphabet.

import numpy as np
import biotite.application.mafft as mafft
import biotite.sequence.align as align

alphabet = seq.Alphabet(("foo", "bar", 42))
sequences = [
    seq.GeneralSequence(alphabet, sequence) for sequence in [
        ["foo", "bar", 42, "foo", "foo", 42, 42],
        ["foo", 42, "foo", "bar", "foo", 42, 42],
    ]
]
# Custom matrix for the custom alphabet:
# 100 for identical symbols, -100 for all other pairings
matrix = align.SubstitutionMatrix(
    alphabet, alphabet,
    np.array([[100, -100, -100], [-100, 100, -100], [-100, -100, 100]]))
alignment = mafft.MafftApp.align(sequences, matrix=matrix)
# As the alphabet does not have characters as symbols,
# the alignment cannot be directly printed.
# However, we can print the trace
print(alignment.trace)

########################################################################
# Secondary structure annotation
# ------------------------------
#
# .. currentmodule:: biotite.application.dssp
#
# Although :mod:`biotite.structure` offers the function
# :func:`annotate_sse()` to assign secondary structure elements based on
Exemplo n.º 21
0
    rmsda = np.sum(
        ((ref_angles[:, np.newaxis] - pb_angles[np.newaxis, :] + 180) % 360 -
         180)**2,
        axis=-1)
    # Choose the PB where the RMSDA to the reference angle is lowest
    # Due to the definition of Biotite symbol codes
    # the index of the chosen PB is directly the symbol code
    pb_seq_code = np.argmin(rmsda, axis=0)
    # Put the array of symbol codes into actual sequence objects
    pb_sequence = seq.GeneralSequence(pb_alphabet)
    pb_sequence.code = pb_seq_code
    pb_seqs.append(pb_sequence)

# Perform a multiple sequence alignment of the PB sequences
matrix_dict = align.SubstitutionMatrix.dict_from_str(matrix_str)
matrix = align.SubstitutionMatrix(pb_alphabet, pb_alphabet, matrix_dict)
alignment, order, _, _ = align.align_multiple(pb_seqs,
                                              matrix,
                                              gap_penalty=(-500, -100),
                                              terminal_penalty=False)

# Visualize the alignment
# Order alignment according to guide tree
alignment = alignment[:, order.tolist()]
labels = [organisms[i] for i in order]
fig = plt.figure(figsize=(8.0, 4.0))
ax = fig.add_subplot(111)
# The color scheme was generated with the 'Gecos' software
graphics.plot_alignment_type_based(ax,
                                   alignment,
                                   labels=labels,