示例#1
0
def parse_alphabet(alphabet_str):
    """
    Build a :class:`LetterAlphabet` from a command line string.

    Parameters
    ----------
    alphabet_str : str or None
        The symbols of the alphabet, given as one contiguous string.
        If ``None``, the first 20 symbols of the protein sequence
        alphabet are used instead.

    Returns
    -------
    alphabet : LetterAlphabet
        The alphabet containing the requested symbols.

    Raises
    ------
    InputError
        If the string contains a space character or if the
        alphabet cannot be created from the string.
    """
    if alphabet_str is None:
        # Fall back to the leading 20 symbols of the protein alphabet
        default_symbols = seq.ProteinSequence.alphabet.get_symbols()[:20]
        return seq.LetterAlphabet(default_symbols)

    if " " in alphabet_str:
        raise InputError("Alphabet may not contain whitespaces")

    try:
        parsed_alphabet = seq.LetterAlphabet(alphabet_str)
    except Exception:
        # Any construction failure is reported as an input problem
        raise InputError("Invalid alphabet")
    return parsed_alphabet
示例#2
0
def test_encoding(alphabet_symbols, symbols, exp_code, use_letter_alphabet):
    """
    Check that symbols are encoded into the expected codes.

    A single symbol is checked via ``encode()``, multiple symbols are
    checked via ``encode_multiple()``.
    Either a ``LetterAlphabet`` or a plain ``Alphabet`` is tested,
    depending on `use_letter_alphabet`.
    """
    alphabet_class = (
        seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    )
    alphabet = alphabet_class(alphabet_symbols)

    if len(symbols) != 1:
        encoded = alphabet.encode_multiple(symbols)
        assert list(encoded) == list(exp_code)
        return

    assert alphabet.encode(symbols[0]) == exp_code[0]
示例#3
0
def test_decoding(alphabet_symbols, exp_symbols, code, use_letter_alphabet):
    """
    Check that codes are decoded into the expected symbols.

    The given code is converted into an ``uint8`` array first.
    A single code is checked via ``decode()``, multiple codes are
    checked via ``decode_multiple()``.
    Either a ``LetterAlphabet`` or a plain ``Alphabet`` is tested,
    depending on `use_letter_alphabet`.
    """
    alphabet_class = (
        seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    )
    alphabet = alphabet_class(alphabet_symbols)

    code_array = np.array(code, dtype=np.uint8)
    if len(code_array) != 1:
        decoded = alphabet.decode_multiple(code_array)
        assert list(decoded) == list(exp_symbols)
        return

    assert alphabet.decode(code_array[0]) == exp_symbols[0]
示例#4
0
def plot_pb_scheme_alignment():
    """
    Generate a color scheme for the 16-letter alphabet ``a``-``p`` from
    a substitution matrix and use it to plot an example alignment.

    The substitution matrix is written to a temporary file and passed
    to ``gecli.main()``, which writes the resulting scheme into a
    temporary JSON file.
    The colors of this scheme are then used to plot the alignment read
    from ``PB_EXAMPLE_FILE_NAME``.

    Returns
    -------
    fig : Figure
        The *Matplotlib* figure containing the alignment plot.
    """
    # Fixed seed; presumably makes the scheme generation reproducible
    # -- TODO confirm that 'gecli' draws from the 'random' module
    random.seed(1)
    scheme_file = biotite.temp_file("json")
    mat_file = biotite.temp_file("mat")
    with open(mat_file, "w") as file:
        # PB substitution matrix, adapted from PBxplore
        file.write("""
                a     b     c     d     e     f     g     h     i     j     k     l     m     n     o     p
            a  516   -59   113  -105  -411  -177   -27  -361    47  -103  -644  -259  -599  -372  -124   -83
            b  -59   541  -146  -210  -155  -310   -97    90   182  -128   -30    29  -745  -242  -165    22
            c  113  -146   360   -14  -333  -240    49  -438  -269  -282  -688  -682  -608  -455  -147     6
            d -105  -210   -14   221     5  -131  -349  -278  -253  -173  -585  -670 -1573 -1048  -691  -497
            e -411  -155  -333     5   520   185   186   138  -378   -70  -112  -514 -1136  -469  -617  -632
            f -177  -310  -240  -131   185   459   -99   -45  -445    83  -214   -88  -547  -629  -406  -552
            g  -27   -97    49  -349   186   -99   665   -99   -89  -118  -409  -138  -124   172   128   254
            h -361    90  -438  -278   138   -45   -99   632  -205   316   192  -108  -712  -359    95  -399
            i   47   182  -269  -253  -378  -445   -89  -205   696   186     8    15  -709  -269  -169   226
            j -103  -128  -282  -173   -70    83  -118   316   186   768   196     5  -398  -340  -117  -104
            k -644   -30  -688  -585  -112  -214  -409   192     8   196   568   -65  -270  -231  -471  -382
            l -259    29  -682  -670  -514   -88  -138  -108    15     5   -65   533  -131     8   -11  -316
            m -599  -745  -608 -1573 -1136  -547  -124  -712  -709  -398  -270  -131   241    -4  -190  -155
            n -372  -242  -455 -1048  -469  -629   172  -359  -269  -340  -231     8    -4   703    88   146
            o -124  -165  -147  -691  -617  -406   128    95  -169  -117  -471   -11  -190    88   716    58
            p  -83    22     6  -497  -632  -552   254  -399   226  -104  -382  -316  -155   146    58   609
            """)
    # Run the command line interface programmatically;
    # '-f' receives the path of the scheme file that is loaded below
    gecli.main(args=[
        "--alphabet", "abcdefghijklmnop", "--matrix", mat_file, "--contrast",
        "300", "--lmin", "65", "--lmax", "70", "-f", scheme_file
    ])

    # Only the colors of the generated scheme are required for plotting
    colors = graphics.load_color_scheme(scheme_file)["colors"]
    fig = plt.figure(figsize=(8.0, 5.0))
    ax = fig.gca()

    pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop")
    fasta_file = fasta.FastaFile()
    fasta_file.read(PB_EXAMPLE_FILE_NAME)
    seq_strings = list(fasta_file.values())
    # The sequences themselves are created without the '-' characters
    # (presumably the gap symbols of the aligned strings),
    # while the trace is derived from the unmodified strings
    sequences = [
        seq.GeneralSequence(pb_alphabet, seq_str.replace("-", ""))
        for seq_str in seq_strings
    ]
    trace = align.Alignment.trace_from_strings(seq_strings)
    alignment = align.Alignment(sequences, trace, score=None)

    graphics.plot_alignment_type_based(ax,
                                       alignment,
                                       symbols_per_line=60,
                                       spacing=2,
                                       color_scheme=colors)

    fig.tight_layout()
    return fig
示例#5
0
def test_input_types(alphabet_symbols, symbols):
    """
    A 'LetterAlphabet' treats the various iterable types it may be
    given differently.
    Encode and decode the input and assert that the original symbols
    are restored for each of these types.
    """
    alphabet = seq.LetterAlphabet(alphabet_symbols)
    restored_symbols = alphabet.decode_multiple(
        alphabet.encode_multiple(symbols)
    )

    # Bring the input into a list of 'str' symbols for comparison
    if isinstance(symbols, bytes):
        symbols = symbols.decode("ASCII")
    expected_symbols = []
    for item in symbols:
        if isinstance(item, bytes):
            item = item.decode("ASCII")
        expected_symbols.append(item)

    assert list(restored_symbols) == expected_symbols
示例#6
0
def test_match_table(use_similarity_rule):
    """
    Check :meth:`match_table()` against a hand-curated set of matches.

    The optional similarity rule is an identity rule, i.e. a k-mer is
    only similar to itself.
    Hence, the expected matches are the same with and without the rule.
    """
    K = 4
    alphabet = seq.LetterAlphabet(string.ascii_lowercase + "_")
    long_phrase = (
        "how_much_wood_would_a_woodchuck_chuck_if_a_woodchuck_could_"
        "chuck_wood"
    )
    short_phrase = "woodchuck"
    long_sequence = seq.GeneralSequence(alphabet, long_phrase)
    short_sequence = seq.GeneralSequence(alphabet, short_phrase)

    if use_similarity_rule:
        rule = _identity_rule(alphabet)
    else:
        rule = None

    long_table = align.KmerTable.from_sequences(K, [long_sequence])
    short_table = align.KmerTable.from_sequences(K, [short_sequence])

    # Pairs of (position in short phrase, position in long phrase)
    expected_matches = {
        (0,  9),
        (0, 22),
        (1, 23),
        (2, 24),
        (3, 25),
        (4, 26),
        (5, 27),
        (4, 32),
        (5, 33),
        (0, 43),
        (1, 44),
        (2, 45),
        (3, 46),
        (4, 47),
        (5, 48),
        (4, 59),
        (5, 60),
        (0, 65),
    }

    match_array = long_table.match_table(short_table, similarity_rule=rule)
    # Only the position columns are compared,
    # the reference indices are irrelevant for this test
    position_pairs = match_array[:, [1,3]]
    found_matches = {tuple(pair) for pair in position_pairs}
    assert found_matches == expected_matches
示例#7
0
def test_load_color_scheme(scheme_path):
    """
    Load the color scheme at the given path and check its integrity:
    The scheme must belong to one of the supported alphabets, provide
    one color entry per symbol, and each color that is not ``None``
    must be convertible into an RGB tuple by *Matplotlib*.
    """
    from matplotlib.colors import to_rgb
    import biotite.sequence.graphics as graphics

    nucleotide_alphabet = seq.NucleotideSequence.alphabet_amb
    protein_alphabet = seq.ProteinSequence.alphabet
    # Protein block alphabet
    pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop")
    known_alphabets = [nucleotide_alphabet, protein_alphabet, pb_alphabet]

    scheme = graphics.load_color_scheme(scheme_path)

    assert scheme["alphabet"] in known_alphabets
    assert len(scheme["colors"]) == len(scheme["alphabet"])
    for entry in scheme["colors"]:
        if entry is None:
            continue
        # An invalid color would raise an error here
        to_rgb(entry)
示例#8
0
def test_error(alphabet_symbols, use_letter_alphabet, is_single_val):
    """
    Check that an ``AlphabetError`` is raised when encoding the values
    ``"G"`` and ``42`` or when decoding the codes
    ``len(alphabet_symbols)`` and ``-1``.

    Depending on `is_single_val`, either the single value methods or
    the ``*_multiple()`` methods are tested.
    """
    alphabet_class = (
        seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    )
    alph = alphabet_class(alphabet_symbols)

    # The calls are wrapped into functions, so each one is only
    # evaluated within its own 'pytest.raises()' context
    if is_single_val:
        invalid_calls = [
            lambda: alph.encode("G"),
            lambda: alph.encode(42),
            lambda: alph.decode(len(alphabet_symbols)),
            lambda: alph.decode(-1),
        ]
    else:
        invalid_calls = [
            lambda: alph.encode_multiple("G"),
            lambda: alph.encode_multiple([42]),
            lambda: alph.decode_multiple(np.array([len(alphabet_symbols)])),
            lambda: alph.decode_multiple(np.array([-1])),
        ]

    for invalid_call in invalid_calls:
        with pytest.raises(seq.AlphabetError):
            invalid_call()
示例#9
0
# Code source: Patrick Kunzmann
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
import biotite
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.structure as struc
import biotite.structure.io.mmtf as mmtf
import biotite.database.rcsb as rcsb

# Protein blocks (PB) alphabet: the 16 lowercase letters 'a' to 'p'
pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop")

# PB substitution matrix, adapted from PBxplore
matrix_str = """
    a     b     c     d     e     f     g     h     i     j     k     l     m     n     o     p 
a  516   -59   113  -105  -411  -177   -27  -361    47  -103  -644  -259  -599  -372  -124   -83
b  -59   541  -146  -210  -155  -310   -97    90   182  -128   -30    29  -745  -242  -165    22
c  113  -146   360   -14  -333  -240    49  -438  -269  -282  -688  -682  -608  -455  -147     6  
d -105  -210   -14   221     5  -131  -349  -278  -253  -173  -585  -670 -1573 -1048  -691  -497
e -411  -155  -333     5   520   185   186   138  -378   -70  -112  -514 -1136  -469  -617  -632
f -177  -310  -240  -131   185   459   -99   -45  -445    83  -214   -88  -547  -629  -406  -552
g  -27   -97    49  -349   186   -99   665   -99   -89  -118  -409  -138  -124   172   128   254
h -361    90  -438  -278   138   -45   -99   632  -205   316   192  -108  -712  -359    95  -399
i   47   182  -269  -253  -378  -445   -89  -205   696   186     8    15  -709  -269  -169   226
j -103  -128  -282  -173   -70    83  -118   316   186   768   196     5  -398  -340  -117  -104
k -644   -30  -688  -585  -112  -214  -409   192     8   196   568   -65  -270  -231  -471  -382
示例#10
0
def test_contains(alphabet_symbols, use_letter_alphabet):
    """
    Check that the ``in`` operator reports the symbol ``"D"`` as part
    of the alphabet, for either a ``LetterAlphabet`` or a plain
    ``Alphabet``.
    """
    alphabet_class = (
        seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    )
    alphabet = alphabet_class(alphabet_symbols)
    assert "D" in alphabet
示例#11
0
def test_length(alphabet_symbols, use_letter_alphabet):
    """
    Check that the length of an alphabet equals the number of symbols
    it was created from, for either a ``LetterAlphabet`` or a plain
    ``Alphabet``.
    """
    alphabet_class = (
        seq.LetterAlphabet if use_letter_alphabet else seq.Alphabet
    )
    alphabet = alphabet_class(alphabet_symbols)
    expected_length = len(alphabet_symbols)
    assert len(alphabet) == expected_length