示例#1
0
def test_simple_alignment(gap_penalty, local, band_width):
    """
    Check that `align_banded()` yields the same alignments as
    `align_optimal()` for two short and highly similar sequences.
    """
    substitution_matrix = align.SubstitutionMatrix.std_protein_matrix()
    # Cyclotide C (Uniprot: P86843) and Cyclotide F (Uniprot: P86846)
    cyclotide_c = seq.ProteinSequence("gvpcaescvwipctvtallgcsckdkvcyld")
    cyclotide_f = seq.ProteinSequence("gipcgescvfipcissvvgcsckskvcyld")

    # The reference is stripped of its terminal gaps, so that it
    # becomes a true semi-global alignment,
    # which is what align_banded() returns
    expected = [
        align.remove_terminal_gaps(optimal)
        for optimal in align.align_optimal(
            cyclotide_c, cyclotide_f, substitution_matrix,
            gap_penalty=gap_penalty, local=local, terminal_penalty=False,
        )
    ]

    band = (-band_width, band_width)
    observed = align.align_banded(
        cyclotide_c, cyclotide_f, substitution_matrix, band,
        gap_penalty=gap_penalty, local=local,
    )

    assert len(observed) == len(expected)
    for banded in observed:
        assert banded in expected
示例#2
0
def test_score_threshold_rule(kmer_alphabet, ref_kmer, threshold):
    """
    Compare the similar k-mers reported by :class:`ScoreThresholdRule`
    with the k-mers found by scoring every possible k-mer against the
    reference k-mer.
    """
    substitution_matrix = align.SubstitutionMatrix.std_protein_matrix()

    def to_sequence(kmer_code):
        # Expand the k-mer code into its symbol codes
        kmer_seq = seq.ProteinSequence()
        kmer_seq.code = kmer_alphabet.split(kmer_code)
        return kmer_seq

    reference = to_sequence(ref_kmer)

    # Brute force: keep every k-mer whose ungapped alignment score
    # against the reference reaches the threshold
    expected = {
        candidate
        for candidate in range(len(kmer_alphabet))
        if align.align_ungapped(
            reference, to_sequence(candidate), substitution_matrix,
            score_only=True
        ) >= threshold
    }

    rule = align.ScoreThresholdRule(substitution_matrix, threshold)
    observed = set(rule.similar_kmers(kmer_alphabet, ref_kmer))

    assert observed == expected
示例#3
0
    def get_alignment(cls, seq1: str, seq2: str, local: bool = True):
        """
        Generate an alignment between two sequences

        Parameters
        ----------
        seq1: str
            The first sequence to be aligned
        seq2: str
            The second sequence to be aligned
        local: bool
            If false, a global alignment is performed
            (based on the Needleman-Wunsch algorithm),
            otherwise a local alignment is performed
            (based on the Smith–Waterman algorithm).
            (Default: True)

        Returns
        -------
        Alignment
            An instance of `cls` created from the first optimal
            alignment.
            Its metadata contains the keys ``"score"``,
            ``"sequence_identity"``, ``"symbols"`` and ``"codes"``.
        """

        import biotite.sequence as seq
        import biotite.sequence.align as align

        # create the default matrix
        # TODO add more options for the choice of matrix
        matrix = align.SubstitutionMatrix.std_protein_matrix()

        alignments = align.align_optimal(
            seq.ProteinSequence(seq1),
            seq.ProteinSequence(seq2),
            matrix,
            local=local,
        )

        # Only the first of the returned optimal alignments is used
        alignment = alignments[0]

        score = alignment.score
        seq_identity = align.get_sequence_identity(alignment)
        symbols = align.get_symbols(alignment)
        codes = align.get_codes(alignment)

        return cls(
            alignment=alignment,
            metadata={
                "score": score,
                "sequence_identity": seq_identity,
                "symbols": symbols,
                "codes": codes,
            },
        )
示例#4
0
def test_sequence_conversion():
    """
    Check the conversion between FASTA files and sequence objects
    for nucleotide, protein and invalid input.
    """
    def read_fasta(file_name):
        return fasta.FastaFile.read(
            os.path.join(data_dir("sequence"), file_name)
        )

    nuc_file = read_fasta("nuc.fasta")
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(nuc_file)

    # Round trip: file -> sequences -> new file -> sequences
    original_seqs = fasta.get_sequences(nuc_file)
    rewritten_file = fasta.FastaFile()
    fasta.set_sequences(rewritten_file, original_seqs)
    rewritten_seqs = fasta.get_sequences(rewritten_file)
    # The dictionaries cannot be compared directly, as the original
    # RNA sequence is now guessed as protein sequence
    # -> compare the string representations instead
    for before, after in zip(original_seqs.values(), rewritten_seqs.values()):
        assert str(before) == str(after)

    single_seq_file = fasta.FastaFile()
    fasta.set_sequence(single_seq_file, seq.NucleotideSequence("AACCTTGG"))
    assert single_seq_file["sequence"] == "AACCTTGG"

    prot_file = read_fasta("prot.fasta")
    # The selenocysteine conversion is expected to raise a warning
    with pytest.warns(UserWarning):
        assert seq.ProteinSequence("YAHCGFRTGS") \
               == fasta.get_sequence(prot_file)

    invalid_file = read_fasta("invalid.fasta")
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(invalid_file))
示例#5
0
def test_sequence_conversion():
    """
    Check the conversion between FASTA files and sequence objects
    for nucleotide, protein and invalid input.
    """
    def load_fasta(file_name):
        fasta_file = fasta.FastaFile()
        fasta_file.read(os.path.join(data_dir, file_name))
        return fasta_file

    nuc_file = load_fasta("nuc.fasta")
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(nuc_file)

    # Round trip: file -> sequences -> new file -> sequences
    original_seqs = fasta.get_sequences(nuc_file)
    rewritten_file = fasta.FastaFile()
    fasta.set_sequences(rewritten_file, original_seqs)
    rewritten_seqs = fasta.get_sequences(rewritten_file)
    assert original_seqs == rewritten_seqs

    single_seq_file = fasta.FastaFile()
    fasta.set_sequence(single_seq_file, seq.NucleotideSequence("AACCTTGG"))
    assert single_seq_file["sequence"] == "AACCTTGG"

    prot_file = load_fasta("prot.fasta")
    assert seq.ProteinSequence("YAHGFRTGS") == fasta.get_sequence(prot_file)

    invalid_file = load_fasta("invalid.fasta")
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(invalid_file))
示例#6
0
def sequences():
    """
    10 Cas9 sequences, read from ``cas9.fasta`` in the data directory.
    """
    cas9_file = fasta.FastaFile()
    cas9_file.read(join(data_dir, "cas9.fasta"))
    return list(map(seq.ProteinSequence, cas9_file.values()))
示例#7
0
def test_to_consensus_prot():
    """
    Check the consensus sequence of a profile created from the
    pairwise alignment of avidin and streptavidin.
    """
    avidin = seq.ProteinSequence(
        "MVHATSPLLLLLLLSLALVAPGLSARKCSLTGKWTNDLGSNMTIGAVNSRGEFTGTYITAVTATSNEIKESPLHGTQNTINKRTQP"
        "TFGFTVNWKFSESTTVFTGQCFIDRNGKEVLKTMWLLRSSVNDIGDDWKATRVGINIFTRLRTQKE"
    )
    streptavidin = seq.ProteinSequence(
        "MRKIVVAAIAVSLTTVSITASASADPSKDSKAQVSAAEAGITGTWYNQLGSTFIVTAGADGALTGTYESAVGNAESRYVLTGRYDSA"
        "PATDGSGTALGWTVAWKNNYRNAHSATTWSGQYVGGAEARINTQWLLTSGTTEANAWKSTLVGHDTFTKVKPSAASIDAAKKAGVNN"
        "GNPLDAVQQ"
    )
    expected_consensus = seq.ProteinSequence(
        "MRHIATAAIALSLLLLSITALASADPGKDSKAQLSAAEAGITGKWTNDLGSNFIIGAVGADGAFTGTYESAVGNAESNEIKEGPLD"
        "GAPATDGKGTALGWTFAFKNNWKFAESATTFSGQCFGGADARINGKELLTKGTMEANAWKSTLLGHDSFSKVKDIAADIDAAKKAG"
        "INIFNPLDAQKE"
    )

    substitution_matrix = align.SubstitutionMatrix.std_protein_matrix()
    # Only the first optimal alignment is used for the profile
    optimal_alignments = align.align_optimal(
        avidin, streptavidin, substitution_matrix
    )
    profile = seq.SequenceProfile.from_alignment(optimal_alignments[0])

    assert expected_consensus == profile.to_consensus()
示例#8
0
def _create_random_pair(seed, length=100, max_subsitutions=5,
                        max_insertions=5, max_deletions=5,
                        max_truncations=5):
    """
    Build a pair of related protein sequences from a random seed.

    The pair consists of

        1. a randomly generated sequence
        2. a copy of that sequence carrying a random number of
           substitutions, insertions and deletions.

    Finally, both sequences are truncated independently at both ends.
    """
    np.random.seed(seed)

    original = seq.ProteinSequence()
    original.code = np.random.randint(len(original.alphabet), size=length)
    mutant = original.copy()

    def _random_positions(count):
        # Unique positions in the current state of the mutant
        return np.random.choice(
            np.arange(len(mutant)), size=count, replace=False
        )

    def _random_codes(count):
        return np.random.randint(len(original.alphabet), size=count)

    def _truncated(sequence):
        # The start is drawn before the stop;
        # at least one symbol is always removed from the end
        start = np.random.randint(max_truncations)
        stop = -(1 + np.random.randint(max_truncations))
        return sequence[start:stop]

    # Substitutions: overwrite symbol codes at random positions
    substitution_count = np.random.randint(max_subsitutions)
    positions = _random_positions(substitution_count)
    mutant.code[positions] = _random_codes(substitution_count)

    # Insertions: add new symbol codes at random positions
    insertion_count = np.random.randint(max_insertions)
    positions = _random_positions(insertion_count)
    new_codes = _random_codes(insertion_count)
    mutant.code = np.insert(mutant.code, positions, new_codes)

    # Deletions: remove symbol codes at random positions
    deletion_count = np.random.randint(max_deletions)
    positions = _random_positions(deletion_count)
    mutant.code = np.delete(mutant.code, positions)

    # The original is truncated first, then the mutant
    return _truncated(original), _truncated(mutant)
示例#9
0
def test_get_molecular_weight(monoisotopic, expected_mol_weight_protein):
    """
    Check that the calculated molecular weight of a protein matches
    the expected value within an absolute tolerance of 0.01.
    """
    expected = pytest.approx(expected_mol_weight_protein, abs=1e-2)
    observed = seq.ProteinSequence("ACDEFGHIKLMNPQRSTVW") \
               .get_molecular_weight(monoisotopic=monoisotopic)
    assert observed == expected
示例#10
0
def test_evalue():
    """
    Compare the E-values estimated for given scores with the number
    of random sequence pairs that actually reach these scores in a
    sampling experiment.
    Rather low scores, i.e. rather high E-values, are used to obtain
    a reasonable accuracy.
    """
    TEST_SCORES = [30, 40, 50]
    GAP_PENALTY = (-12, -1)
    N_SAMPLES = 10000
    SEQ_LENGTH = 300

    alphabet = seq.ProteinSequence.alphabet
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    estimator = align.EValueEstimator.from_samples(
        alphabet, matrix, GAP_PENALTY, BACKGROUND
    )

    # Draw the symbol codes for all random sequence pairs at once
    np.random.seed(0)
    random_codes = np.random.choice(
        len(alphabet), size=(N_SAMPLES, 2, SEQ_LENGTH), p=BACKGROUND
    )

    def local_alignment_score(code_pair):
        first = seq.ProteinSequence()
        second = seq.ProteinSequence()
        first.code = code_pair[0]
        second.code = code_pair[1]
        alignments = align.align_optimal(
            first, second, matrix,
            local=True, gap_penalty=GAP_PENALTY, max_number=1
        )
        return alignments[0].score

    sample_scores = np.array(
        [local_alignment_score(code_pair) for code_pair in random_codes],
        dtype=int
    )

    # Estimated vs. observed number of pairs reaching each score
    search_space = SEQ_LENGTH * N_SAMPLES
    e_values = []
    counts = []
    for score in TEST_SCORES:
        e_values.append(
            10**estimator.log_evalue(score, SEQ_LENGTH, search_space)
        )
    for score in TEST_SCORES:
        counts.append(np.count_nonzero(sample_scores >= score))
    assert e_values == pytest.approx(counts, rel=0.5)
示例#11
0
def test_simple_alignment(gap_penalty, seed, threshold, direction, score_only):
    """
    Check that `align_local_gapped()` yields the same result as
    `align_optimal()` for two short and highly similar sequences.
    """
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    # Cyclotide C (Uniprot: P86843) and Cyclotide F (Uniprot: P86846)
    cyclotide_c = seq.ProteinSequence("gvpcaescvwipctvtallgcsckdkvcyld")
    cyclotide_f = seq.ProteinSequence("gipcgescvfipcissvvgcsckskvcyld")

    expected = align.align_optimal(
        cyclotide_c, cyclotide_f, matrix,
        gap_penalty=gap_penalty, local=True
    )
    # If the extension runs only in one direction, the reference
    # alignment is cut at the seed position and rescored
    for reference in expected:
        seed_row = np.where(reference.trace[:, 0] == seed[0])[0][0]
        if direction == "upstream":
            reference.trace = reference.trace[:seed_row + 1]
        elif direction == "downstream":
            reference.trace = reference.trace[seed_row:]
        reference.score = align.score(reference, matrix, gap_penalty)

    test_result = align.align_local_gapped(
        cyclotide_c, cyclotide_f, matrix, seed, threshold,
        gap_penalty, 1000, direction, score_only
    )

    if score_only:
        # The result is a single score;
        # all optimal alignments share the same score
        assert test_result == expected[0].score
        return

    # Otherwise the result is a list of alignments
    assert len(test_result) == len(expected)
    for observed in test_result:
        assert observed in expected
示例#12
0
def sequence_alignment(seq1: str,
                       seq2: str,
                       matrix: str,
                       gap: int,
                       local: bool = False) -> str:
    """
    Compute an optimal pairwise alignment of two protein sequences.

    Parameters
    ----------
    seq1,seq2: str
        The sequences to be aligned

    matrix: str
        Passed to `matrices()` to obtain the substitution matrix
        used for scoring

    gap: int or (tuple, dtype=int)
        A single integer is interpreted as general gap penalty.
        If a tuple is provided, an affine gap penalty is used:
        the first integer is the gap opening penalty,
        the second integer is the gap extension penalty.
        The values need to be negative.

    local : bool, optional, default=False
        Whether to use local alignment (Smith-Waterman) or global
        alignment (Needleman-Wunsch)

    Returns
    -------
    alignment
        The first of the optimal alignments returned by
        `align_optimal()`
    """

    substitution_matrix = matrices(matrix)
    first = seq.ProteinSequence(seq1)
    second = seq.ProteinSequence(seq2)
    optimal_alignments = seq_align.align_optimal(
        first, second, substitution_matrix, gap_penalty=gap, local=local
    )
    return optimal_alignments[0]
示例#13
0
def test_identity():
    """
    Check the sequence identity of a known alignment for each
    supported mode.
    """
    gapped_strings = [
        "--HAKLPRDD--WL--",
        "FRHA--QRTDADWLHH",
    ]
    ungapped_sequences = [
        seq.ProteinSequence(gapped.replace("-", ""))
        for gapped in gapped_strings
    ]
    trace = align.Alignment.trace_from_strings(gapped_strings)
    alignment = align.Alignment(ungapped_sequences, trace, score=None)

    # Expected identity for each mode
    expected_identity = {
        "all": 6 / 16,
        "not_terminal": 6 / 12,
        "shortest": 6 / 10,
    }
    for mode, expected in expected_identity.items():
        assert align.get_sequence_identity(alignment, mode=mode) == expected
示例#14
0
def create_consensus(sequences):
    """
    Create the consensus of equally long protein sequences.

    At each position the consensus contains the symbol code that
    occurs most frequently among the given sequences.
    The number of positions is taken from the first sequence.
    """
    def most_frequent_code(position):
        # Occurrences of each symbol code at this position
        occurrences = np.bincount(
            [sequence.code[position] for sequence in sequences]
        )
        return np.argmax(occurrences)

    n_positions = len(sequences[0])
    # An empty ProteinSequence object is filled with the consensus code
    consensus = seq.ProteinSequence()
    consensus.code = np.array(
        [most_frequent_code(position) for position in range(n_positions)],
        dtype=int
    )
    return consensus
示例#15
0
def test_conversion_to_symbols():
    """
    Check that an alignment created from gapped strings is converted
    back into the same strings.
    """
    gapped_strings = [
        "HAKLPRDD--WKL--",
        "HA--PRDDADWKLHH",
        "HA----DDADWKLHH",
    ]
    ungapped_sequences = [
        seq.ProteinSequence(gapped.replace("-", ""))
        for gapped in gapped_strings
    ]
    trace = align.Alignment.trace_from_strings(gapped_strings)
    alignment = align.Alignment(ungapped_sequences, trace, score=None)

    def to_gapped_string(symbol_list):
        # A missing symbol (None) is written as gap character
        return "".join(
            "-" if symbol is None else symbol for symbol in symbol_list
        )

    # Convert the alignment back to strings of symbols
    rendered = [
        to_gapped_string(symbol_list)
        for symbol_list in align.get_symbols(alignment)
    ]
    assert rendered == gapped_strings
示例#16
0
def test_protein(use_custom_matrix):
    """
    Check the repeat mask of a protein sequence against a known
    example, where the expected masked region is written in lower
    case.
    """
    seq_string = "MAPKINASekinasekinase"
    # Lower case characters mark the positions expected to be masked
    expected_mask = [char.islower() for char in seq_string]

    matrix = (
        align.SubstitutionMatrix.std_protein_matrix()
        if use_custom_matrix else None
    )
    observed_mask = TantanApp.mask_repeats(
        seq.ProteinSequence(seq_string), matrix
    )

    assert len(observed_mask) == len(expected_mask)
    assert np.all(observed_mask.tolist() == expected_mask)
示例#17
0
def test_create_spaced_kmers(kmer_alphabet, spaced_kmer_alphabet, seed):
    """
    Test :meth:`create_kmers()` for creating spaced *k-mers*.
    The *k-mers* of a random sequence are compared between
    `spaced_kmer_alphabet` and `kmer_alphabet`; the test expects the
    spacing model to be equivalent to non-spaced *k-mers*.
    """
    MIN_LENGTH = 10
    MAX_LENGTH = 1000

    np.random.seed(seed)
    sequence = seq.ProteinSequence()
    # The sequence length is drawn first, then the symbol codes
    seq_length = np.random.randint(MIN_LENGTH, MAX_LENGTH)
    sequence.code = np.random.randint(len(sequence.alphabet), size=seq_length)

    expected = kmer_alphabet.create_kmers(sequence.code)
    observed = spaced_kmer_alphabet.create_kmers(sequence.code)

    assert len(observed) == len(expected)
    assert observed.tolist() == expected.tolist()
示例#18
0
    mol = info.residue(residue)
    # First sweep: 60 angles from -30 to 30, followed by 30 angles from
    # 30 back to 0 (90 frames, indices 0..89).
    # The angles are given in degrees and converted to radians below.
    thetas = np.linspace(-30, 30, 60)
    thetas = np.append(thetas, np.linspace(30, 0, 30))
    for i, theta in enumerate(thetas):
        # NOTE(review): the meaning of the second argument (0 here,
        # 1 in the loop below) is defined by rotate_residue() - confirm
        mol_new = rotate_residue(mol, 0, theta * np.pi / 180)
        plot(mol_new, save_as=f"./plots/res_flex/{i}.png", show=False)

    # Second sweep: 60 angles from 0 to 30, followed by 30 angles from
    # 30 to -30
    thetas = np.linspace(0, 30, 60)
    thetas = np.append(thetas, np.linspace(30, -30, 30))
    for j, theta in enumerate(thetas):
        mol_new = rotate_residue(mol, 1, theta * np.pi / 180)
        # NOTE(review): `i` still holds the last index of the first loop
        # (89), so the first frame of this loop (j == 0) is saved to
        # the same path as the last frame of the first loop (89.png).
        # An offset of `i + 1 + j` would avoid this - confirm intent.
        plot(mol_new, save_as=f"./plots/res_flex/{i+j}.png", show=False)


# --- create directory ---
pth = "./data/psi4files/peptides"
# exist_ok avoids the race between checking for and creating the directory
os.makedirs(pth, exist_ok=True)

# For each peptide length, assemble a peptide consisting only of "T"
# residues and write the string returned by mkpsi4() into a text file
# named after the sequence
n_res = [5, 10, 20, 30, 50, 100]
for n in n_res:
    seq_str = "T" * n
    sequence = seq.ProteinSequence(seq_str)
    peptide = assemble_peptide(sequence)
    pep_string = mkpsi4(peptide)
    # The context manager closes the file even if writing fails
    with open(f"{pth}/{seq_str}.txt", "w") as f:
        f.write(pep_string)
示例#19
0
            # Gene is surrounded by square brackets
            gene = line[gene_start : gene_end+1] \
                   .replace("[","").replace("]","")
            # Sometimes alternative gene names are separated via a
            # semicolon -> Choose the first gene name
            gene = gene.split(";")[0].strip()
            genes.append(gene)
            ids.append(ncbi_id)

# Download the sequences as a file-like object and read the sequences
# from it
fasta_file = fasta.FastaFile.read(
    entrez.fetch_single_file(ids,
                             file_name=None,
                             db_name="protein",
                             ret_type="fasta"))
sequences = [seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()]
# Create multiple sequence alignment with Clustal Omega
alignment = clustalo.ClustalOmegaApp.align(sequences)

# The distance measure required for the tree calculation is the
# percentage of non-identical amino acids in the respective two
# sequences
distances = 1 - align.get_pairwise_sequence_identity(alignment,
                                                     mode="shortest")
# Create tree via neighbor joining
tree = phylo.neighbor_joining(distances)
# Convert to NetworkX graph
# For the graph visualization, the edge directions are unnecessary
graph = tree.as_graph().to_undirected()

fig = plt.figure(figsize=(8.0, 8.0))
示例#20
0
import matplotlib.pyplot as plt
import biotite
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics

# Download and parse protein sequences of avidin and streptavidin
file_name = entrez.fetch_single_file(["CAC34569", "ACL82594"],
                                     biotite.temp_file("sequences.fasta"),
                                     "protein", "fasta")
file = fasta.FastaFile.read(file_name)
# Assign each sequence based on the accession contained in its header
for name, sequence in file.items():
    if "CAC34569" in name:
        avidin_seq = seq.ProteinSequence(sequence)
    elif "ACL82594" in name:
        streptavidin_seq = seq.ProteinSequence(sequence)
# Get BLOSUM62 matrix
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
alignments = align.align_optimal(avidin_seq,
                                 streptavidin_seq,
                                 matrix,
                                 gap_penalty=(-10, -1),
                                 terminal_penalty=False)
# Draw first and only alignment
# The color intensity indicates the similarity
fig = plt.figure(figsize=(8.0, 2.5))
ax = fig.add_subplot(111)
示例#21
0
import biotite.database.entrez as entrez

# Search for protein products of luxA gene in UniProtKB/Swiss-Prot database
query =   entrez.SimpleQuery("luxA", "Gene Name") \
        & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
uids = entrez.search(query, db_name="protein")
fasta_file = fasta.FastaFile.read(
    entrez.fetch_single_file(uids, None, db_name="protein", ret_type="fasta"))

ids = []
sequences = []
for header, seq_str in fasta_file.items():
    # Extract the UniProt Entry name from header
    identifier = header.split("|")[-1].split()[0]
    ids.append(identifier)
    sequences.append(seq.ProteinSequence(seq_str))

# Create a multiple sequence alignment with affine gap penalty
# and without penalizing terminal gaps
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, tree, distances = align.align_multiple(
    sequences, matrix, gap_penalty=(-10, -1), terminal_penalty=False)
# Order alignment according to the guide tree
alignment = alignment[:, order]
# The labels need to be in the same order as the sequences
ids = [ids[i] for i in order]

fig = plt.figure(figsize=(8.0, 20.0))
ax = fig.add_subplot(111)
graphics.plot_alignment_type_based(ax,
                                   alignment,
                                   labels=ids,
                                   show_numbers=True,
                                   spacing=2.0)
示例#22
0
# :func:`NucleotideSequence.complement()` method.

# Lower case characters are automatically capitalized
seq1 = seq.NucleotideSequence("tacagtt")
print("Original:", seq1)
seq2 = seq1.reverse().complement()
print("Reverse complement:", seq2)

########################################################################
# The other :class:`Sequence` type is :class:`ProteinSequence`.
# It supports the letters for the 20 standard amino acids plus some
# letters for ambiguous amino acids and a letter for a stop signal.
# Furthermore, this class provides some utilities like
# 3-letter to 1-letter translation (and vice versa).

prot_seq = seq.ProteinSequence("BIQTITE")
# Convert each 1-letter symbol into its 3-letter code
# and join the codes with dashes
print("-".join(
    [seq.ProteinSequence.convert_letter_1to3(symbol) for symbol in prot_seq]))

########################################################################
# A :class:`NucleotideSequence` can be translated into a
# :class:`ProteinSequence` via the
# :func:`NucleotideSequence.translate()` method.
# By default, the method searches for open reading frames (ORFs) in the
# 3 frames of the sequence.
# A 6 frame ORF search requires an
# additional call of :func:`NucleotideSequence.translate()` with the
# reverse complement of the sequence.
# If you want to conduct a complete translation of the sequence,
# irrespective of any start and stop codons, set the parameter
# :obj:`complete` to true.
示例#23
0
def sequences():
    """
    Create a small list of protein sequences with different lengths.
    """
    strings = ("BIQTITE", "TITANITE", "BISMITE", "IQLITE")
    return list(map(seq.ProteinSequence, strings))
示例#24
0
def test_stop_removal():
    """
    Check that `remove_stops()` removes all stop symbols from a
    protein sequence.
    """
    with_stops = "LYG*GR*"
    expected = with_stops.replace("*", "")
    without_stops = seq.ProteinSequence(with_stops).remove_stops()
    assert str(without_stops) == expected
示例#25
0
# :class:`Application` classes in depth.
#
# Finding homologous sequences with BLAST
# ---------------------------------------
#
# .. currentmodule:: biotite.application.blast
#
# The :mod:`biotite.application.blast` subpackage provides an
# interface to NCBI BLAST: the :class:`BlastWebApp` class.
# Let's dive directly into the code and try to find
# homologous sequences to the miniprotein *TC5b*:

import biotite.application.blast as blast
import biotite.sequence as seq

tc5b_seq = seq.ProteinSequence("NLYIQWLKDGGPSSGRPPPS")
# Set up a 'blastp' search with the TC5b sequence as query
app = blast.BlastWebApp("blastp", tc5b_seq)
app.start()
app.join()
alignments = app.get_alignments()
# Report the first alignment of the result list
best_ali = alignments[0]
print(best_ali)
print()
print("HSP position in query: ", best_ali.query_interval)
print("HSP position in hit: ", best_ali.hit_interval)
print("Score: ", best_ali.score)
print("E-value: ", best_ali.e_value)
print("Hit UID: ", best_ali.hit_id)
print("Hit name: ", best_ali.hit_definition)

########################################################################
示例#26
0
    "R": -4.5
}

# Look for the Swiss-Prot entry containing the human HCN1 channel
query =   entrez.SimpleQuery("HCN1", "Gene Name") \
        & entrez.SimpleQuery("homo sapiens", "Organism") \
        & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
uids = entrez.search(query, db_name="protein")
# Download the first search hit in GenPept ('gp') format
file_name = entrez.fetch(uids[0],
                         biotite.temp_dir(),
                         "gp",
                         db_name="protein",
                         ret_type="gp")

gp_file = gb.GenBankFile.read(file_name)
hcn1 = seq.ProteinSequence(gb.get_sequence(gp_file, format="gp"))
print(hcn1)

########################################################################
# The positional hydropathy is calculated and smoothened using
# a moving average for clearer visualization.

hydropathies = np.array([hydropathy_dict[symbol] for symbol in hcn1])


def moving_average(data_set, window_size):
    """
    Smooth `data_set` with an unweighted moving average.

    Only windows that lie completely inside the data are evaluated,
    so the result has ``len(data_set) - window_size + 1`` elements.
    """
    # Each value in the window contributes equally
    uniform_kernel = np.full(window_size, 1 / window_size)
    smoothed = np.convolve(data_set, uniform_kernel, mode="valid")
    return smoothed


# Apply moving average over 15 amino acids for clearer visualization
示例#27
0
# :func:`NucleotideSequence.complement()` method.

# Lower case characters are automatically capitalized
seq1 = seq.NucleotideSequence("tacagtt")
print("Original:", seq1)
seq2 = seq1.reverse().complement()
print("Reverse complement:", seq2)

########################################################################
# The other :class:`Sequence` type is :class:`ProteinSequence`.
# It supports the letters for the 20 standard amino acids plus some
# letters for ambiguous amino acids and a letter for a stop signal.
# Furthermore, this class provides some utilities like
# 3-letter to 1-letter translation (and vice versa).

prot_seq = seq.ProteinSequence("BIQTITE")
# Convert each 1-letter symbol into its 3-letter code
# and join the codes with dashes
print("-".join(
    [seq.ProteinSequence.convert_letter_1to3(symbol) for symbol in prot_seq]))

########################################################################
# A :class:`NucleotideSequence` can be translated into a
# :class:`ProteinSequence` via the
# :func:`NucleotideSequence.translate()` method.
# By default, the method searches for open reading frames (ORFs) in the
# 3 frames of the sequence.
# A 6-frame ORF search requires an
# additional call of :func:`NucleotideSequence.translate()` with the
# reverse complement of the sequence.
# If you want to conduct a complete 1-frame translation of the sequence,
# irrespective of any start and stop codons, set the parameter
# :obj:`complete` to true.
    annotation = gb.get_annotation(gb_file)
    
    # Find ID of strain in 'source' feature
    strain = None
    for feature in annotation:
        if feature.key == "source":
            strain = int(feature.qual["strain"])
    assert strain is not None
    
    # Find corresponding protein sequence in 'CDS' feature
    sequence = None
    for feature in annotation:
        if feature.key == "CDS":
            sequence = seq.ProteinSequence(
                # Remove whitespace in sequence
                # resulting from line breaks
                feature.qual["translation"].replace(" ", "")
            )
    assert sequence is not None

    sequences[strain] = sequence


# None of the THCA synthase variants has an insertion or deletion
# -> each one should have the same sequence length
# The length of the first sequence serves as reference
seq_len = len(list(sequences.values())[0])
for sequence in sequences.values():
    assert len(sequence) == seq_len

# Create consensus sequences for the drug-type and fiber-type cannabis
# strains
示例#29
0
disulfide_bonds = detect_disulfide_bonds(knottin)
# Print the two atoms at the detected indices of each bond
for sg1_index, sg2_index in disulfide_bonds:
    print(knottin[sg1_index])
    print(knottin[sg2_index])
    print()

########################################################################
# The found disulfide bonds are visualized with the help of
# *Matplotlib*:
# The amino acid sequence is written on the X-axis and the disulfide
# bonds are depicted by yellow semi-ellipses.

# Create a sequence object from the residues in the structure
# As we want each residue to appear only once in the sequence,
# we choose an atom that appears in each residue once: the CA
sequence = seq.ProteinSequence(knottin.res_name[knottin.atom_name == "CA"])
figure = plt.figure(figsize=(4.0, 1.0))
ax = figure.gca()
MARGIN = 0.2
# The sequence positions on the X-axis start at 1
ax.set_xlim(1-MARGIN, len(sequence)+MARGIN)
ax.set_ylim(0, 1+MARGIN)
# One tick per sequence position,
# labeled with the corresponding character of the sequence string
ax.set_xticks(np.arange(1, len(sequence)+1))
ax.set_xticklabels(str(sequence))
# Hide ticks and labels of the Y-axis
ax.yaxis.set_tick_params(
    left=False, right=False, labelleft=False, labelright=False
)
# Show only the labels on the bottom X-axis; the tick width is zero
ax.xaxis.set_tick_params(
    bottom=True, top=False, labelbottom=True, labeltop=False, width=0
)
ax.set_frame_on(False)
for sg1_index, sg2_index in disulfide_bonds:
示例#30
0
import biotite.sequence as seq
import biotite.application.blast as blast
import numpy as np
from requests.exceptions import ConnectionError
import pytest
import os.path
from ..util import data_dir, cannot_connect_to

# URL that is passed to `cannot_connect_to()` in order to skip the
# tests if NCBI BLAST is not available
BLAST_URL = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"

# Start of E. coli lacZ ORF (UID: AJ308295)
dna_seq = seq.NucleotideSequence("ATGACCATGATTACGCCAAGCTTTCCGGGGAATTCA")

# Start of E. coli lacZ, translated dna_seq (UID: AJ308295)
prot_seq = seq.ProteinSequence("MTMITPSFPGNS")


@pytest.mark.skipif(cannot_connect_to(BLAST_URL),
                    reason="NCBI BLAST is not available")
def test_blastn():
    """
    Run a 'blastn' search with `dna_seq` as query and check that both
    sequences of the first alignment are equal to the query.
    """
    app = blast.BlastWebApp("blastn", dna_seq, obey_rules=False)
    app.set_max_expect_value(100)
    app.start()
    # Give up if the search does not finish within the timeout
    app.join(timeout=300)
    alignments = app.get_alignments()
    # BLAST should find original sequence as best hit
    assert dna_seq == alignments[0].sequences[0]
    assert dna_seq == alignments[0].sequences[1]