Exemplo n.º 1
0
def test_sequence_conversion():
    path = os.path.join(data_dir("sequence"), "nuc.fasta")
    file = fasta.FastaFile.read(path)
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(file)

    seq_dict = fasta.get_sequences(file)
    file2 = fasta.FastaFile()
    fasta.set_sequences(file2, seq_dict)
    seq_dict2 = fasta.get_sequences(file2)
    # Cannot compare dicts directly, since the original RNA sequence is
    # now guessed as protein sequence
    for seq1, seq2 in zip(seq_dict.values(), seq_dict2.values()):
        assert str(seq1) == str(seq2)

    file3 = fasta.FastaFile()
    fasta.set_sequence(file3, seq.NucleotideSequence("AACCTTGG"))
    assert file3["sequence"] == "AACCTTGG"

    path = os.path.join(data_dir("sequence"), "prot.fasta")
    file4 = fasta.FastaFile.read(path)
    # Expect a warning for selenocysteine conversion
    with pytest.warns(UserWarning):
        assert seq.ProteinSequence("YAHCGFRTGS") == fasta.get_sequence(file4)

    path = os.path.join(data_dir("sequence"), "invalid.fasta")
    file5 = fasta.FastaFile.read(path)
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(file5))
Exemplo n.º 2
0
def test_sequence_conversion():
    path = os.path.join(data_dir, "nuc.fasta")
    file = fasta.FastaFile()
    file.read(path)
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(file)

    seq_dict = fasta.get_sequences(file)
    file2 = fasta.FastaFile()
    fasta.set_sequences(file2, seq_dict)
    seq_dict2 = fasta.get_sequences(file2)
    assert seq_dict == seq_dict2

    file3 = fasta.FastaFile()
    fasta.set_sequence(file3, seq.NucleotideSequence("AACCTTGG"))
    assert file3["sequence"] == "AACCTTGG"

    path = os.path.join(data_dir, "prot.fasta")
    file4 = fasta.FastaFile()
    file4.read(path)
    assert seq.ProteinSequence("YAHGFRTGS") == fasta.get_sequence(file4)

    path = os.path.join(data_dir, "invalid.fasta")
    file5 = fasta.FastaFile()
    file5.read(path)
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(file5))
Exemplo n.º 3
0
def test_fetch(as_file_like):
    path = None if as_file_like else tempfile.gettempdir()

    # UniProtKB
    file = uniprot.fetch(
        "P12345", "fasta", path, overwrite=True
    )
    fasta_file = fasta.FastaFile.read(file)
    prot_seq = fasta.get_sequence(fasta_file)
    assert len(prot_seq) == 430

    # UniRef
    file = uniprot.fetch(
        "UniRef90_P99999", "fasta", path, overwrite=True
    )
    fasta_file = fasta.FastaFile.read(file)
    prot_seq = fasta.get_sequence(fasta_file)
    assert len(prot_seq) == 105

    # UniParc
    file = uniprot.fetch(
        "UPI000000001F", "fasta", path, overwrite=True
    )
    fasta_file = fasta.FastaFile.read(file)
    prot_seq = fasta.get_sequence(fasta_file)
    assert len(prot_seq) == 551
Exemplo n.º 4
0
def test_fetch(common_name, as_file_like):
    path = None if as_file_like else biotite.temp_dir()
    db_name = "Protein" if common_name else "protein"
    file = entrez.fetch("1L2Y_A", path, "fa", db_name, "fasta", overwrite=True)
    fasta_file = fasta.FastaFile()
    fasta_file.read(file)
    prot_seq = fasta.get_sequence(fasta_file)
Exemplo n.º 5
0
def test_fetch():
    file = entrez.fetch("1L2Y_A",
                        biotite.temp_dir(),
                        "fa",
                        "protein",
                        "fasta",
                        overwrite=True)
    fasta_file = fasta.FastaFile()
    fasta_file.read(file)
    prot_seq = fasta.get_sequence(fasta_file)
Exemplo n.º 6
0
def test_search_sequence():
    IDENTIY_CUTOFF = 0.9
    pdbx_file = pdbx.PDBxFile.read(join(data_dir("structure"), "1l2y.cif"))
    ref_sequence = pdbx.get_sequence(pdbx_file)[0]
    query = rcsb.SequenceQuery(ref_sequence,
                               "protein",
                               min_identity=IDENTIY_CUTOFF)
    test_ids = rcsb.search(query)

    for id in test_ids:
        fasta_file = fasta.FastaFile.read(rcsb.fetch(id, "fasta"))
        test_sequence = fasta.get_sequence(fasta_file)
        matrix = align.SubstitutionMatrix.std_protein_matrix()
        alignment = align.align_optimal(ref_sequence,
                                        test_sequence,
                                        matrix,
                                        terminal_penalty=False)[0]
        identity = align.get_sequence_identity(alignment, mode="shortest")
        assert identity >= IDENTIY_CUTOFF
Exemplo n.º 7
0
gb_file = gb.GenBankFile()
gb_file.read(file_name)
annot_seq = gb_file.get_annotated_sequence(include_only=["gene"])
# Find leuL gene
for feature in annot_seq.annotation:
    if "gene" in feature.qual and feature.qual["gene"] == "leuL":
        leul_feature = feature
# Get leuL sequence
leul_seq = annot_seq[leul_feature]

# Download Salmonella enterica genome without annotations
file_name = entrez.fetch("CP019649", biotite.temp_dir(), "fa", "nuccore",
                         "fasta")
fasta_file = fasta.FastaFile()
fasta_file.read(file_name)
se_genome = fasta.get_sequence(fasta_file)
# Find leuL in genome by local alignment
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
# Use general gap penalty to save RAM
alignments = align.align_optimal(leul_seq,
                                 se_genome,
                                 matrix,
                                 gap_penalty=-7,
                                 local=True)
# Do the same for reverse complement genome
se_genome_rev = se_genome.reverse().complement()
rev_alignments = align.align_optimal(leul_seq,
                                     se_genome_rev,
                                     matrix,
                                     gap_penalty=-7,
                                     local=True)
Exemplo n.º 8
0
# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.graphics as graphics
import biotite.application.muscle as muscle
import biotite.application.blast as blast
import biotite.database.entrez as entrez
import matplotlib.pyplot as plt

# Download sequence of Streptococcus pyogenes Cas9
file_name = entrez.fetch("Q99ZW2", biotite.temp_dir(), "fa", "protein", "fasta")
file = fasta.FastaFile.read(file_name)
ref_seq = fasta.get_sequence(file)
# Find homologous proteins using NCBI Blast
# Search only the UniProt/SwissProt database
blast_app = blast.BlastWebApp("blastp", ref_seq, "swissprot", obey_rules=False)
blast_app.start()
blast_app.join()
alignments = blast_app.get_alignments()
# Get hit IDs for hits with score > 200
hits = []
for ali in alignments:
    if ali.score > 200:
        hits.append(ali.hit_id)
# Get the sequences from hit IDs
hit_seqs = []
for hit in hits:
    file_name = entrez.fetch(hit, biotite.temp_dir(), "fa", "protein", "fasta")
Exemplo n.º 9
0
for header, string in file.items():
    print("Header:", header)
    print(len(string))
    print("Sequence:", string[:50], "...")
    print("Sequence length:", len(string))

########################################################################
# Since there is only a single sequence in the file, the loop is run
# only one time.
# As the sequence string is very long, only the first 50 bp are printed.
# Now this string could be used as input parameter for creation of a
# :class:`NucleotideSequence`.
# But we want to spare ourselves some unnecessary work, there is already
# a convenience function for that:

dna_seq = fasta.get_sequence(file)
print(type(dna_seq).__name__)
print(dna_seq[:50])

########################################################################
# In this form :func:`get_sequence()` returns the first sequence in the
# file, which is also the only sequence in most cases.
# If you want the sequence corresponding to a specific header, you have
# to specifiy the :obj:`header` parameter.
# The function even automatically recognizes, if the file contains a
# DNA or protein sequence and returns a :class:`NucleotideSequence` or
# :class:`ProteinSequence`, instance respectively.
# Actually, it just tries to create a :class:`NucleotideSequence`,
# and if this fails, a :class:`ProteinSequence` is created instead.
#
# Sequences can be written into FASTA files in a similar way: either via
Exemplo n.º 10
0
opt_codons = {}
for amino_acid_code in range(20):
    codon_codes_for_aa = table[amino_acid_code]
    # Find codon with maximum frequency
    max_freq = 0
    best_codon_code = None
    for codon_code in codon_codes_for_aa:
        if codon_counter[codon_code] > max_freq:
            max_freq = codon_counter[codon_code]
            best_codon_code = codon_code
    # Map the amino acid to the codon with maximum frequency
    opt_codons[amino_acid_code] = best_codon_code

# Fetch the streptavidin protein sequence from Streptomyces avidinii
fasta_file = fasta.FastaFile.read(
    entrez.fetch("P22629", biotite.temp_dir(), "fasta", "protein", "fasta"))
strep_prot_seq = fasta.get_sequence(fasta_file)
# Create a DNA sequence from the protein sequence
# using the optimal codons
strep_dna_seq = seq.NucleotideSequence()
strep_dna_seq.code = np.concatenate(
    [opt_codons[amino_acid_code] for amino_acid_code in strep_prot_seq.code])
# Add stop codon
strep_dna_seq += seq.NucleotideSequence("TAA")
# Put the DNA sequence into a FASTA file
fasta_file = fasta.FastaFile()
fasta_file["Codon optimized streptavidin"] = str(strep_dna_seq)
# Print the contents of the created FASTA file
print(fasta_file)
# In a real application it would be written onto the hard drive via
# fasta_file.write("some_file.fasta")