def test_fetch_single_file(as_file_like):
    """
    Download two protein entries from NCBI Entrez as a single FASTA
    file and verify that exactly two sequences are parsed from it.

    Parameters
    ----------
    as_file_like : bool
        If true, ``None`` is given as file name to
        ``fetch_single_file()``; otherwise a temporary ``.fa`` file
        path is given.
    """
    # Choose the download target depending on the test variant
    if as_file_like:
        target = None
    else:
        target = biotite.temp_file("fa")

    uids = ["1L2Y_A", "3O5R_A"]
    fetched = entrez.fetch_single_file(uids, target, "protein", "fasta")

    # One sequence per requested UID is expected in the merged file
    sequences = fasta.get_sequences(fasta.FastaFile.read(fetched))
    assert len(sequences) == 2
def test_fetch_single_file():
    """
    Download two protein entries from NCBI Entrez into one temporary
    FASTA file and verify that exactly two sequences are parsed from it.
    """
    uids = ["1L2Y_A", "3O5R_A"]
    target_path = biotite.temp_file("fa")
    fetched = entrez.fetch_single_file(uids, target_path, "protein", "fasta")

    # Parse the merged download with an initially empty FastaFile
    parsed = fasta.FastaFile()
    parsed.read(fetched)

    # One sequence per requested UID is expected
    assert len(fasta.get_sequences(parsed)) == 2
def test_fetch_single_file(as_file_like):
    """
    Download two protein entries from NCBI Entrez as a single FASTA
    file and verify that exactly two sequences are parsed from it.

    Parameters
    ----------
    as_file_like : bool
        If true, ``None`` is given as file name to
        ``fetch_single_file()`` (presumably this yields a file-like
        object instead of a file on disk -- confirm against the
        ``biotite.database.entrez`` documentation).
        Otherwise the download target is a named temporary ``.fa``
        file.
    """
    if as_file_like:
        file = None
        file_name = None
    else:
        file = tempfile.NamedTemporaryFile("r", suffix=".fa")
        file_name = file.name

    # The temporary file must also be released if the download, the
    # parsing or the assertion fails, hence the 'finally' clause
    try:
        downloaded_file_name = entrez.fetch_single_file(
            ["1L2Y_A", "3O5R_A"], file_name, "protein", "fasta"
        )
        fasta_file = fasta.FastaFile.read(downloaded_file_name)
        prot_seqs = fasta.get_sequences(fasta_file)
        # One sequence per requested UID is expected
        assert len(prot_seqs) == 2
    finally:
        if file is not None:
            file.close()
show_line_position=show_line_position, spacing=spacing) twin = axes.get_shared_x_axes().get_siblings(axes)[0] for ax in (axes, twin): ax.set_yticklabels(ax.get_yticklabels(), fontdict={"color": "white"}) axes.get_figure().patch.set_facecolor("#181818") # Using cyclotide sequences as example query = (entrez.SimpleQuery("Cyclotide") & entrez.SimpleQuery("cter") & entrez.SimpleQuery("srcdb_swiss-prot", field="Properties") ^ entrez.SimpleQuery("Precursor")) uids = entrez.search(query, "protein") fasta_file = fasta.FastaFile.read( entrez.fetch_single_file(uids, None, "protein", "fasta")) sequence_dict = fasta.get_sequences(fasta_file) headers = list(sequence_dict.keys()) sequences = list(sequence_dict.values()) labels = [header[-1] for header in headers] # Perform a multiple sequence alignment matrix = align.SubstitutionMatrix.std_protein_matrix() alignment, order, _, _ = align.align_multiple(sequences, matrix) # Order alignment according to guide tree alignment = alignment[:, order.tolist()] labels = [labels[i] for i in order] # Visualize the alignment using the new alignment plotter fig = plt.figure(figsize=(8.0, 3.7)) ax = fig.add_subplot(111)
# A list of valid database, retrieval type and mode combinations can # be found # `here <https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly>`_. # Furthermore, :func:`get_database_name()` can be helpful to get the # required database name by the more commonly known names. print(entrez.get_database_name("Nucleotide")) ######################################################################## # The *Entrez* database allows for packing data for multiple UIDs into a # single file. This is achieved with the :func:`fetch_single_file()` # function. temp_file = NamedTemporaryFile(suffix=".fasta") file_path = entrez.fetch_single_file(["1L2Y_A", "1AKI_A"], temp_file.name, db_name="protein", ret_type="fasta") print(file_path) temp_file.close() ######################################################################## # Similar to the *RCSB PDB*, you can also search every # `field <https://www.ncbi.nlm.nih.gov/books/NBK49540/>`_ # of the *NCBI Entrez* database. # Search in all fields print(entrez.SimpleQuery("BL21 genome")) # Search in the 'Organism' field print(entrez.SimpleQuery("Escherichia coli", field="Organism")) ########################################################################
""" # Code source: Patrick Kunzmann # License: BSD 3 clause import matplotlib.pyplot as plt import biotite import biotite.sequence as seq import biotite.sequence.align as align import biotite.sequence.io.fasta as fasta import biotite.database.entrez as entrez import biotite.sequence.graphics as graphics # Download and parse protein sequences of avidin and streptavidin file_name = entrez.fetch_single_file(["CAC34569", "ACL82594"], biotite.temp_file("sequences.fasta"), "protein", "fasta") file = fasta.FastaFile.read(file_name) for name, sequence in file.items(): if "CAC34569" in name: avidin_seq = seq.ProteinSequence(sequence) elif "ACL82594" in name: streptavidin_seq = seq.ProteinSequence(sequence) # Get BLOSUM62 matrix matrix = align.SubstitutionMatrix.std_protein_matrix() # Perform pairwise sequence alignment with affine gap penalty # Terminal gaps are not penalized alignments = align.align_optimal(avidin_seq, streptavidin_seq, matrix, gap_penalty=(-10, -1),
show_line_position=show_line_position, spacing=spacing) twin = axes.get_shared_x_axes().get_siblings(axes)[0] for ax in (axes, twin): ax.set_yticklabels(ax.get_yticklabels(), fontdict={"color": "white"}) axes.get_figure().patch.set_facecolor("#181818") # Using cyclotide sequences as example query = (entrez.SimpleQuery("Cyclotide") & entrez.SimpleQuery("cter") & entrez.SimpleQuery("srcdb_swiss-prot", field="Properties") ^ entrez.SimpleQuery("Precursor")) uids = entrez.search(query, "protein") fasta_file = fasta.FastaFile.read( entrez.fetch_single_file(uids, biotite.temp_file("fa"), "protein", "fasta")) sequence_dict = fasta.get_sequences(fasta_file) headers = list(sequence_dict.keys()) sequences = list(sequence_dict.values()) labels = [header[-1] for header in headers] # Perform a multiple sequence alignment matrix = align.SubstitutionMatrix.std_protein_matrix() alignment, order, _, _ = align.align_multiple(sequences, matrix) # Order alignment according to guide tree alignment = alignment[:, order.tolist()] labels = [labels[i] for i in order] # Visualize the alignment using the new alignment plotter fig = plt.figure(figsize=(8.0, 3.7)) ax = fig.add_subplot(111)
######################################################################## # A list of valid database, retrieval type and mode combinations can # be found # `here <https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly>`_. # Furthermore, :func:`get_database_name()` can be helpful to get the # required database name by the more commonly known names. print(entrez.get_database_name("Nucleotide")) ######################################################################## # The *Entrez* database allows for packing data for multiple UIDs into a # single file. This is achieved with the :func:`fetch_single_file()` # function. file_path = entrez.fetch_single_file(["1L2Y_A", "1AKI_A"], biotite.temp_file("fa"), db_name="protein", ret_type="fasta") print(relpath(file_path)) ######################################################################## # Similar to the *RCSB PDB*, you can also search in the *NCBI Entrez* # database, but in an even more powerful manner: # Due to the simple design of the search queries accepted by # *NCBI Entrez*, you can search in every # `field <https://www.ncbi.nlm.nih.gov/books/NBK49540/>`_ # of the database. # Search in all fields print(entrez.SimpleQuery("BL21 genome")) # Search in the 'Organism' field print(entrez.SimpleQuery("Escherichia coli", field="Organism"))
import numpy as np import matplotlib.pyplot as plt from matplotlib.collections import LineCollection import biotite import biotite.sequence as seq import biotite.sequence.io.fasta as fasta import biotite.sequence.io.genbank as gb import biotite.sequence.graphics as seqgraphics import biotite.sequence.align as align import biotite.database.entrez as entrez import biotite.structure.graphics as strucgraphics import biotite.application.viennarna as viennarna # Download Escherichia coli BL21 and Salmonella enterica genome gb_file = gb.MultiFile.read( entrez.fetch_single_file(["CP001509", "CP019649"], None, "nuccore", "gb")) ec_file, se_file = tuple(gb_file) annot_seq = gb.get_annotated_sequence(ec_file, include_only=["ncRNA"]) # Find M1 gene in E. coli genome via its annotation for feature in annot_seq.annotation: if "product" in feature.qual and "RNase P" in feature.qual["product"]: m1_sequence = annot_seq[feature] # Get S. enterica genome sequence se_genome = gb.get_sequence(se_file) # We want to search in the genome sequence and its reverse complement genomic_seqs = [se_genome, se_genome.reverse().complement()] ########################################################################
r"$\sigma^{32}$": "rpoH", r"$\sigma^{38}$": "rpoS", }) # Find SwissProt entries for these genes in NCBI Entrez protein database uids = [] for name, gene in genes.items(): query = entrez.SimpleQuery(gene, "Gene Name") \ & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") \ & entrez.SimpleQuery("Escherichia coli K-12", "Organism") ids = entrez.search(query, "protein") # Only one entry per gene in E. coli K-12 is expected assert len(ids) == 1 uids += ids # Download corresponding GenBank files as single, merged file file = entrez.fetch_single_file(uids, None, "protein", ret_type="gb") # Array that will hold for each of the genes and each of the 4 domains # the first and last position # The array is initially filled with -1, as the value -1 will indicate # that the domain does not exist in the sigma factor domain_pos = np.full((len(genes), 4, 2), -1, dtype=int) # Array that will hold the total sequence length of each sigma factor seq_lengths = np.zeros(len(genes), dtype=int) # Read the merged file containing multiple GenBank entries multi_file = gb.MultiFile.read(file) # Iterate over each GenBank entry for i, gb_file in enumerate(multi_file): _, length, _, _, _, _ = gb.get_locus(gb_file) seq_lengths[i] = length annotation = gb.get_annotation(gb_file)
import biotite.sequence.graphics as graphics UNIPROT_IDS = dict( hHCN1="O60741", hHCN2="Q9UL51", hHCN3="Q9P1Z3", hHCN4="Q9Y3Q4", spHCN="O76977", hEAG1="O95259", hERG1="Q12809", KAT1="Q39128", ) ### fetch sequences for UniProt IDs from NCBI Entrez fasta_file = fasta.FastaFile.read( entrez.fetch_single_file(list(UNIPROT_IDS.values()), None, "protein", "fasta")) sequences = { name: seq.ProteinSequence(seq_str) for name, seq_str in zip(UNIPROT_IDS.keys(), fasta_file.values()) } ### create a simple phylogenetic tree # create MSA alignment = clustalo.ClustalOmegaApp.align(list(sequences.values())) # build simple tree based on deviation from sequence identity distances = 1 - align.get_pairwise_sequence_identity(alignment, mode="shortest") tree = phylo.upgma(distances) ### plot the tree fig, ax = plt.subplots(1, 1, figsize=(8, 5))
import numpy as np import matplotlib.pyplot as plt import biotite.sequence as seq import biotite.sequence.io.fasta as fasta import biotite.sequence.io.genbank as gb import biotite.sequence.graphics as graphics import biotite.application.clustalo as clustalo import biotite.database.entrez as entrez # Search for protein products of LexA gene in UniProtKB/Swiss-Prot database query = entrez.SimpleQuery("lexA", "Gene Name") \ & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") # Search for the first 200 hits # More than 200 UIDs are not recommended for the EFetch service # for a single fetch uids = entrez.search(query, db_name="protein", number=200) file = entrez.fetch_single_file(uids, None, db_name="protein", ret_type="gp") # The file contains multiple concatenated GenPept files # -> Usage of MultiFile multi_file = gb.MultiFile.read(file) # Separate MultiFile into single GenBankFile instances files = [f for f in multi_file] print("Definitions:") for file in files[:20]: print(gb.get_definition(file)) print() print("Sources:") for file in files[:20]: print(gb.get_source(file)) ######################################################################## # The names of the sources are too long to be properly displayed later
from scipy.stats import linregress import biotite import biotite.sequence as seq import biotite.sequence.align as align from biotite.sequence.align.alignment import score import biotite.sequence.io.fasta as fasta import biotite.database.entrez as entrez import biotite.sequence.graphics as graphics GAP_PENALTY = (-12, -1) # Download and parse protein sequences of avidin and streptavidin fasta_file = fasta.FastaFile.read(entrez.fetch_single_file( ["CAC34569", "ACL82594"], None, "protein", "fasta" )) for name, sequence in fasta_file.items(): if "CAC34569" in name: query_seq = seq.ProteinSequence(sequence) elif "ACL82594" in name: hit_seq = seq.ProteinSequence(sequence) # Get BLOSUM62 matrix matrix = align.SubstitutionMatrix.std_protein_matrix() # Perform pairwise sequence alignment with affine gap penalty # Terminal gaps are not penalized alignment = align.align_optimal( query_seq, hit_seq, matrix,
import biotite.sequence.graphics as graphics import biotite.database.entrez as entrez import biotite.application.clustalo as clustalo # Search for DNA sequences that belong to the cited article query = entrez.SimpleQuery("Forensic Sci. Int.", "Journal") \ & entrez.SimpleQuery("159", "Volume") \ & entrez.SimpleQuery("132-140", "Page Number") uids = entrez.search(query, db_name="nuccore") # Download and read file containing the Genbank records for the THCA # synthase genes multi_file = gb.MultiFile() multi_file.read( entrez.fetch_single_file(uids, file_name=None, db_name="nuccore", ret_type="gb")) # This dictionary maps the strain ID to the protein sequence sequences = {} for gb_file in multi_file: annotation = gb.get_annotation(gb_file) # Find ID of strain in 'source' feature strain = None for feature in annotation: if feature.key == "source": strain = int(feature.qual["strain"]) assert strain is not None
import matplotlib.pyplot as plt from matplotlib.gridspec import GridSpec import biotite import biotite.sequence as seq import biotite.sequence.io.fasta as fasta import biotite.sequence.align as align import biotite.sequence.graphics as graphics import biotite.database.entrez as entrez # Generate example alignment # (the same as in the bacterial luciferase example) query = entrez.SimpleQuery("luxA", "Gene Name") \ & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") uids = entrez.search(query, db_name="protein") file_name = entrez.fetch_single_file(uids, biotite.temp_file("fasta"), db_name="protein", ret_type="fasta") fasta_file = fasta.FastaFile.read(file_name) sequences = [seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()] matrix = align.SubstitutionMatrix.std_protein_matrix() alignment, order, _, _ = align.align_multiple(sequences, matrix) # Order alignment according to the guide tree alignment = alignment[:, order] alignment = alignment[220:300] # Get color scheme names alphabet = seq.ProteinSequence.alphabet schemes = [ "rainbow", "clustalx", "flower", "blossom", "spring", "wither", "autumn", "sunset", "ocean", "zappo", "taylor", "buried", "hydrophobicity", "prophelix", "propstrand", "propturn"
if SPECIES in line: # Uniprot/NCBI ID in second column, surrounded by brackets ncbi_id = line.split()[1].replace("(", "").replace(")", "") # Gene is surrounded by square brackets gene = line[gene_start : gene_end+1] \ .replace("[","").replace("]","") # Sometimes alternative gene names are separated via a # semicolon -> Choose the first gene name gene = gene.split(";")[0].strip() genes.append(gene) ids.append(ncbi_id) # Download sequences as a file-like object and read the sequences from it fasta_file = fasta.FastaFile.read( entrez.fetch_single_file(ids, file_name=None, db_name="protein", ret_type="fasta")) sequences = [seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()] # Create multiple sequence alignment with Clustal Omega alignment = clustalo.ClustalOmegaApp.align(sequences) # The distance measure required for the tree calculation is the # percentage of non-identical amino acids in the respective two # sequences distances = 1 - align.get_pairwise_sequence_identity(alignment, mode="shortest") # Create tree via neighbor joining tree = phylo.neighbor_joining(distances) # Convert to NetworkX graph # For the graph visualization, the edge directions are unnecessary graph = tree.as_graph().to_undirected()
import biotite.sequence.align as align import biotite.sequence.graphics as graphics import biotite.database.entrez as entrez import biotite.application.clustalo as clustalo # Search for DNA sequences that belong to the cited article query = entrez.SimpleQuery("Forensic Sci. Int.", "Journal") \ & entrez.SimpleQuery("159", "Volume") \ & entrez.SimpleQuery("132-140", "Page Number") uids = entrez.search(query, db_name="nuccore") # Download and read file containing the Genbank records for the THCA # synthase genes multi_file = gb.MultiFile.read(entrez.fetch_single_file( uids, file_name=None, db_name="nuccore", ret_type="gb" )) # This dictionary maps the strain ID to the protein sequence sequences = {} for gb_file in multi_file: annotation = gb.get_annotation(gb_file) # Find ID of strain in 'source' feature strain = None for feature in annotation: if feature.key == "source": strain = int(feature.qual["strain"]) assert strain is not None
r"$\sigma^{38}$": "rpoS", }) # Find SwissProt entries for these genes in NCBI Entrez protein database uids = [] for name, gene in genes.items(): query = entrez.SimpleQuery(gene, "Gene Name") \ & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") \ & entrez.SimpleQuery("Escherichia coli K-12", "Organism") ids = entrez.search(query, "protein") # Only one entry per gene in E. coli K-12 is expected assert len(ids) == 1 uids += ids # Download corresponding GenBank files as single, merged file file_name = entrez.fetch_single_file(uids, biotite.temp_file("gb"), "protein", ret_type="gb") # Array that will hold for each of the genes and each of the 4 domains # the first and last position # The array is initially filled with -1, as the value -1 will indicate # that the domain does not exist in the sigma factor domain_pos = np.full((len(genes), 4, 2), -1, dtype=int) # Array that will hold the total sequence length of each sigma factor seq_lengths = np.zeros(len(genes), dtype=int) # Read the merged file containing multiple GenBank entries multi_file = gb.MultiFile() multi_file.read(file_name) # Iterate over each GenBank entry for i, gb_file in enumerate(multi_file): _, length, _, _, _, _ = gb.get_locus(gb_file)
with streptavidin (*Streptomyces lavendulae*). """ # Code source: Patrick Kunzmann # License: BSD 3 clause import matplotlib.pyplot as plt import biotite.sequence as seq import biotite.sequence.align as align import biotite.sequence.io.fasta as fasta import biotite.database.entrez as entrez import biotite.sequence.graphics as graphics # Download and parse protein sequences of avidin and streptavidin fasta_file = fasta.FastaFile.read( entrez.fetch_single_file(["CAC34569", "ACL82594"], None, "protein", "fasta")) for name, sequence in fasta_file.items(): if "CAC34569" in name: avidin_seq = seq.ProteinSequence(sequence) elif "ACL82594" in name: streptavidin_seq = seq.ProteinSequence(sequence) # Get BLOSUM62 matrix matrix = align.SubstitutionMatrix.std_protein_matrix() # Perform pairwise sequence alignment with affine gap penalty # Terminal gaps are not penalized alignments = align.align_optimal(avidin_seq, streptavidin_seq, matrix, gap_penalty=(-10, -1), terminal_penalty=False)