# The names of the sigma factors and the corresponding genes
# The pairs are passed as a list of tuples instead of a dict literal,
# so the order of the sigma factors does not depend on the ordering
# behavior of plain dictionaries
genes = OrderedDict([
    (r"$\sigma^{70}$", "rpoD"),
    (r"$\sigma^{24}$", "rpoE"),
    (r"$\sigma^{28}$", "rpoF"),
    (r"$\sigma^{32}$", "rpoH"),
    (r"$\sigma^{38}$", "rpoS"),
])

# Find SwissProt entries for these genes in NCBI Entrez protein database
uids = []
for name, gene in genes.items():
    query = (
        entrez.SimpleQuery(gene, "Gene Name")
        & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
        & entrez.SimpleQuery("Escherichia coli K-12", "Organism")
    )
    ids = entrez.search(query, "protein")
    # Only one entry per gene in E. coli K-12 is expected
    # An explicit exception is raised instead of using 'assert',
    # because the check validates data from a remote service and must
    # not vanish when Python runs with the '-O' option
    if len(ids) != 1:
        raise ValueError(
            f"Expected exactly one entry for gene '{gene}', "
            f"but found {len(ids)}"
        )
    uids += ids

# Download corresponding GenBank files as single, merged file
file_name = entrez.fetch_single_file(
    uids, biotite.temp_file("gb"), "protein", ret_type="gb"
)

# Array that will hold for each of the genes and each of the 4 domains
# the first and last position
# The array is initially filled with -1, as the value -1 will indicate
# that the domain does not exist in the sigma factor
domain_pos = np.full((len(genes), 4, 2), -1, dtype=int)
# Array that will hold the total sequence length of each sigma factor
import numpy as np import matplotlib.pyplot as plt from matplotlib.gridspec import GridSpec import biotite import biotite.sequence as seq import biotite.sequence.io.fasta as fasta import biotite.sequence.align as align import biotite.sequence.graphics as graphics import biotite.database.entrez as entrez # Generate example alignment # (the same as in the bacterial luciferase example) query = entrez.SimpleQuery("luxA", "Gene Name") \ & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") uids = entrez.search(query, db_name="protein") file_name = entrez.fetch_single_file(uids, biotite.temp_file("fasta"), db_name="protein", ret_type="fasta") fasta_file = fasta.FastaFile.read(file_name) sequences = [seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()] matrix = align.SubstitutionMatrix.std_protein_matrix() alignment, order, _, _ = align.align_multiple(sequences, matrix) # Order alignment according to the guide tree alignment = alignment[:, order] alignment = alignment[220:300] # Get color scheme names alphabet = seq.ProteinSequence.alphabet schemes = [
import numpy as np
import matplotlib.pyplot as plt
import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
import biotite.application.clustalo as clustalo
import biotite.database.entrez as entrez

# Query for protein products of the LexA gene,
# restricted to the UniProtKB/Swiss-Prot database
query = (
    entrez.SimpleQuery("lexA", "Gene Name")
    & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
)
# Only the first 200 hits are requested:
# More than 200 UIDs are not recommended for the EFetch service
uids = entrez.search(query, db_name="protein", number=200)
file_name = entrez.fetch_single_file(
    uids,
    biotite.temp_file("lexa.gb"),
    db_name="protein",
    ret_type="gb",
)

# The downloaded file consists of multiple concatenated GenPept files
# -> A MultiFile is required to read it
multi_file = gb.MultiFile("gp")
multi_file.read(file_name)
# Split the MultiFile into its individual GenPeptFile instances
files = list(multi_file)

# Show definition and source of the first 10 entries
print("Definitions:")
for file in files[:10]:
    print(file.get_definition())
print()
print("Sources:")
for file in files[:10]:
    print(file.get_source())
ret_type="fasta") print(file_path) temp_file.close() ######################################################################## # Similar to the *RCSB PDB*, you can also search every # `field <https://www.ncbi.nlm.nih.gov/books/NBK49540/>`_ # of the *NCBI Entrez* database. # Search in all fields print(entrez.SimpleQuery("BL21 genome")) # Search in the 'Organism' field print(entrez.SimpleQuery("Escherichia coli", field="Organism")) ######################################################################## # You can also combine multiple :class:`Query` objects in any way you # like using the binary operators ``|``, ``&`` and ``^``, # that represent ``OR``, ``AND`` and ``NOT`` linkage, respectively. composite_query = (entrez.SimpleQuery("50:100", field="Sequence Length") & (entrez.SimpleQuery("Escherichia coli", field="Organism") | entrez.SimpleQuery("Bacillus subtilis", field="Organism"))) print(composite_query) ######################################################################## # Finally, the query is given to the :func:`search()` function to obtain # the GIs, that can be used as input for :func:`fetch()`. # Return a maximum number of 10 entries gis = entrez.search(composite_query, "protein", number=10) print(gis)
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez
import biotite.application.clustalo as clustalo

# Search for DNA sequences that belong to the cited article
# The article is identified by journal, volume and page range
query = (
    entrez.SimpleQuery("Forensic Sci. Int.", "Journal")
    & entrez.SimpleQuery("159", "Volume")
    & entrez.SimpleQuery("132-140", "Page Number")
)
uids = entrez.search(query, db_name="nuccore")

# Download and read file containing the Genbank records for the THCA
# synthase genes
# No file name is given (file_name=None), the return value of the
# download is handed directly to the reader
multi_file = gb.MultiFile.read(entrez.fetch_single_file(
    uids, file_name=None, db_name="nuccore", ret_type="gb"
))

# This dictionary maps the strain ID to the protein sequence
sequences = {}

for gb_file in multi_file:
    annotation = gb.get_annotation(gb_file)
    # Find ID of strain in 'source' feature