def test_conversion_highlevel(path):
    """
    Test whether the high-level GenBank interface can properly read
    the locus, annotation and sequence from GenBank file and write
    these properties to a file, without data changing.

    Parameters
    ----------
    path : str
        Path of the GenBank file to round-trip.
        Its last two characters are passed as the ``format`` argument
        to :func:`get_annotated_sequence()`.
    """
    suffix = path[-2:]
    gb_file = gb.GenBankFile.read(path)
    ref_locus = gb.get_locus(gb_file)
    ref_annot_seq = gb.get_annotated_sequence(gb_file, format=suffix)

    # Build a fresh file exclusively from the extracted properties
    gb_file = gb.GenBankFile()
    gb.set_locus(gb_file, *ref_locus)
    gb.set_annotated_sequence(gb_file, ref_annot_seq)

    # The context manager closes the temporary file even if writing or
    # re-reading raises, so a failing round trip does not leak the handle
    with TemporaryFile("w+") as temp:
        gb_file.write(temp)
        # Rewind, so the subsequent read starts at the written content
        temp.seek(0)
        gb_file = gb.GenBankFile.read(temp)

    test_locus = gb.get_locus(gb_file)
    test_annot_seq = gb.get_annotated_sequence(gb_file, format=suffix)
    assert test_locus == ref_locus
    assert test_annot_seq.sequence == ref_annot_seq.sequence
    assert test_annot_seq.annotation == ref_annot_seq.annotation
    assert test_annot_seq.sequence_start == ref_annot_seq.sequence_start
# coding region (at the terminator signal), # hence ``BEYOND_RIGHT`` is applied. # These two defects are also reflected in the *mRNA* feature. # # Annotated sequences # ^^^^^^^^^^^^^^^^^^^ # # Now, that you have understood what annotations are, we proceed to the # next topic: annotated sequences. # An :class:`AnnotatedSequence` is like an annotation, but the sequence # is included this time. # Since our GenBank file contains the # sequence corresponding to the feature table, we can directly obtain the # :class:`AnnotatedSequence`. annot_seq = gb.get_annotated_sequence(file) print("Same annotation as before?", (annotation == annot_seq.annotation)) print(annot_seq.sequence[:60], "...") ######################################################################## # When indexing an :class:`AnnotatedSequence` with a slice, # the index is applied to the :class:`Annotation` and the # :class:`Sequence`. # While the :class:`Annotation` handles the index as shown before, # the :class:`Sequence` is indexed based on the sequence start # value (usually *1*). print("Sequence start before indexing:", annot_seq.sequence_start) for feature in annot_seq.annotation: if feature.key == "regulatory" \ and feature.qual["regulatory_class"] == "polyA_signal_sequence":
def test_reverse_complement():
    """
    Check that applying the reverse complement to an annotated sequence
    twice gives back an annotated sequence equal to the original one.
    """
    genbank_path = join(data_dir("sequence"), "ec_bl21.gb")
    original = gb.get_annotated_sequence(gb.GenBankFile.read(genbank_path))

    restored = original.reverse_complement().reverse_complement()

    assert original == restored
# So we use a set to store the source name of sequences we already
# listed and ignore all further occurrences of that source species.

# List of sequences
binding_sites = []
# List of source species
sources = []
# Set for ignoring already listed sources
listed_sources = set()
for file, source in zip(files, all_sources):
    if source in listed_sources:
        # Ignore already listed species
        continue
    bind_feature = None
    annot_seq = gb.get_annotated_sequence(
        file, include_only=["Site"], format="gp"
    )
    # Find the feature for DNA-binding site
    # (if several features match, the last one is kept)
    for feature in annot_seq.annotation:
        # DNA binding site is a helix-turn-helix motif
        # A DNA binding site without a 'note' qualifier is skipped
        # instead of aborting the whole loop with a KeyError
        if (
            "site_type" in feature.qual
            and feature.qual["site_type"] == "DNA binding"
            and "note" in feature.qual
            and "H-T-H motif" in feature.qual["note"]
        ):
            bind_feature = feature
    if bind_feature is not None:
        # If the feature is found,
        # get the sequence slice that is defined by the feature...
        binding_sites.append(annot_seq[bind_feature])
        # ...and save the respective source species
        sources.append(source)
        listed_sources.add(source)
)[0] identity = align.get_sequence_identity(genome_alignment, 'all') print(f"Sequence identity: {identity * 100:.2f} %") ######################################################################## # Now we would like to have a closer look at the mutation locations. # To contextualize the locations we plot the mutation frequency along # with the gene locations. # The genomic coordinates for each gene can be extracted from the # already downloaded *GenBank* file of the reference genome. N_BINS = 50 # Get genomic coordinates for all SARS-Cov-2 genes gb_file = gb.GenBankFile.read(orig_genome_file) annot_seq = gb.get_annotated_sequence(gb_file, include_only=["gene"]) # Calculate the sequence identity within each bin bin_identities = np.zeros(N_BINS) edges = np.linspace(0, len(orig_genome), N_BINS+1) for i, (bin_start, bin_stop) in enumerate(zip(edges[:-1], edges[1:])): orig_genome_trace = genome_alignment.trace[:,1] excerpt = genome_alignment[ (orig_genome_trace >= bin_start) & (orig_genome_trace < bin_stop) ] bin_identities[i] = align.get_sequence_identity(excerpt, "all") fig, (deviation_ax, feature_ax) = plt.subplots(nrows=2, figsize=(8.0, 5.0)) # Plot the deviation = 1 - sequence identity
# Code source: Patrick Kunzmann
# License: BSD 3 clause

import itertools
import numpy as np
import biotite
import biotite.sequence as seq
import biotite.sequence.io.genbank as gb
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez


# Fetch the GenBank entry of the E. coli K-12 genome and read it
gb_file = gb.GenBankFile.read(
    entrez.fetch("U00096", biotite.temp_dir(), "gb", "nuccore", "gb")
)
# Only the CDS features are kept in the annotated sequence
k12_genome = gb.get_annotated_sequence(gb_file, include_only=["CDS"])

# Counter for the occurrences of each codon in the genome:
# every combination of three symbol codes starts with a count of zero.
# Symbol codes ([0 3 2]) instead of symbols (['A' 'T' 'G']) are used
# as keys for increased performance
codon_counter = dict.fromkeys(
    itertools.product(
        range(len(k12_genome.sequence.alphabet)), repeat=3
    ),
    0
)
# For demonstration purposes print the 64 codons in symbol code form
print(list(codon_counter))

########################################################################
# As expected the dictionary encodes each codon as tuple of 3 numbers,
# where ``0`` represents ``A``, ``1`` ``C``, ``2`` ``G`` and ``3`` ``T``.
import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as seqgraphics
import biotite.sequence.align as align
import biotite.database.entrez as entrez
import biotite.structure.graphics as strucgraphics
import biotite.application.viennarna as viennarna


# Download the Escherichia coli BL21 and Salmonella enterica genomes
# as a single multi-record GenBank file
gb_file = gb.MultiFile.read(
    entrez.fetch_single_file(
        ["CP001509", "CP019649"], None, "nuccore", "gb"
    )
)
ec_file, se_file = tuple(gb_file)

# Only ncRNA features of the E. coli genome are of interest here
annot_seq = gb.get_annotated_sequence(ec_file, include_only=["ncRNA"])
# The M1 gene is identified via the 'product' qualifier of its feature
for feature in annot_seq.annotation:
    if "product" not in feature.qual:
        continue
    if "RNase P" in feature.qual["product"]:
        m1_sequence = annot_seq[feature]

# The S. enterica genome is searched on both strands:
# the sequence itself and its reverse complement
se_genome = gb.get_sequence(se_file)
genomic_seqs = [se_genome, se_genome.reverse().complement()]

########################################################################
# In an initial fast matching step, we look for matching *k-mers*
# between *M1* and the *S. enterica* genome.
# A matching *k-mer* is a length *k* subsequence, that appears in both