def test_conversion_highlevel(path):
    """
    Round-trip a GenBank file through the high-level interface.

    The locus and the annotated sequence are read from `path`, written
    into a fresh file, read back again and compared with the values
    that were obtained from the original file.
    """
    # The last two characters of the path are passed on as `format`
    file_format = path[-2:]

    # Read the reference values from the original file
    source_file = gb.GenBankFile()
    source_file.read(path)
    ref_locus = gb.get_locus(source_file)
    ref_annot_seq = gb.get_annotated_sequence(
        source_file, format=file_format
    )

    # Put the reference values into a new file and write it to disk
    written_file = gb.GenBankFile()
    gb.set_locus(written_file, *ref_locus)
    gb.set_annotated_sequence(written_file, ref_annot_seq)
    temp_path = biotite.temp_file("gb")
    written_file.write(temp_path)

    # Read the written file again
    reread_file = gb.GenBankFile()
    reread_file.read(temp_path)
    test_locus = gb.get_locus(reread_file)
    test_annot_seq = gb.get_annotated_sequence(
        reread_file, format=file_format
    )

    # Nothing may have changed during the round trip
    assert test_locus == ref_locus
    assert test_annot_seq.sequence == ref_annot_seq.sequence
    assert test_annot_seq.annotation == ref_annot_seq.annotation
    assert test_annot_seq.sequence_start == ref_annot_seq.sequence_start
def test_genbank_conversion():
    """
    Compare the values returned by the getter methods of a
    `GenBankFile` with the known content of ``ec_bl21.gb``.
    """
    gb_file = gb.GenBankFile()
    gb_file.read(join(data_dir, "ec_bl21.gb"))

    # LOCUS field
    assert gb_file.get_locus()["length"] == "4558953"
    assert gb_file.get_locus()["type"] == "DNA circular"

    # Single-valued header fields
    expected_definition = "Escherichia coli BL21(DE3), complete genome."
    assert gb_file.get_definition() == expected_definition
    assert gb_file.get_version() == "CP001509.3"
    assert gb_file.get_gi() == "296142109"
    expected_db_link = {
        "BioProject": "PRJNA20713",
        "BioSample": "SAMN02603478",
    }
    assert gb_file.get_db_link() == expected_db_link

    # REFERENCE fields: all entries but the first one are checked
    assert len(gb_file.get_references()) == 5
    for reference in gb_file.get_references()[1:]:
        assert reference["location"] == (1, 4558953)
        assert reference["journal"].endswith("Republic of Korea")

    # COMMENT field
    expected_comment = (
        "On May 17, 2010 this sequence version replaced CP001509.2. "
        "Bacteria available from F. William Studier (studier\x40bnl.gov)."
    )
    assert gb_file.get_comment() == expected_comment

    # The feature at index 5 of the CDS-only annotation
    cds_annotation = gb_file.get_annotation(include_only=["CDS"])
    cds_feature = cds_annotation.get_features()[5]
    assert cds_feature.key == "CDS"
    assert cds_feature.qual["gene"] == "yaaA"
    assert cds_feature.qual["transl_table"] == "11"
    assert str(cds_feature.locs[0]) == "< 5681-6457"
def test_genbank_consistency(path):
    """
    Test whether the same annotation (if reasonable) can be read from a
    GFF3 file and a GenBank file.

    The GenBank file at `path` (relative to `data_dir`) provides the
    reference annotation. The GFF3 file is looked up under the same
    name with the last three characters of `path` replaced by
    ``.gff3``. Each CDS, gene, intron and exon feature read from the
    GFF3 file must also be present in the GenBank annotation.
    """
    gb_file = gb.GenBankFile()
    gb_file.read(join(data_dir, path))
    ref_annot = gb.get_annotation(gb_file)

    gff_file = gff.GFFFile()
    gff_file.read(join(data_dir, path[:-3] + ".gff3"))
    test_annot = gff.get_annotation(gff_file)

    # Remove qualifiers, since they will be different
    # in GFF3 and GenBank
    ref_annot = seq.Annotation(
        [seq.Feature(feature.key, feature.locs) for feature in ref_annot]
    )
    test_annot = seq.Annotation(
        [seq.Feature(feature.key, feature.locs) for feature in test_annot]
    )

    for feature in test_annot:
        # Only CDS, gene, intron and exon should be equal
        # in GenBank and GFF3
        if feature.key in ("CDS", "gene", "intron", "exon"):
            try:
                # Compare against the GenBank reference; checking
                # membership in `test_annot` itself would always
                # succeed, as the feature is taken from it
                assert feature in ref_annot
            except AssertionError:
                # Show the offending feature before failing
                print(feature.key)
                for loc in feature.locs:
                    print(loc)
                raise
def test_genbank_utility_gp():
    """
    Check whether the high-level utility functions return the expected
    content of a known GenPept file.
    """
    gp_file = gb.GenBankFile()
    gp_file.read(join(data_dir, "bt_lysozyme.gp"))

    assert gb.get_locus(gp_file) \
        == ("AAC37312", 147, "", False, "MAM", "27-APR-1993")
    assert gb.get_definition(gp_file) == "lysozyme [Bos taurus]."
    assert gb.get_version(gp_file) == "AAC37312.1"
    assert gb.get_gi(gp_file) == 163334

    annotation = gb.get_annotation(gp_file)
    # The feature that is expected to be part of the annotation
    feature = seq.Feature(
        "Site",
        [
            seq.Location(start, stop) for start, stop in zip(
                [52, 55, 62, 76, 78, 81, 117, 120, 125],
                [53, 55, 62, 76, 78, 81, 117, 120, 126]
            )
        ],
        {
            "note": "lysozyme catalytic cleft [active]",
            "site_type": "active"
        }
    )
    # A feature of the file matches if key and locations are equal and
    # if it contains every expected qualifier with the expected value;
    # additional qualifiers in the file are tolerated.
    # Note the parentheses: '(key, val) in items' tests the pair,
    # while '(key, val in items)' would build an always-truthy tuple
    in_annotation = any(
        f.key == feature.key
        and f.locs == feature.locs
        and all(
            (key, val) in f.qual.items()
            for key, val in feature.qual.items()
        )
        for f in annotation
    )
    assert in_annotation

    assert len(gb.get_sequence(gp_file, format="gp")) == 147
def test_genbank_utility_gb():
    """
    Check whether the high-level utility functions return the expected
    content of a known GenBank file.
    """
    gb_file = gb.GenBankFile()
    gb_file.read(join(data_dir, "ec_bl21.gb"))

    assert gb.get_locus(gb_file) \
        == ("CP001509", 4558953, "DNA", True, "BCT", "16-FEB-2017")
    assert gb.get_definition(gb_file) \
        == ("Escherichia coli BL21(DE3), complete genome.")
    assert gb.get_version(gb_file) == "CP001509.3"
    assert gb.get_gi(gb_file) == 296142109
    assert gb.get_db_link(gb_file) \
        == {"BioProject" : "PRJNA20713", "BioSample" : "SAMN02603478"}

    annotation = gb.get_annotation(gb_file, include_only=["CDS"])
    # The feature that is expected to be part of the annotation
    feature = seq.Feature(
        "CDS",
        [seq.Location(5681, 6457, seq.Location.Strand.REVERSE)],
        {
            "gene": "yaaA",
            "transl_table": "11"
        }
    )
    # A feature of the file matches if key and locations are equal and
    # if it contains every expected qualifier with the expected value;
    # additional qualifiers in the file are tolerated.
    # Note the parentheses: '(key, val) in items' tests the pair,
    # while '(key, val in items)' would build an always-truthy tuple
    in_annotation = any(
        f.key == feature.key
        and f.locs == feature.locs
        and all(
            (key, val) in f.qual.items()
            for key, val in feature.qual.items()
        )
        for f in annotation
    )
    assert in_annotation

    assert len(gb.get_sequence(gb_file, format="gb")) == 4558953
def test_conversion_lowlevel(path):
    """
    Round-trip a GenBank file through the low-level interface.

    All fields obtained by iterating over the file read from `path`
    are appended to a fresh file, which is written to disk and read
    back. The fields of the re-read file must equal the original ones.
    """
    # Collect the parsed fields of the original file
    source_file = gb.GenBankFile()
    source_file.read(path)
    ref_parsed_fields = list(source_file)

    # Rebuild the file field by field and write it
    rebuilt_file = gb.GenBankFile()
    for field_name, field_content, field_subfields in ref_parsed_fields:
        rebuilt_file.append(field_name, field_content, field_subfields)
    temp_path = biotite.temp_file("gb")
    rebuilt_file.write(temp_path)

    # Parse the written file again and compare
    reread_file = gb.GenBankFile()
    reread_file.read(temp_path)
    test_parsed_fields = list(reread_file)
    assert test_parsed_fields == ref_parsed_fields
def test_contiguous_field_pos(path):
    """
    Check whether the internal index of a GenBankFile is contiguous:
    the first entry must start at 0 and each following entry must
    start where its predecessor stops.
    """
    gb_file = gb.GenBankFile()
    gb_file.read(path)
    field_pos = gb_file._field_pos

    assert field_pos[0][0] == 0
    # Walk over all pairs of neighboring index entries
    for previous, current in zip(field_pos, field_pos[1:]):
        start, _, _ = current
        _, stop, _ = previous
        assert start == stop
def fetch_gb_annotation(pdb_chain: str):
    """
    Fetch the GenBank file of a protein chain from the NCBI Entrez
    'protein' database and extract its secondary structure features.

    Parameters
    ----------
    pdb_chain : str
        The identifier handed to `entrez.fetch()`, e.g. ``"6FRH_A"``.

    Returns
    -------
    annotation
        The result of `gb.get_annotation()` for the downloaded file,
        restricted to features with the key ``"SecStr"``.
    """
    # The file is downloaded into biotite's temporary directory
    file_name = entrez.fetch(
        pdb_chain, biotite.temp_dir(), "gb", "protein", "gb"
    )
    gb_file = gb.GenBankFile()
    gb_file.read(file_name)
    # Only the secondary structure features are of interest
    annotation = gb.get_annotation(gb_file, include_only=["SecStr"])
    return annotation
def test_file_access():
    """
    Exercise appending, inserting, getting, setting and deleting
    fields of an initially empty GenBank file.
    """
    gb_file = gb.GenBankFile()

    # Fill the file: one appended field, one inserted in front of it
    gb_file.append("SOMEFIELD", ["Some content", "some other content"])
    gb_file.insert(0, "OTHERFIELD", ["Additional content"])
    assert gb_file[1] == (
        "SOMEFIELD", ["Some content", "some other content"], {}
    )

    # Overwrite the second field and append a third one
    gb_file[1] = (
        "NEWFIELD", ["Extra content"], {"SUBFIELD" : ["L 1", "L 2"]}
    )
    gb_file.append("THIRDFIELD", ["Supplementary content"])
    assert len(gb_file) == 3

    # Remove the fields one after another from the front
    assert gb_file[0] == ("OTHERFIELD", ["Additional content"], {})
    del gb_file[0]
    assert gb_file[0] == (
        "NEWFIELD", ["Extra content"], {"SUBFIELD" : ["L 1", "L 2"]}
    )
    del gb_file[0]
    assert gb_file[0] == ("THIRDFIELD", ["Supplementary content"], {})
    del gb_file[0]
    assert len(gb_file) == 0
# Code source: Patrick Kunzmann # License: BSD 3 clause import biotite import biotite.sequence as seq import biotite.sequence.io.fasta as fasta import biotite.sequence.io.genbank as gb import biotite.sequence.graphics as graphics import biotite.sequence.align as align import biotite.database.entrez as entrez import numpy as np import matplotlib.pyplot as plt # Download E. coli BL21 genome file_name = entrez.fetch("CP001509", biotite.temp_dir(), "gb", "nuccore", "gb") gb_file = gb.GenBankFile() gb_file.read(file_name) annot_seq = gb_file.get_annotated_sequence(include_only=["gene"]) # Find leuL gene for feature in annot_seq.annotation: if "gene" in feature.qual and feature.qual["gene"] == "leuL": leul_feature = feature # Get leuL sequence leul_seq = annot_seq[leul_feature] # Download Salmonella enterica genome without annotations file_name = entrez.fetch("CP019649", biotite.temp_dir(), "fa", "nuccore", "fasta") fasta_file = fasta.FastaFile() fasta_file.read(file_name) se_genome = fasta.get_sequence(fasta_file)
def test_reverse_complement():
    """
    Check that applying `reverse_complement()` twice to the annotated
    sequence read from ``ec_bl21.gb`` gives back an annotated sequence
    equal to the original one.
    """
    gb_file = gb.GenBankFile()
    gb_file.read(join(data_dir, "ec_bl21.gb"))
    annot_seq = gb.get_annotated_sequence(gb_file)

    once_reversed = annot_seq.reverse_complement()
    twice_reversed = once_reversed.reverse_complement()
    assert annot_seq == twice_reversed