def plot_pb_scheme_alignment():
    """
    Plot an example alignment colored with a scheme generated for the
    16-letter alphabet ``a`` to ``p``.

    A substitution matrix for this alphabet is written to a temporary
    file and handed to ``gecli.main()`` together with the alphabet.
    The resulting scheme file is loaded and used to color the
    alignment read from ``PB_EXAMPLE_FILE_NAME``.

    Returns
    -------
    fig
        The figure created via ``plt.figure()`` that contains the
        alignment plot.
    """
    random.seed(1)

    scheme_file = biotite.temp_file("json")
    mat_file = biotite.temp_file("mat")
    with open(mat_file, "w") as file:
        # PB substitution matrix, adapted from PBxplore
        file.write("""
a b c d e f g h i j k l m n o p
a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83
b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22
c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6
d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497
e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632
f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552
g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254
h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399
i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226
j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104
k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382
l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316
m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155
n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146
o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58
p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609
""")
    # Generate the color scheme for the custom alphabet and matrix;
    # the result is written to 'scheme_file'
    gecli.main(args=[
        "--alphabet", "abcdefghijklmnop",
        "--matrix", mat_file,
        "--contrast", "300",
        "--lmin", "65",
        "--lmax", "70",
        "-f", scheme_file
    ])

    # Only the "colors" entry of the loaded scheme is needed for plotting
    colors = graphics.load_color_scheme(scheme_file)["colors"]
    fig = plt.figure(figsize=(8.0, 5.0))
    ax = fig.gca()
    pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop")
    fasta_file = fasta.FastaFile()
    fasta_file.read(PB_EXAMPLE_FILE_NAME)
    seq_strings = list(fasta_file.values())
    # The sequence objects are built from the strings without the '-'
    # characters, while the trace is derived from the unmodified strings
    sequences = [
        seq.GeneralSequence(pb_alphabet, seq_str.replace("-", ""))
        for seq_str in seq_strings
    ]
    trace = align.Alignment.trace_from_strings(seq_strings)
    alignment = align.Alignment(sequences, trace, score=None)
    graphics.plot_alignment_type_based(ax, alignment, symbols_per_line=60, spacing=2, color_scheme=colors)
    fig.tight_layout()
    return fig
def plot_show_score():
    """
    Run the color scheme generator with the ``--show-score`` option.

    The scheme itself is written to a temporary JSON file.

    Returns
    -------
    figure
        The figure that is current in ``matplotlib.pyplot`` after the
        generator has finished.
    """
    output_path = biotite.temp_file("json")
    generator_args = ["--seed", "0", "--show-score"]
    generator_args += ["--smin", "30"]
    generator_args += ["--lmin", "60", "--lmax", "70"]
    generator_args += ["-f", output_path]
    gecli.main(args=generator_args)
    return plt.gcf()
def test_fetch_single_file(as_file_like):
    """
    Fetch two protein entries as one FASTA file and check that
    exactly two sequences can be read from the result.

    If `as_file_like` is true, ``None`` is passed as file name,
    otherwise the path of a temporary file is passed.
    """
    if as_file_like:
        target = None
    else:
        target = biotite.temp_file("fa")
    fetched = entrez.fetch_single_file(
        ["1L2Y_A", "3O5R_A"], target, "protein", "fasta"
    )
    sequences = fasta.get_sequences(fasta.FastaFile.read(fetched))
    assert len(sequences) == 2
def plot_high_contrast_scheme_alignment():
    """
    Generate a color scheme with a ``--contrast`` value of 2000 and
    display the example alignment with it.

    Returns the result of ``show_alignment()`` for the generated
    scheme file.
    """
    output_path = biotite.temp_file("json")
    generator_args = ["--seed", "0"]
    generator_args += ["--contrast", "2000"]
    generator_args += ["--lmin", "60", "--lmax", "75"]
    generator_args += ["-f", output_path]
    gecli.main(args=generator_args)
    return show_alignment(output_path)
def plot_main_example_alignment():
    """
    Generate a color scheme based on the ``BLOSUM62`` matrix and
    display the example alignment with it.

    Returns the result of ``show_alignment()`` for the generated
    scheme file.
    """
    output_path = biotite.temp_file("json")
    generator_args = ["--seed", "0"]
    generator_args += ["--matrix", "BLOSUM62"]
    generator_args += ["--lmin", "60", "--lmax", "75"]
    generator_args += ["-f", output_path]
    gecli.main(args=generator_args)
    return show_alignment(output_path)
def test_hbond_structure(pdb_id):
    """
    Check that ``struc.hbond()`` finds the same hydrogen bond triplets
    as ``mdtraj.baker_hubbard()`` for the structure with the given
    PDB ID.
    """
    source_path = join(data_dir("structure"), pdb_id + ".mmtf")
    atoms = load_structure(source_path)
    # Restrict to amino acids for consistency
    # with bonded hydrogen detection in MDTraj
    atoms = atoms[..., struc.filter_amino_acids(atoms)]

    # For consistency with MDTraj 'S' cannot be acceptor element
    # https://github.com/mdtraj/mdtraj/blob/master/mdtraj/geometry/hbond.py#L365
    hbond_result = struc.hbond(atoms, acceptor_elements=("O", "N"))
    if isinstance(atoms, struc.AtomArrayStack):
        # For a stack a pair is returned; only the triplets are compared
        triplets, _ = hbond_result
    else:
        triplets = hbond_result

    # Write a new PDB file, so that MDTraj sees the same
    # inscode/altloc treatment
    pdb_path = biotite.temp_file("pdb")
    save_structure(pdb_path, atoms)

    # Reference calculation with MDTraj
    import mdtraj
    trajectory = mdtraj.load(pdb_path)
    reference = mdtraj.baker_hubbard(trajectory, freq=0, periodic=False)

    # The order may differ between both packages -> compare as sets
    found = {tuple(triplet) for triplet in triplets}
    expected = {tuple(triplet) for triplet in reference}
    assert found == expected
def test_pdb_to_gro(path, single_model):
    """
    Check that a structure read from a PDB file keeps its length,
    selected annotations and coordinates when it is written to and
    read back from a GRO file.

    If `single_model` is true, only model 1 is used, otherwise the
    model argument is ``None``.
    """
    if single_model:
        model = 1
    else:
        model = None

    # Load the reference structure from the PDB file
    source = pdb.PDBFile()
    source.read(path)
    original = source.get_structure(model=model)

    # Write it as GRO file
    gro_path = biotite.temp_file("gro")
    writer = gro.GROFile()
    writer.set_structure(original)
    writer.write(gro_path)

    # Read the GRO file again
    reader = gro.GROFile()
    reader.read(gro_path)
    reloaded = reader.get_structure(model=model)

    assert original.array_length() == reloaded.array_length()

    for category in ("res_id", "res_name", "atom_name"):
        expected = original.get_annotation(category).tolist()
        actual = reloaded.get_annotation(category).tolist()
        assert expected == actual

    # Tolerance accounts for rounding in the PDB -> GRO conversion
    # (A -> nm)
    expected_coord = original.coord.flatten().tolist()
    actual_coord = reloaded.coord.flatten().tolist()
    assert expected_coord == approx(actual_coord, abs=1e-2)
def test_dihedral_backbone_result(file_name):
    """
    Compare the backbone dihedral angles from
    ``struc.dihedral_backbone()`` with the phi, psi and omega angles
    computed by MDTraj, chain by chain.
    """
    import mdtraj

    source = mmtf.MMTFFile()
    source.read(file_name)
    atoms = mmtf.get_structure(source, model=1)
    atoms = atoms[struc.filter_amino_acids(atoms)]

    for chain in struc.chain_iter(atoms):
        print("Chain: ", chain.chain_id[0])
        discontinuities = struc.check_id_continuity(chain)
        if len(discontinuities) != 0:
            # Discontinuous chains are not tested
            # NOTE(review): this ends the whole test, so chains after
            # the first discontinuous one are not checked either --
            # confirm whether 'continue' was intended
            return

        phi, psi, omega = struc.dihedral_backbone(chain)

        # MDTraj computes its reference from a PDB file of the chain
        chain_path = biotite.temp_file("pdb")
        strucio.save_structure(chain_path, chain)
        trajectory = mdtraj.load(chain_path)
        _, phi_frames = mdtraj.compute_phi(trajectory)
        _, psi_frames = mdtraj.compute_psi(trajectory)
        _, omega_frames = mdtraj.compute_omega(trajectory)
        # Only the first frame is compared
        expected_phi = phi_frames[0]
        expected_psi = psi_frames[0]
        expected_omega = omega_frames[0]

        # The first phi and the last psi/omega value of the tested
        # arrays are excluded from the comparison
        assert phi[1:] == pytest.approx(expected_phi, abs=1e-5, rel=5e-3)
        assert psi[:-1] == pytest.approx(expected_psi, abs=1e-5, rel=5e-3)
        assert omega[:-1] == pytest.approx(expected_omega, abs=1e-5, rel=5e-3)
def plot_high_saturation_scheme_alignment():
    """
    Generate a color scheme with ``--smin`` set to 30 and display the
    example alignment with it.

    Returns the result of ``show_alignment()`` for the generated
    scheme file.
    """
    output_path = biotite.temp_file("json")
    generator_args = ["--seed", "0"]
    generator_args += ["--smin", "30"]
    generator_args += ["--lmin", "55", "--lmax", "75"]
    generator_args += ["-f", output_path]
    gecli.main(args=generator_args)
    return show_alignment(output_path)
def test_conversion_highlevel(path):
    """
    Round-trip test for the high-level GenBank interface:
    Locus and annotated sequence are read from a GenBank file,
    written to a new file and read again.
    Both readings must yield equal data.
    """
    # The last two characters of the path select the sequence format
    seq_format = path[-2:]

    source = gb.GenBankFile()
    source.read(path)
    expected_locus = gb.get_locus(source)
    expected_annot_seq = gb.get_annotated_sequence(source, format=seq_format)

    # Transfer the data into a fresh file object and write it
    written = gb.GenBankFile()
    gb.set_locus(written, *expected_locus)
    gb.set_annotated_sequence(written, expected_annot_seq)
    out_path = biotite.temp_file("gb")
    written.write(out_path)

    # Read the written file again
    reloaded = gb.GenBankFile()
    reloaded.read(out_path)
    actual_locus = gb.get_locus(reloaded)
    actual_annot_seq = gb.get_annotated_sequence(reloaded, format=seq_format)

    assert actual_locus == expected_locus
    assert actual_annot_seq.sequence == expected_annot_seq.sequence
    assert actual_annot_seq.annotation == expected_annot_seq.annotation
    assert actual_annot_seq.sequence_start == expected_annot_seq.sequence_start
def plot_no_green_scheme_alignment():
    """
    Generate a color scheme with ``--amin`` set to 0 and display the
    example alignment with it.

    Returns the result of ``show_alignment()`` for the generated
    scheme file.
    """
    output_path = biotite.temp_file("json")
    generator_args = ["--seed", "0"]
    generator_args += ["--amin", "0"]
    generator_args += ["--lmin", "50", "--lmax", "80"]
    generator_args += ["-f", output_path]
    gecli.main(args=generator_args)
    return show_alignment(output_path)
def test_conversion_highlevel(path):
    """
    Round-trip test for the high-level GFF3 interface:
    The annotation is read from a GFF3 file, written to a new file
    and read again; both annotations must be equal.
    The phases of all ``CDS`` entries are compared in addition,
    since the 'phase' is not part of a `Feature` object.
    """
    source = gff.GFFFile.read(join(data_dir("sequence"), path))
    expected_annot = gff.get_annotation(source)
    expected_phases = [
        phase
        for _, _, feature_type, _, _, _, _, phase, _ in source
        if feature_type == "CDS"
    ]

    # Write the annotation into a new file
    written = gff.GFFFile()
    gff.set_annotation(written, expected_annot)
    out_path = biotite.temp_file("gff3")
    written.write(out_path)

    # Read the written file again
    reloaded = gff.GFFFile.read(out_path)
    actual_annot = gff.get_annotation(reloaded)
    actual_phases = [
        phase
        for _, _, feature_type, _, _, _, _, phase, _ in reloaded
        if feature_type == "CDS"
    ]

    assert expected_annot == actual_annot
    assert actual_phases == expected_phases
def test_array_conversion(format):
    """
    Round-trip test for the trajectory file classes:
    A structure is read from the ``1l2y`` trajectory of the given
    `format`, written to a temporary file of the same format and read
    again.
    Bonds, annotation categories, box and coordinates must be
    preserved.
    """
    # The first model of the MMTF file serves as template
    template = strucio.load_structure(join(data_dir, "1l2y.mmtf"))[0]
    # Add fake box
    template.box = np.diag([1, 2, 3])

    # Select the file class belonging to the format
    if format == "trr":
        file_class = trr.TRRFile
    elif format == "xtc":
        file_class = xtc.XTCFile
    elif format == "tng":
        file_class = tng.TNGFile
    elif format == "dcd":
        file_class = dcd.DCDFile
    elif format == "netcdf":
        file_class = netcdf.NetCDFFile

    # Reference structure from the stored trajectory
    source = file_class()
    source.read(join(data_dir, f"1l2y.{format}"))
    expected = source.get_structure(template)

    # Write the reference structure to a temporary file
    written = file_class()
    written.set_structure(expected)
    out_path = biotite.temp_file(format)
    written.write(out_path)

    # Read the temporary file again
    reloaded = file_class()
    reloaded.read(out_path)
    actual = reloaded.get_structure(template)

    assert expected.bonds == actual.bonds
    assert expected.equal_annotation_categories(actual)
    assert expected.box == pytest.approx(actual.box)
    assert expected.coord == pytest.approx(actual.coord, abs=1e-2)
def create(pdb_id, directory, include_gro):
    """
    Create the structure files for the given PDB ID in `directory`.

    The ``pdb``, ``cif`` and ``mmtf`` files are fetched from the RCSB,
    overwriting existing files.
    An ``npz`` file is saved from the loaded PDB structure and, if
    `include_gro` is true, a ``gro`` file is created by calling the
    external ``editconf`` program.
    If loading the PDB file raises an ``InvalidFileError``, neither
    the ``npz`` nor the ``gro`` file is created.
    """
    # Create *.pdb, *.cif and *.mmtf
    for file_format in ("pdb", "cif", "mmtf"):
        rcsb.fetch(pdb_id, file_format, directory, overwrite=True)

    pdb_path = join(directory, pdb_id + ".pdb")
    try:
        structure = strucio.load_structure(pdb_path)
    except biotite.InvalidFileError:
        # Structure probably contains multiple models with different
        # number of atoms
        # -> Cannot load AtomArrayStack
        # -> Skip writing GRO and NPZ file
        return

    # Create *.npz file
    strucio.save_structure(join(directory, pdb_id + ".npz"), structure)

    if not include_gro:
        return

    # Create *.gro file using GROMACS
    # Clean PDB file -> remove inscodes and altlocs
    cleaned_path = biotite.temp_file("pdb")
    strucio.save_structure(cleaned_path, structure)
    # Run GROMACS for file conversion; its output is discarded
    command = [
        "editconf",
        "-f", cleaned_path,
        "-o", join(directory, pdb_id + ".gro"),
    ]
    subprocess.run(
        command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
    )
def plot_constrained_scheme_alignment():
    """
    Generate a color scheme with two ``-c`` options, one for ``A``
    and one for ``W``, and display the example alignment with it.

    Returns the result of ``show_alignment()`` for the generated
    scheme file.
    """
    output_path = biotite.temp_file("json")
    generator_args = ["--seed", "0"]
    generator_args += ["-c", "A", "70", "0", "0"]
    generator_args += ["-c", "W", "70", "-10", "-45"]
    generator_args += ["--lmin", "60", "--lmax", "75"]
    generator_args += ["-f", output_path]
    gecli.main(args=generator_args)
    return show_alignment(output_path)
def plot_show_example():
    """
    Run the color scheme generator with the ``--show-example`` option.

    The global random generator is seeded with 0 beforehand and the
    scheme itself is written to a temporary JSON file.

    Returns
    -------
    figure
        The figure that is current in ``matplotlib.pyplot`` after the
        generator has finished.
    """
    random.seed(0)
    output_path = biotite.temp_file("json")
    generator_args = ["--show-example"]
    generator_args += ["--smin", "30"]
    generator_args += ["--lmin", "60", "--lmax", "70"]
    generator_args += ["-f", output_path]
    gecli.main(args=generator_args)
    return plt.gcf()
def test_fetch_single_file():
    """
    Fetch two protein entries into one temporary FASTA file and check
    that exactly two sequences can be read from it.
    """
    fetched = entrez.fetch_single_file(
        ["1L2Y_A", "3O5R_A"],
        biotite.temp_file("fa"),
        "protein",
        "fasta",
    )
    merged = fasta.FastaFile()
    merged.read(fetched)
    sequences = fasta.get_sequences(merged)
    assert len(sequences) == 2
def test_id_overflow():
    """
    Test writing an atom array with 100000 atoms/residues to a PDB
    file.

    Without ``hybrid36`` a ``UserWarning`` is expected, the file must
    still be readable and the atom ID in the last line must be ``1``.
    With ``hybrid36=True`` no warning may be emitted and the last line
    must contain the hybrid-36 IDs ``A0000`` and ``BXG0``.
    """
    # pytest.warns(None) is deprecated since pytest 7 and an error in
    # pytest 8, hence the standard library is used to record warnings
    import warnings

    # Create an atom array >= 100k atoms
    length = 100000
    a = struc.AtomArray(length)
    a.coord = np.zeros(a.coord.shape)
    a.chain_id = np.full(length, "A")
    # Create residue IDs over 10000
    a.res_id = np.arange(1, length + 1)
    a.res_name = np.full(length, "GLY")
    a.hetero = np.full(length, False)
    a.atom_name = np.full(length, "CA")
    a.element = np.full(length, "C")

    # Write stack to pdb file and make sure a warning is thrown
    with pytest.warns(UserWarning):
        tmp_file_name = biotite.temp_file(".pdb")
        tmp_pdb_file = pdb.PDBFile()
        tmp_pdb_file.set_structure(a)
        tmp_pdb_file.write(tmp_file_name)

    # Assert file can be read properly
    a2 = io.load_structure(tmp_file_name)
    assert a2.array_length() == a.array_length()

    # Manually check if the written atom id is correct
    with open(tmp_file_name) as output:
        last_line = output.readlines()[-1]
    atom_id = int(last_line.split()[1])
    assert atom_id == 1

    # Write stack as hybrid-36 pdb file: no warning should be thrown
    with warnings.catch_warnings(record=True) as record:
        # Record every warning, regardless of the active filters
        warnings.simplefilter("always")
        tmp_file_name = biotite.temp_file(".pdb")
        tmp_pdb_file = pdb.PDBFile()
        tmp_pdb_file.set_structure(a, hybrid36=True)
        tmp_pdb_file.write(tmp_file_name)
    assert len(record) == 0

    # Manually check if the output is written as correct hybrid-36
    with open(tmp_file_name) as output:
        last_line = output.readlines()[-1]
    atom_id = last_line.split()[1]
    assert atom_id == "A0000"
    # The first character of this column is skipped
    res_id = last_line.split()[4][1:]
    assert res_id == "BXG0"
def test_numpy_objects():
    """
    Test whether the Msgpack encoder is able to handle NumPy values
    (e.g. np.float32) properly.

    The test passes if writing the file does not raise an exception.
    """
    mmtf_file = mmtf.MMTFFile()
    mmtf_file["A float"] = np.float32(42.0)
    mmtf_file["A list"] = [np.int64(1), np.int64(2), np.int64(3)]
    # 'np.bool_' is the NumPy boolean scalar type.
    # The former 'np.bool' was merely an alias of the builtin 'bool'
    # (so no NumPy value was tested) and was removed in NumPy 1.24.
    mmtf_file["A dictionary"] = {"a": np.bool_(True), "b": np.bool_(False)}
    mmtf_file.write(biotite.temp_file("mmtf"))
def test_conversion_lowlevel(path):
    """
    Round-trip test for the low-level GFF3 interface:
    All entries of a GFF3 file are appended to a new file object,
    which is written to a temporary file and read again.
    The entries read back must equal the original ones.
    """
    source = gff.GFFFile.read(join(data_dir("sequence"), path))
    expected_entries = list(source)

    # Copy every entry into a new file object
    written = gff.GFFFile()
    for entry in expected_entries:
        written.append(*entry)
    out_path = biotite.temp_file("gff3")
    written.write(out_path)

    # Read the written file again
    reloaded = gff.GFFFile.read(out_path)
    actual_entries = list(reloaded)

    assert actual_entries == expected_entries
def create(pdb_id, directory, include_gro):
    """
    Create the structure files for the given PDB ID in `directory`.

    The ``pdb``, ``cif`` and ``mmtf`` files are fetched from the RCSB.
    If `include_gro` is true, a ``gro`` file is created in addition
    by calling the external ``gmx editconf`` program.
    """
    # Create *.pdb, *.cif and *.mmtf
    for file_format in ("pdb", "cif", "mmtf"):
        rcsb.fetch(pdb_id, file_format, directory)

    if not include_gro:
        return

    # Create *.gro file using GROMACS
    # Clean PDB file -> remove inscodes and altlocs
    structure = strucio.load_structure(join(directory, pdb_id + ".pdb"))
    cleaned_path = biotite.temp_file("pdb")
    strucio.save_structure(cleaned_path, structure)
    # Run GROMACS for file conversion; its output is discarded
    command = [
        "gmx", "editconf",
        "-f", cleaned_path,
        "-o", join(directory, pdb_id + ".gro"),
    ]
    subprocess.run(
        command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
    )
def test_conversion_lowlevel(path):
    """
    Round-trip test for the low-level GenBank interface:
    All fields of a GenBank file are appended to a new file object,
    which is written to a temporary file and read again.
    The fields read back must equal the original ones.
    """
    source = gb.GenBankFile()
    source.read(path)
    expected_fields = list(source)

    # Copy every field into a new file object
    written = gb.GenBankFile()
    for field_name, field_content, field_subfields in expected_fields:
        written.append(field_name, field_content, field_subfields)
    out_path = biotite.temp_file("gb")
    written.write(out_path)

    # Read the written file again
    reloaded = gb.GenBankFile()
    reloaded.read(out_path)
    actual_fields = list(reloaded)

    assert actual_fields == expected_fields
def test_conversion(chars_per_line):
    """
    Round-trip test for FASTQ files:
    The content of ``random.fastq`` is copied into a new file object,
    which is written to a temporary file.
    The sequences and scores read back from that temporary file must
    equal the original ones.
    """
    path = os.path.join(data_dir, "random.fastq")
    file1 = fastq.FastqFile(offset=33, chars_per_line=chars_per_line)
    file1.read(path)
    ref_content = dict(file1.items())

    file2 = fastq.FastqFile(offset=33, chars_per_line=chars_per_line)
    for identifier, (sequence, scores) in ref_content.items():
        file2[identifier] = sequence, scores
    temp_file_name = biotite.temp_file("fastq")
    file2.write(temp_file_name)

    file3 = fastq.FastqFile(offset=33, chars_per_line=chars_per_line)
    # Read the file that was just written, not the original input,
    # otherwise the written output would never be checked
    file3.read(temp_file_name)
    content = dict(file3.items())

    for identifier, (ref_sequence, ref_scores) in ref_content.items():
        sequence, scores = content[identifier]
        assert ref_sequence == sequence
        assert np.array_equal(ref_scores, scores)
def test_guess_elements():
    """
    Check that the elements of ``1l2y.pdb`` are restored when the
    structure is written to a PDB file with emptied element
    annotation and read again.
    """
    # Reference structure from a valid PDB file
    source_path = join(data_dir("structure"), "1l2y.pdb")
    reference = pdb.PDBFile.read(source_path).get_structure()

    # Copy of the structure with all elements removed
    stripped = reference.copy()
    stripped.element[:] = ''

    # Write the element-free structure to a temporary file
    out_path = biotite.temp_file(".pdb")
    writer = pdb.PDBFile()
    writer.set_structure(stripped)
    writer.write(out_path)

    # Reading the file again yields the guessed elements
    guessed = pdb.PDBFile.read(out_path).get_structure()
    assert guessed.element.tolist() == reference.element.tolist()
def test_gro_no_box():
    """
    The .gro format requires valid box parameters at the end of each
    model.
    Nevertheless, a structure that was saved without a box must not
    have a box assigned after it is read from the .gro file.
    """
    # Structure consisting of a single atom, no box is set
    single_atom = Atom(
        [1, 2, 3], atom_name="CA", element="C", res_name="X", res_id=1
    )
    atoms = array([single_atom])

    # Save as .gro file
    gro_path = biotite.temp_file(".gro")
    io.save_structure(gro_path, atoms)

    # Load the file again
    loaded = gro.GROFile.read(gro_path).get_structure()

    # There must be no box (e.g. one with 0 dimension)
    assert loaded.box is None
def test_gro_id_overflow():
    """
    Check that a structure with 100005 atoms, each in its own residue,
    keeps its number of atoms when written to and read from a .gro
    file.
    """
    # Oversized structure: atom_id > 100000 and res_id > 10000
    num_atoms = 100005
    atom_list = []
    for index in range(num_atoms):
        atom_list.append(Atom(
            [1, 2, 3],
            atom_name="CA", element="C", res_name="X", res_id=index + 1
        ))
    atoms = array(atom_list)
    # Integer 3x3 identity matrix as box
    atoms.box = np.eye(3, dtype=int)

    # Save as .gro file
    gro_path = biotite.temp_file(".gro")
    io.save_structure(gro_path, atoms)

    # Load the file again
    loaded = gro.GROFile.read(gro_path).get_structure()

    assert loaded.array_length() == num_atoms
def test_array_conversion(path, single_model):
    """
    Round-trip test for MMTF files:
    A structure including bonds is read from an MMTF file, written to
    a temporary MMTF file and read again.
    Annotations, coordinates, bonds and, if present, the box must be
    preserved.

    If `single_model` is true, only model 1 is used, otherwise the
    model argument is ``None``.
    """
    if single_model:
        model = 1
    else:
        model = None

    # Reference structure
    source = mmtf.MMTFFile()
    source.read(path)
    original = mmtf.get_structure(source, model=model, include_bonds=True)

    # Write the structure into a temporary file
    written = mmtf.MMTFFile()
    mmtf.set_structure(written, original)
    out_path = biotite.temp_file("mmtf")
    written.write(out_path)

    # Read the temporary file again
    reloaded_file = mmtf.MMTFFile()
    reloaded_file.read(out_path)
    reloaded = mmtf.get_structure(
        reloaded_file, model=model, include_bonds=True
    )

    for category in original.get_annotation_categories():
        expected = original.get_annotation(category).tolist()
        actual = reloaded.get_annotation(category).tolist()
        assert expected == actual

    expected_coord = original.coord.flatten().tolist()
    actual_coord = reloaded.coord.flatten().tolist()
    assert expected_coord == approx(actual_coord, abs=1e-3)

    assert original.bonds == reloaded.bonds

    # The box is only compared if the original structure has one
    if original.box is not None:
        assert np.allclose(original.box, reloaded.box)
r"$\sigma^{38}$": "rpoS", }) # Find SwissProt entries for these genes in NCBI Entrez protein database uids = [] for name, gene in genes.items(): query = entrez.SimpleQuery(gene, "Gene Name") \ & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") \ & entrez.SimpleQuery("Escherichia coli K-12", "Organism") ids = entrez.search(query, "protein") # Only one entry per gene in E. coli K-12 is expected assert len(ids) == 1 uids += ids # Download corresponding GenBank files as single, merged file file_name = entrez.fetch_single_file(uids, biotite.temp_file("gb"), "protein", ret_type="gb") # Array that will hold for each of the genes and each of the 4 domains # the first and last position # The array is initally filled with -1, as the value -1 will indicate # that the domain does not exist in the sigma factor domain_pos = np.full((len(genes), 4, 2), -1, dtype=int) # Array that will hold the total sequence length of each sigma factor seq_lengths = np.zeros(len(genes), dtype=int) # Read the merged file containing multiple GenBank entries multi_file = gb.MultiFile() multi_file.read(file_name) # Iterate over each GenBank entry for i, gb_file in enumerate(multi_file):
It is basically very similar to using normal functions.

In the following sections you will get an overview of the mentioned
subpackages, so go and grab some tea and cookies and let us begin.

Preliminary note
----------------

The files used in this tutorial will be stored in a temporary directory.
The top level package :mod:`biotite` provides functionality to create a
temporary directory, called ``.biotitetemp`` in your current working
directory.
You can either obtain the path to this directory via :func:`temp_dir`
or directly create an unambiguous file name in this directory using
:func:`temp_file`.

At the end of the session the temporary directory and all its contents
will be automatically deleted, so make sure to put the files you want
to keep somewhere else.
"""

from os.path import relpath
import biotite

# Create temporary directory
dir_path = biotite.temp_dir()
print(relpath(dir_path))
# Get a path to a temporary FASTA file
# This would also create the temporary directory,
# if it was not created, yet
file_path = biotite.temp_file("fasta")
print(relpath(file_path))
import matplotlib.pyplot as plt from matplotlib.gridspec import GridSpec import biotite import biotite.sequence as seq import biotite.sequence.io.fasta as fasta import biotite.sequence.align as align import biotite.sequence.graphics as graphics import biotite.database.entrez as entrez # Generate example alignment # (the same as in the bacterial luciferase example) query = entrez.SimpleQuery("luxA", "Gene Name") \ & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") uids = entrez.search(query, db_name="protein") file_name = entrez.fetch_single_file(uids, biotite.temp_file("fasta"), db_name="protein", ret_type="fasta") fasta_file = fasta.FastaFile.read(file_name) sequences = [seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()] matrix = align.SubstitutionMatrix.std_protein_matrix() alignment, order, _, _ = align.align_multiple(sequences, matrix) # Order alignment according to the guide tree alignment = alignment[:, order] alignment = alignment[220:300] # Get color scheme names alphabet = seq.ProteinSequence.alphabet schemes = [ "rainbow", "clustalx", "flower", "blossom", "spring", "wither", "autumn", "sunset", "ocean", "zappo", "taylor", "buried", "hydrophobicity",