def test_extra_fields():
    """
    Check that optional annotations survive a PDBx write/read round trip.

    The structure is parsed from ``1l2y.cif`` with the extra fields
    ``atom_id``, ``b_factor``, ``occupancy`` and ``charge``, written
    into a fresh :class:`PDBxFile` and parsed again.
    The annotations are compared one by one first, so that a failure
    points at the offending field, before both structures are compared
    as a whole.
    """
    extra_fields = ["atom_id", "b_factor", "occupancy", "charge"]

    path = join(data_dir("structure"), "1l2y.cif")
    pdbx_file = pdbx.PDBxFile.read(path)
    stack1 = pdbx.get_structure(pdbx_file, extra_fields=extra_fields)

    # Write the parsed structure into an empty file and parse it again
    pdbx_file = pdbx.PDBxFile()
    pdbx.set_structure(pdbx_file, stack1, data_block="test")
    stack2 = pdbx.get_structure(pdbx_file, extra_fields=extra_fields)

    assert stack1.ins_code.tolist() == stack2.ins_code.tolist()
    assert stack1.atom_id.tolist() == stack2.atom_id.tolist()
    # 'b_factor' and 'occupancy' are compared approximately
    assert stack1.b_factor.tolist() == approx(stack2.b_factor.tolist())
    assert stack1.occupancy.tolist() == approx(stack2.occupancy.tolist())
    assert stack1.charge.tolist() == stack2.charge.tolist()
    assert stack1 == stack2
def test_extra_fields():
    """
    Round-trip a structure with optional annotations through a
    :class:`PDBxFile` and expect the re-read structure to be equal to
    the one that was written.
    """
    requested_fields = ["atom_id", "b_factor", "occupancy", "charge"]

    source_file = pdbx.PDBxFile()
    source_file.read(join(data_dir, "1l2y.cif"))
    stack1 = pdbx.get_structure(source_file, extra_fields=requested_fields)

    # Write into a fresh file object and parse the result again
    roundtrip_file = pdbx.PDBxFile()
    pdbx.set_structure(roundtrip_file, stack1, data_block="test")
    stack2 = pdbx.get_structure(roundtrip_file, extra_fields=requested_fields)

    assert stack1 == stack2
def test_conversion(path, single_model):
    """
    Parse a structure from a PDBx file, write it into a new
    :class:`PDBxFile` and parse it again.
    All annotations and the coordinates must be unchanged.
    """
    if single_model:
        model = 1
    else:
        model = None

    source_file = pdbx.PDBxFile()
    source_file.read(path)
    array1 = pdbx.get_structure(source_file, model=model)

    roundtrip_file = pdbx.PDBxFile()
    pdbx.set_structure(roundtrip_file, array1, data_block="test")
    array2 = pdbx.get_structure(roundtrip_file, model=model)

    for category in array1.get_annotation_categories():
        ref_annotation = array1.get_annotation(category).tolist()
        test_annotation = array2.get_annotation(category).tolist()
        assert ref_annotation == test_annotation
    assert array1.coord.tolist() == array2.coord.tolist()
def test_conversion(path, model):
    """
    Parse a structure from a PDBx file, write it into a new
    :class:`PDBxFile` and parse it again.
    Box, bonds, annotations and coordinates must be unchanged.
    """
    source_file = pdbx.PDBxFile.read(path)

    try:
        array1 = pdbx.get_structure(source_file, model=model)
    except biotite.InvalidFileError:
        if model is not None:
            raise
        # The file cannot be parsed into an AtomArrayStack,
        # as the models contain different numbers of atoms
        # -> skip this test case
        return

    roundtrip_file = pdbx.PDBxFile()
    pdbx.set_structure(roundtrip_file, array1, data_block="test")
    array2 = pdbx.get_structure(roundtrip_file, model=model)

    if array1.box is not None:
        assert np.allclose(array1.box, array2.box)
    assert array1.bonds == array2.bonds
    for category in array1.get_annotation_categories():
        ref_annotation = array1.get_annotation(category).tolist()
        test_annotation = array2.get_annotation(category).tolist()
        assert ref_annotation == test_annotation
    assert array1.coord.tolist() == array2.coord.tolist()
def test_conversion(path, model):
    """
    Parse a structure from a PDBx file, write it into a new
    :class:`PDBxFile`, drop the ``auth_atom_id`` column from the written
    ``atom_site`` category and parse the file again.
    Box, bonds, annotations and coordinates must be unchanged.
    """
    source_file = pdbx.PDBxFile.read(path)

    try:
        array1 = pdbx.get_structure(source_file, model=model)
    except biotite.InvalidFileError:
        if model is not None:
            raise
        # The file cannot be parsed into an AtomArrayStack,
        # as the models contain different numbers of atoms
        # -> skip this test case
        return

    roundtrip_file = pdbx.PDBxFile()
    pdbx.set_structure(roundtrip_file, array1, data_block="test")

    # Drop one optional 'auth' column,
    # to test the fallback to the corresponding 'label' fields
    atom_site = roundtrip_file.get_category("atom_site", "test")
    atom_site.pop("auth_atom_id")
    roundtrip_file.set_category("atom_site", atom_site, "test")

    array2 = pdbx.get_structure(roundtrip_file, model=model)

    assert array1.array_length() > 0
    if array1.box is not None:
        assert np.allclose(array1.box, array2.box)
    assert array1.bonds == array2.bonds
    for category in array1.get_annotation_categories():
        ref_annotation = array1.get_annotation(category).tolist()
        test_annotation = array2.get_annotation(category).tolist()
        assert ref_annotation == test_annotation
    assert array1.coord.tolist() == array2.coord.tolist()
def create_bond_dict(components_pdbx_file_path, msgpack_file_path):
    """
    Collect the bonds of each block in a components PDBx file and dump
    them into a MessagePack file.

    For every block with a ``chem_comp_bond`` category a dictionary
    mapping ``(atom_id_1, atom_id_2)`` to the ``BOND_ORDERS`` entry of
    the bond's ``value_order`` is stored under the block name.
    Blocks without this category are left out.
    """
    components_file = pdbx.PDBxFile()
    components_file.read(components_pdbx_file_path)

    bonds_per_component = {}
    for comp_name in components_file.get_block_names():
        print(comp_name)
        bond_category = components_file.get_category(
            "chem_comp_bond", block=comp_name
        )
        if bond_category is None:
            # No bond info for this compound
            continue

        if isinstance(bond_category["comp_id"], str):
            # Single string -> single bond
            bond_records = [(
                bond_category["atom_id_1"],
                bond_category["atom_id_2"],
                bond_category["value_order"],
            )]
        else:
            # Looped values -> multiple bonds
            bond_records = zip(
                bond_category["atom_id_1"],
                bond_category["atom_id_2"],
                bond_category["value_order"],
            )

        comp_bonds = {}
        for first_atom, second_atom, order in bond_records:
            comp_bonds[(first_atom, second_atom)] = BOND_ORDERS[order]
        bonds_per_component[comp_name] = comp_bonds

    with open(msgpack_file_path, "wb") as msgpack_file:
        msgpack.dump(bonds_per_component, msgpack_file)
def test_get_assembly(single_model):
    """
    Verify for each assembly listed in ``1f2n.cif`` that the structure
    built by :func:`get_assembly()` contains as many peptide chains as
    given in the ``_pdbx_struct_assembly.oligomeric_count`` field.
    In addition, the type of the returned structure must match the
    model selection.
    """
    model = 1 if single_model else None
    expected_type = struc.AtomArray if single_model else struc.AtomArrayStack

    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(join(data_dir, "1f2n.cif"))

    assembly_category = pdbx_file.get_category(
        "pdbx_struct_assembly", expect_looped=True
    )
    assembly_ids = assembly_category["id"]
    oligomer_counts = assembly_category["oligomeric_count"]

    # Test each available assembly
    for assembly_id, ref_oligomer_count in zip(assembly_ids, oligomer_counts):
        assembly = pdbx.get_assembly(
            pdbx_file, assembly_id=assembly_id, model=model
        )
        amino_acid_mask = struc.filter_amino_acids(assembly)
        protein_assembly = assembly[..., amino_acid_mask]
        test_oligomer_count = struc.get_chain_count(protein_assembly)

        assert isinstance(assembly, expected_type)
        assert test_oligomer_count == int(ref_oligomer_count)
def create_dict(components_pdbx_file_path, msgpack_file_path,
                subcategory, expected_type):
    """
    Extract one field of the ``chem_comp`` category from each block of
    a components PDBx file and dump the values into a MessagePack file.

    The value of `subcategory` is converted with `expected_type` and
    stored under the block name.
    ``None`` is stored instead, if the category is missing, cannot be
    parsed or the conversion raises a :class:`ValueError`.
    A progress indicator is printed while the blocks are processed.
    """
    components_file = pdbx.PDBxFile()
    components_file.read(components_pdbx_file_path)
    block_names = components_file.get_block_names()
    n_blocks = len(block_names)

    values = {}
    for position, block_name in enumerate(block_names, start=1):
        percentage = position / n_blocks * 100
        print(f"{percentage:4.1f} %", end="\r")

        try:
            chem_comp = components_file.get_category(
                "chem_comp", block=block_name
            )
        except ValueError:
            # The 'chem_comp' category may contain unparsable names
            # with wrong quote escaping
            # In this case the PDBx file parser raises an Exception
            chem_comp = None

        if chem_comp is None:
            # No or erroneous info for this compound
            values[block_name] = None
            continue

        try:
            values[block_name] = expected_type(chem_comp[subcategory])
        except ValueError:
            # Unparsable data, e.g. '?' as float
            values[block_name] = None
    print()

    with open(msgpack_file_path, "wb") as msgpack_file:
        msgpack.dump(values, msgpack_file)
def test_parsing(category, key, exp_value):
    """
    Look up a single field of a category in ``1l2y.cif`` and compare it
    to the expected value.
    Array values are converted into lists before the comparison.
    """
    cif_file = pdbx.PDBxFile()
    cif_file.read(join(data_dir, "1l2y.cif"))

    parsed = cif_file[category][key]
    if isinstance(parsed, np.ndarray):
        parsed = parsed.tolist()
    assert parsed == exp_value
def test_superimposition_array(path):
    """
    Rotate and translate a copy of a structure and superimpose it onto
    the original structure using the ``CA`` atoms.
    The RMSD must be approximately zero, both for the fitted structure
    and for the structure obtained by applying the returned
    transformation.
    """
    cif_file = pdbx.PDBxFile()
    cif_file.read(path)
    fixed = pdbx.get_structure(cif_file, model=1)

    # Displace a copy of the structure
    rotated = struc.rotate(fixed.copy(), (1, 2, 3))
    mobile = struc.translate(rotated, (1, 2, 3))

    ca_mask = (mobile.atom_name == "CA")
    fitted, transformation = struc.superimpose(fixed, mobile, ca_mask)
    assert struc.rmsd(fixed, fitted) == pytest.approx(0)

    # Applying the transformation separately must give the same result
    refitted = struc.superimpose_apply(mobile, transformation)
    assert struc.rmsd(fixed, refitted) == pytest.approx(0)
def test_pdbx_consistency(path, single_model):
    """
    Parse the same structure from a PDB file and from the mmCIF file
    with the same base name.
    Annotations and coordinates of both structures must be equal.
    """
    if single_model:
        model = 1
    else:
        model = None
    base_name, _ = splitext(path)
    cif_path = base_name + ".cif"

    pdb_file = pdb.PDBFile()
    pdb_file.read(path)
    a1 = pdb_file.get_structure(model=model)

    cif_file = pdbx.PDBxFile()
    cif_file.read(cif_path)
    a2 = pdbx.get_structure(cif_file, model=model)

    for category in a1.get_annotation_categories():
        pdb_annotation = a1.get_annotation(category).tolist()
        cif_annotation = a2.get_annotation(category).tolist()
        assert pdb_annotation == cif_annotation
    assert a1.coord.tolist() == a2.coord.tolist()
def test_unequal_lengths():
    """
    Setting a category whose columns have equal lengths must work,
    whereas columns of unequal lengths must raise a :class:`ValueError`.
    """
    pdbx_file = pdbx.PDBxFile()

    equal_columns = {
        "foo1": ["1", "2", "3"],
        "foo2": ["1", "2", "3"],
    }
    pdbx_file.set_category("test", equal_columns, block="test_block")

    # Arrays have unequal lengths -> invalid
    unequal_columns = {
        "foo1": ["1", "2", "3"],
        "foo2": ["1", "2", "3", "4"],
    }
    with pytest.raises(ValueError):
        pdbx_file.set_category("test", unequal_columns, block="test_block")
def test_pdbx_consistency(path, single_model):
    """
    Check that an MMTF file and the mmCIF file with the same base name
    yield the same structure.

    Parameters
    ----------
    path : str
        Path to the MMTF file.
        The mmCIF file is looked up under the same path with the
        ``.cif`` suffix.
    single_model : bool
        If true, only the first model is parsed (``model=1``),
        otherwise no model is selected (``model=None``).
    """
    # Select the first model if a single model is requested
    model = 1 if single_model else None
    cif_path = splitext(path)[0] + ".cif"

    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(path)
    a1 = mmtf.get_structure(mmtf_file, model=model)

    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(cif_path)
    a2 = pdbx.get_structure(pdbx_file, model=model)

    for category in a1.get_annotation_categories():
        assert a1.get_annotation(category).tolist() == \
               a2.get_annotation(category).tolist()
    # Coordinates are compared with an absolute tolerance
    assert a1.coord.flatten().tolist() == \
           approx(a2.coord.flatten().tolist(), abs=1e-3)
def test_fetch(format, as_file_like):
    """
    Fetch the entry ``1l2y`` from the RCSB in the given format and
    parse a structure from the result.
    The target is ``None`` if `as_file_like` is true, otherwise it is
    the temporary directory.
    """
    if as_file_like:
        target = None
    else:
        target = biotite.temp_dir()
    fetched = rcsb.fetch("1l2y", format, target, overwrite=True)

    # File class and parsing function for each structure format
    structure_io = {
        "pdb": (pdb.PDBFile, pdb.get_structure),
        "pdbx": (pdbx.PDBxFile, pdbx.get_structure),
        "mmtf": (mmtf.MMTFFile, mmtf.get_structure),
    }
    if format in structure_io:
        file_class, parse_structure = structure_io[format]
        structure_file = file_class()
        structure_file.read(fetched)
        parse_structure(structure_file)
def test_PDBx_consistency(format):
    """
    Compare the structure parsed from ``1l2y.cif`` to the structure
    obtained from the trajectory file of the given format, using the
    first model of the mmCIF file as template.
    Annotations must be equal, coordinates approximately equal.
    """
    cif_file = pdbx.PDBxFile()
    cif_file.read(join(data_dir, "1l2y.cif"))
    array1 = pdbx.get_structure(cif_file)
    template = pdbx.get_structure(cif_file, model=1)

    if format == "trr":
        traj_file = trr.TRRFile()
        traj_file.read(join(data_dir, "1l2y.trr"))
    elif format == "xtc":
        traj_file = xtc.XTCFile()
        traj_file.read(join(data_dir, "1l2y.xtc"))
    array2 = traj_file.get_structure(template)

    for category in array1.get_annotation_categories():
        ref_annotation = array1.get_annotation(category).tolist()
        test_annotation = array2.get_annotation(category).tolist()
        assert ref_annotation == test_annotation
    assert array1.coord == pytest.approx(array2.coord)
def test_empty_values(string, use_array):
    """
    Check that empty field values are stored as ``'.'``.

    If `use_array` is true, the field is set to an array filled with
    `string` (dtype ``U1``), otherwise it is set to the empty string.
    """
    LENGTH = 10
    if use_array:
        ref_value = np.full(LENGTH, string, dtype="U1")
    else:
        ref_value = ""

    pdbx_file = pdbx.PDBxFile()
    pdbx_file.set_category(
        category="test_category",
        category_dict={"test_field": ref_value},
        block="test",
    )

    test_value = pdbx_file["test_category"]["test_field"]
    if not use_array:
        assert test_value == "."
    else:
        assert test_value.tolist() == ["."] * LENGTH
def test_list_assemblies():
    """
    Compare the result of :func:`list_assemblies()` for ``1f2n.cif``
    to the known assemblies of this entry.
    """
    expected_assemblies = {
        "1": "complete icosahedral assembly",
        "2": "icosahedral asymmetric unit",
        "3": "icosahedral pentamer",
        "4": "icosahedral 23 hexamer",
        "5": "icosahedral asymmetric unit, std point frame",
        "6": "crystal asymmetric unit, crystal frame",
    }

    cif_file = pdbx.PDBxFile()
    cif_file.read(join(data_dir, "1f2n.cif"))

    assert pdbx.list_assemblies(cif_file) == expected_assemblies
def test_fetch(format, as_file_like):
    """
    Fetch the entry ``1l2y`` from the RCSB in the given format and
    parse the result.
    For structure formats a structure is parsed, for FASTA the file
    must contain at least one sequence.
    The target is ``None`` if `as_file_like` is true, otherwise it is
    the temporary directory.
    """
    if as_file_like:
        target = None
    else:
        target = biotite.temp_dir()
    fetched = rcsb.fetch("1l2y", format, target, overwrite=True)

    # File class and parsing function for each structure format
    structure_io = {
        "pdb": (pdb.PDBFile, pdb.get_structure),
        "pdbx": (pdbx.PDBxFile, pdbx.get_structure),
        "mmtf": (mmtf.MMTFFile, mmtf.get_structure),
    }
    if format in structure_io:
        file_class, parse_structure = structure_io[format]
        structure_file = file_class()
        structure_file.read(fetched)
        parse_structure(structure_file)
    elif format == "fasta":
        fasta_file = fasta.FastaFile()
        fasta_file.read(fetched)
        # Test if the file contains any sequences
        assert len(fasta.get_sequences(fasta_file)) > 0
def test_pdbx_consistency(path, single_model):
    """
    Check that an MMTF file and the mmCIF file with the same base name
    yield the same structure.

    Parameters
    ----------
    path : str
        Path to the MMTF file.
        The mmCIF file is looked up under the same path with the
        ``.cif`` suffix.
    single_model : bool
        If true, only the first model is parsed (``model=1``),
        otherwise no model is selected (``model=None``).
    """
    # Select the first model if a single model is requested
    model = 1 if single_model else None
    cif_path = splitext(path)[0] + ".cif"

    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(path)
    a1 = mmtf.get_structure(mmtf_file, model=model)

    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(cif_path)
    a2 = pdbx.get_structure(pdbx_file, model=model)

    # Sometimes mmCIF files can have 'cell' entry
    # but corresponding MMTF file has not 'unitCell' entry
    # -> Do not assert for dummy entry in mmCIF file
    # (all vector elements = {0, 1})
    if a2.box is not None and not ((a2.box == 0) | (a2.box == 1)).all():
        assert np.allclose(a1.box, a2.box)
    for category in a1.get_annotation_categories():
        assert a1.get_annotation(category).tolist() == \
               a2.get_annotation(category).tolist()
    # Coordinates are compared with an absolute tolerance
    assert a1.coord.flatten().tolist() == \
           approx(a2.coord.flatten().tolist(), abs=1e-3)
def test_superimposition_stack(ca_only):
    """
    Superimpose all but the first model of ``1l2y.cif`` onto the first
    model, optionally using only the ``CA`` atoms, and expect a lower
    RMSD than for the models as stored in the file.
    """
    cif_file = pdbx.PDBxFile()
    cif_file.read(join(data_dir, "1l2y.cif"))
    stack = pdbx.get_structure(cif_file)
    fixed, mobile = stack[0], stack[1:]

    mask = (mobile.atom_name == "CA") if ca_only else None
    fitted, transformation = struc.superimpose(fixed, mobile, mask)

    rmsd_fitted = struc.rmsd(fixed, fitted)
    rmsd_unfitted = struc.rmsd(fixed, mobile)
    if not ca_only:
        # The superimpositions are better than the superimpositions
        # in the structure file
        assert (rmsd_fitted < rmsd_unfitted).all()
    else:
        # The superimpositions are better for most cases than the
        # superimpositions in the structure file
        # -> Use average
        assert np.mean(rmsd_fitted) < np.mean(rmsd_unfitted)
def create_bond_dict(components_pdbx_file_path, msgpack_file_path):
    """
    Collect the bonds of each block in a components PDBx file and dump
    them into a MessagePack file.

    For every block with a ``chem_comp_bond`` category a dictionary
    mapping ``(atom_id_1, atom_id_2)`` to the ``BOND_ORDERS`` entry for
    the bond's ``value_order`` and ``pdbx_aromatic_flag`` is stored
    under the block name.
    Blocks without this category are left out.
    A progress indicator is printed while the blocks are processed.
    """
    components_file = pdbx.PDBxFile()
    components_file.read(components_pdbx_file_path)
    components = components_file.get_block_names()

    bonds_per_component = {}
    for i, component in enumerate(components):
        percentage = int(i / len(components) * 100)
        print(f"{component:3} {percentage:>3d}%", end="\r")

        bond_category = components_file.get_category(
            "chem_comp_bond", block=component, expect_looped=True
        )
        if bond_category is None:
            # No bond info for this compound
            continue

        bond_records = zip(
            bond_category["atom_id_1"],
            bond_category["atom_id_2"],
            bond_category["value_order"],
            bond_category["pdbx_aromatic_flag"],
        )
        bonds_per_component[component] = {
            (first_atom, second_atom): BOND_ORDERS[order, aromatic_flag]
            for first_atom, second_atom, order, aromatic_flag in bond_records
        }

    with open(msgpack_file_path, "wb") as msgpack_file:
        msgpack.dump(bonds_per_component, msgpack_file)
ku_file = biotite.temp_file("ku.cif") # Download and parse structure files file = rcsb.fetch("1JEY", "mmtf", biotite.temp_dir()) ku_dna = strucio.load_structure(file) file = rcsb.fetch("1JEQ", "mmtf", biotite.temp_dir()) ku = strucio.load_structure(file) # Remove DNA and water ku_dna = ku_dna[(ku_dna.chain_id == "A") | (ku_dna.chain_id == "B")] ku_dna = ku_dna[~struc.filter_solvent(ku_dna)] ku = ku[~struc.filter_solvent(ku)] # The structures have a differing amount of atoms missing # at the the start and end of the structure # -> Find common structure ku_dna_common = ku_dna[struc.filter_intersection(ku_dna, ku)] ku_common = ku[struc.filter_intersection(ku, ku_dna)] # Superimpose ku_superimposed, transformation = struc.superimpose( ku_dna_common, ku_common, (ku_common.atom_name == "CA")) # We do not want the cropped structures # -> apply superimposition on structures before intersection filtering ku_superimposed = struc.superimpose_apply(ku, transformation) # Write PDBx files as input for PyMOL cif_file = pdbx.PDBxFile() pdbx.set_structure(cif_file, ku_dna, data_block="ku_dna") cif_file.write(ku_dna_file) cif_file = pdbx.PDBxFile() pdbx.set_structure(cif_file, ku_superimposed, data_block="ku") cif_file.write(ku_file) # Visualization with PyMOL... # biotite_static_image = ku_superimposition.png
def create_residue_dict(components_pdbx_file_path, msgpack_file_path):
    """
    Build an atom array including bonds for each block in a components
    PDBx file and dump the arrays, converted by ``array_to_dict()``,
    into a MessagePack file.

    Blocks without ``chem_comp_atom`` category and blocks without
    usable coordinates are left out.
    A progress indicator is printed while the blocks are processed.
    """
    # Coordinate fields in the order they are tried
    coordinate_fields = (
        (
            "pdbx_model_Cartn_x_ideal",
            "pdbx_model_Cartn_y_ideal",
            "pdbx_model_Cartn_z_ideal",
        ),
        (
            "model_Cartn_x",
            "model_Cartn_y",
            "model_Cartn_z",
        ),
    )

    components_file = pdbx.PDBxFile()
    components_file.read(components_pdbx_file_path)
    components = components_file.get_block_names()

    residues = {}
    for i, component in enumerate(components):
        percentage = int(i / len(components) * 100)
        print(f"{component:3} {percentage:>3d}%", end="\r")

        try:
            # Some entries use invalid quotation for the component name
            general_category = components_file.get_category(
                "chem_comp", block=component
            )
        except ValueError:
            general_category = None
        atom_category = components_file.get_category(
            "chem_comp_atom", block=component, expect_looped=True
        )
        bond_category = components_file.get_category(
            "chem_comp_bond", block=component, expect_looped=True
        )
        if atom_category is None:
            continue

        # The length of the first column gives the number of atoms
        first_column = list(atom_category.values())[0]
        array = struc.AtomArray(len(first_column))
        array.res_name = atom_category["comp_id"]
        array.atom_name = atom_category["atom_id"]
        array.element = atom_category["type_symbol"]
        array.add_annotation("charge", int)
        # An unknown charge ('?') is stored as 0
        array.charge = np.array([
            0 if charge == "?" else int(charge)
            for charge in atom_category["charge"]
        ])
        array.hetero[:] = (
            general_category is None
            or general_category["type"] == "NON-POLYMER"
        )

        # For some entries only 'model_Cartn',
        # for some entries only 'pdbx_model_Cartn_ideal' and
        # for some entries none of them is defined
        for fields in coordinate_fields:
            try:
                for axis, field in enumerate(fields):
                    array.coord[:, axis] = atom_category[field]
            except (KeyError, ValueError):
                # Not usable -> try the next coordinate fields
                continue
            break
        else:
            # If none of them is defined, skip this component
            continue

        bonds = struc.BondList(array.array_length())
        if bond_category is not None:
            bond_records = zip(
                bond_category["atom_id_1"],
                bond_category["atom_id_2"],
                bond_category["value_order"],
                bond_category["pdbx_aromatic_flag"],
            )
            for first_name, second_name, order, aromatic_flag in bond_records:
                # Index of the first atom with the respective name
                first_index = np.where(array.atom_name == first_name)[0][0]
                second_index = np.where(array.atom_name == second_name)[0][0]
                bonds.add_bond(
                    first_index, second_index,
                    BOND_ORDERS[order, aromatic_flag]
                )
        array.bonds = bonds

        residues[component] = array_to_dict(array)

    with open(msgpack_file_path, "wb") as msgpack_file:
        msgpack.dump(residues, msgpack_file)
capsid from *Paramecium bursaria Chlorella virus type 1* - a homo-5040-mer!
At first we will check which assemblies are available to us.
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import numpy as np
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
import biotite.structure.io as strucio
import biotite.database.rcsb as rcsb

# Fetch the mmCIF file of the entry '1M4X' and list its assemblies
pdbx_file = pdbx.PDBxFile()
pdbx_file.read(rcsb.fetch("1M4X", "mmcif"))

assemblies = pdbx.list_assemblies(pdbx_file)
# Print a table of assembly IDs and their names
print("ID name")
print()
for assembly_id, name in assemblies.items():
    print(f"{assembly_id:2} {name}")

########################################################################
# ``'complete icosahedral assembly'`` sounds good.
# In fact, often the first assembly is the complete one.
# Hence, the :func:`get_assembly()` function builds the first assembly
# by default.
# Since we know the ID we want (``'1'``), we will provide it to this
# function anyway.
# modern PDBx/mmCIF format in favor of the PDB format. # It solves limitations of the PDB format, that arise from the column # restrictions. # Furthermore, much more additional information is stored in these # files. # # .. currentmodule:: biotite.structure.io.pdbx # # In contrast to PDB files, *Biotite* can read the entire content of # PDBx/mmCIF files, which can be accessed in a dictionary like manner. # At first, we read the file similarily to before, but this time we # use the :class:`PDBxFile` class. import biotite.structure.io.pdbx as pdbx cif_file_path = rcsb.fetch("1l2y", "cif", biotite.temp_dir()) file = pdbx.PDBxFile() file.read(cif_file_path) ######################################################################## # Now we can access the data like a dictionary of dictionaries. print(file["1L2Y", "audit_author"]["name"]) ######################################################################## # The first index contains the data block and the category name. # The data block could be omitted, since there is only one block in the # file. # This returns a dictionary. # If the category is in a *loop*, the dictionary contains `ndarrays` # of strings as values, otherwise the dictionary contains strings # directly.