def from_list(cls, reslist, cif_path, parent_entry, annotate=True):
    """Build a site object from a residue list and an mmCIF file.

    The residue list is first passed through ``PdbSite._cleanup_list``. The
    file at ``cif_path`` is parsed with ``MMCIFParser`` when ``annotate`` is
    true (the parser's mmCIF dictionary is kept too) and with
    ``FastMMCIFParser`` otherwise. Each residue gets the parsed structure
    attached when that structure is truthy, and is then added to the site.
    With ``annotate`` the site also receives the structure and the mmCIF
    dictionary, and ``find_ligands()`` is called on it.

    Returns the new site, or ``None`` after issuing a ``RuntimeWarning`` if
    parsing raises ``TypeError`` or ``PDBConstructionException``.
    """
    # First reduce redundant residues with multiple function locations
    residues = PdbSite._cleanup_list(reslist)
    new_site = cls()
    new_site.parent_entry = parent_entry

    cif_fields = dict()
    parser_class = MMCIFParser if annotate else FastMMCIFParser
    try:
        cif_parser = parser_class(QUIET=True)
        parsed = cif_parser.get_structure('', cif_path)
        if annotate:
            cif_fields = cif_parser._mmcif_dict
    except (TypeError, PDBConstructionException):
        warnings.warn(
            'Could not build site from residue list. Check entry',
            RuntimeWarning)
        return

    for residue in residues:
        if parsed:
            residue.add_structure(parsed)
        new_site.add(residue)

    if not annotate:
        return new_site

    new_site.parent_structure = parsed
    new_site.mmcif_dict = cif_fields
    new_site.find_ligands()
    return new_site
def test_filehandle(self):
    """Test if the parser can handle file handle as well as filename"""
    parser = MMCIFParser()
    # First parse by file name
    structure = parser.get_structure("example", "PDB/1A8O.cif")
    self.assertEqual(len(structure), 1)
    # Then parse from an open handle; the with-block closes it even if
    # parsing fails, instead of leaking the handle
    with open("PDB/1A8O.cif") as handle:
        structure = parser.get_structure("example", handle)
    self.assertEqual(len(structure), 1)
def test_filehandle(self):
    """Test if the parser can handle file handle as well as filename."""
    parser = MMCIFParser()
    # Parse once from a path ...
    structure = parser.get_structure("example", "PDB/1A8O.cif")
    self.assertEqual(len(structure), 1)
    # ... and once from a handle that is closed deterministically
    # (the bare open() call used before was never closed)
    with open("PDB/1A8O.cif") as handle:
        structure = parser.get_structure("example", handle)
    self.assertEqual(len(structure), 1)
def testModels(self):
    """Test file with multiple models.

    Parses PDB/1LCD.cif with MMCIFParser and FastMMCIFParser, checks model
    count and serial numbers, and builds polypeptides from the first model
    with both builders, once permissive and once strict. Finally checks the
    model count of PDB/2OFG.cif.
    """
    expected_sequence = "MKPVTLYDVAEYAGVSYQTVSRVVNQASHVSAKTREKVEAAMAELNYIPNR"
    slow_parser = MMCIFParser(QUIET=1)
    fast_parser = FastMMCIFParser(QUIET=1)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", PDBConstructionWarning)
        structure = slow_parser.get_structure("example", "PDB/1LCD.cif")
        f_structure = fast_parser.get_structure("example", "PDB/1LCD.cif")

    for parsed in (structure, f_structure):
        self.assertEqual(len(parsed), 3)

    for builder in (PPBuilder(), CaPPBuilder()):
        # Check that serial_num (model column) is stored properly
        for position in range(3):
            self.assertEqual(structure[position].serial_num, position + 1)

        # First allow non-standard amino acids (False), then use the strict
        # version with only standard amino acids (True); the expectations
        # are the same for both (non-standard MSE would be shown as M)
        for aa_only in (False, True):
            peptides = builder.build_peptides(structure[0], aa_only)
            self.assertEqual(len(peptides), 1)
            peptide = peptides[0]
            # Check the start and end positions
            self.assertEqual(peptide[0].get_id()[1], 1)
            self.assertEqual(peptide[-1].get_id()[1], 51)
            # Check the sequence
            sequence = peptide.get_sequence()
            self.assertIsInstance(sequence, Seq)
            self.assertEqual(sequence.alphabet, generic_protein)
            self.assertEqual(expected_sequence, str(sequence))

    # This structure contains several models with multiple lengths.
    # The tests were failing.
    structure = slow_parser.get_structure("example", "PDB/2OFG.cif")
    self.assertEqual(len(structure), 3)
def testModels(self):
    """Test file with multiple models.

    Parses PDB/1LCD.cif with both MMCIFParser and FastMMCIFParser, checks
    the model count and serial numbers, then builds polypeptides from the
    first model with PPBuilder and CaPPBuilder. Finally parses PDB/2OFG.cif
    and checks its model count.
    """
    parser = MMCIFParser(QUIET=1)
    f_parser = FastMMCIFParser(QUIET=1)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', PDBConstructionWarning)
        structure = parser.get_structure("example", "PDB/1LCD.cif")
        f_structure = f_parser.get_structure("example", "PDB/1LCD.cif")

    self.assertEqual(len(structure), 3)
    self.assertEqual(len(f_structure), 3)

    for ppbuild in [PPBuilder(), CaPPBuilder()]:
        # ==========================================================
        # Check that serial_num (model column) is stored properly
        self.assertEqual(structure[0].serial_num, 1)
        self.assertEqual(structure[1].serial_num, 2)
        self.assertEqual(structure[2].serial_num, 3)
        # First try allowing non-standard amino acids,
        polypeptides = ppbuild.build_peptides(structure[0], False)
        self.assertEqual(len(polypeptides), 1)
        pp = polypeptides[0]
        # Check the start and end positions
        self.assertEqual(pp[0].get_id()[1], 1)
        self.assertEqual(pp[-1].get_id()[1], 51)
        # Check the sequence; assertIsInstance reports the actual type on
        # failure, unlike assertTrue(isinstance(...))
        s = pp.get_sequence()
        self.assertIsInstance(s, Seq)
        self.assertEqual(s.alphabet, generic_protein)
        # Here non-standard MSE are shown as M
        self.assertEqual("MKPVTLYDVAEYAGVSYQTVSRVVNQASHVSAKTREKVEAAMAELNYIPNR",
                         str(s))
        # ==========================================================
        # Now try strict version with only standard amino acids
        polypeptides = ppbuild.build_peptides(structure[0], True)
        self.assertEqual(len(polypeptides), 1)
        pp = polypeptides[0]
        # Check the start and end positions
        self.assertEqual(pp[0].get_id()[1], 1)
        self.assertEqual(pp[-1].get_id()[1], 51)
        # Check the sequence
        s = pp.get_sequence()
        self.assertIsInstance(s, Seq)
        self.assertEqual(s.alphabet, generic_protein)
        self.assertEqual("MKPVTLYDVAEYAGVSYQTVSRVVNQASHVSAKTREKVEAAMAELNYIPNR",
                         str(s))

    # This structure contains several models with multiple lengths.
    # The tests were failing.
    structure = parser.get_structure("example", "PDB/2OFG.cif")
    self.assertEqual(len(structure), 3)
def test_insertions(self):
    """Test file with residue insertion codes.

    Parses PDB/4ZHL.cif and, for both peptide builders, checks the number
    of polypeptides and the boundaries and sequence of the first one.
    """
    parser = MMCIFParser(QUIET=1)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", PDBConstructionWarning)
        structure = parser.get_structure("example", "PDB/4ZHL.cif")
    for ppbuild in [PPBuilder(), CaPPBuilder()]:
        # First try allowing non-standard amino acids,
        polypeptides = ppbuild.build_peptides(structure[0], False)
        self.assertEqual(len(polypeptides), 2)
        pp = polypeptides[0]
        # Check the start and end positions (first segment only)
        self.assertEqual(pp[0].get_id()[1], 16)
        self.assertEqual(pp[-1].get_id()[1], 244)
        # Check the sequence
        refseq = (
            "IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATHCFIDYPKKEDYIVYLGR"
            "SRLNSNTQGEMKFEVENLILHKDYSADTLAYHNDIALLKIRSKEGRCAQPSRTIQTIALPSMY"
            "NDPQFGTSCEITGFGKEQSTDYLYPEQLKMTVVKLISHRECQQPHYYGSEVTTKMLCAADPQW"
            "KTDSCQGDSGGPLVCSLQGRMTLTGIVSWGRGCALKDKPGVYTRVSHFLPWIRSHTKE"
        )
        s = pp.get_sequence()
        # assertIsInstance gives a descriptive failure message, unlike
        # assertTrue(isinstance(...))
        self.assertIsInstance(s, Seq)
        self.assertEqual(s.alphabet, generic_protein)
        self.assertEqual(refseq, str(s))
def CIF2PDB(ciffile, pdbfile, verbose=False):
    """Convert the mmCIF file ``ciffile`` into a PDB file at ``pdbfile``.

    Chains are renamed through ``rename_chains``; if that raises
    ``OutOfChainsError`` an error is logged and the process exits with
    status 1. With ``verbose`` every chain whose name changed is logged.

    Returns ``pdbfile``.
    """
    # Not sure why biopython needs a structure id to read a cif file: use
    # the first four characters of the given path, or a placeholder when
    # the path is not longer than that
    if len(ciffile) > 4:
        strucid = ciffile[:4]
    else:
        strucid = "1xxx"

    # Read file
    structure = MMCIFParser().get_structure(strucid, ciffile)

    # rename long chains
    try:
        chainmap = rename_chains(structure)
    except OutOfChainsError:
        logging.error("Too many chains to represent in PDB format")
        sys.exit(1)

    if verbose:
        renamed = ((old, new) for new, old in chainmap.items() if new != old)
        for old, new in renamed:
            logging.info("Renaming chain {0} to {1}".format(old, new))

    # Write PDB
    writer = PDBIO()
    writer.set_structure(structure)
    # TODO What happens with large structures?
    writer.save(pdbfile)
    return pdbfile
def get_info_mmcif(file):
    """Compute per-chain barycentres and residue distances to them.

    The first model of the mmCIF file ``file`` is read. Every residue is
    represented by its CA coordinates when it has a CA atom, otherwise by
    the mean of the coordinates of all its atoms (a residue without any
    atom is represented by the origin, as before).

    Returns a pair ``(bary, enf)`` of dicts keyed by chain object:
    ``bary[chain]`` is the mean of the residue points along each axis (an
    array of three NaN for a chain without residues) and ``enf[chain]`` is
    the list of Euclidean distances from each residue point to it.
    """
    parser = MMCIFParser()
    structure = parser.get_structure(file.split('.')[0], file)
    bary = {}
    enf = {}
    for chain in structure[0]:
        points = []
        for residue in chain:
            if residue.has_id('CA'):
                points.append(residue['CA'].get_coord())
                continue
            # No CA: average over all atoms. The previous code overwrote the
            # running value on every atom, so only the last atom counted.
            atom_coords = [atom.get_coord() for atom in residue]
            if atom_coords:
                points.append(np.mean(atom_coords, axis=0))
            else:
                points.append(np.zeros(3))
        # One row per residue, one column per axis (also for empty chains)
        coords = np.asarray(points).reshape(-1, 3)
        if len(coords):
            # Mean along the residue axis; the previous code averaged the
            # x, y, z of each of the first three residues instead
            centre = coords.mean(axis=0)
        else:
            centre = np.full(3, np.nan)
        bary[chain] = centre
        enf[chain] = [np.linalg.norm(point - centre) for point in coords]
    return bary, enf
def test_insertions(self):
    """Test file with residue insertion codes."""
    # Expected sequence of the first polypeptide segment
    expected = "".join((
        "IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATHCFIDYPKKEDYIVYLGR",
        "SRLNSNTQGEMKFEVENLILHKDYSADTLAYHNDIALLKIRSKEGRCAQPSRTIQTIALPSMY",
        "NDPQFGTSCEITGFGKEQSTDYLYPEQLKMTVVKLISHRECQQPHYYGSEVTTKMLCAADPQW",
        "KTDSCQGDSGGPLVCSLQGRMTLTGIVSWGRGCALKDKPGVYTRVSHFLPWIRSHTKE",
    ))
    cif_parser = MMCIFParser(QUIET=1)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", PDBConstructionWarning)
        structure = cif_parser.get_structure("example", "PDB/4ZHL.cif")

    for builder in (PPBuilder(), CaPPBuilder()):
        # Allow non-standard amino acids
        peptides = builder.build_peptides(structure[0], False)
        self.assertEqual(len(peptides), 2)
        first_segment = peptides[0]
        # Start and end positions of the first segment only
        self.assertEqual(first_segment[0].get_id()[1], 16)
        self.assertEqual(first_segment[-1].get_id()[1], 244)
        # Sequence type, alphabet and content
        sequence = first_segment.get_sequence()
        self.assertIsInstance(sequence, Seq)
        self.assertEqual(sequence.alphabet, generic_protein)
        self.assertEqual(expected, str(sequence))
def check_mmtf_vs_cif(self, mmtf_filename, cif_filename):
    """Compare parsed structures for MMTF and CIF files.

    Stores the atoms / residues to compare on ``self`` and delegates the
    comparison to ``self.check_atoms()`` and ``self.check_residues()``.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", PDBConstructionWarning)
        mmtf_struct = MMTFParser.get_structure(mmtf_filename)
    mmcif_struct = MMCIFParser().get_structure("4CUP", cif_filename)

    # All atoms of both structures
    self.mmcif_atoms = [*mmcif_struct.get_atoms()]
    self.mmtf_atoms = [*mmtf_struct.get_atoms()]
    self.check_atoms()

    # Residues chain by chain
    cif_chains = [*mmcif_struct.get_chains()]
    mmtf_chains = [*mmtf_struct.get_chains()]
    self.assertEqual(len(cif_chains), len(mmtf_chains))
    for cif_chain, mmtf_chain in zip(cif_chains, mmtf_chains):
        self.mmcif_res = [*cif_chain.get_residues()]
        self.mmtf_res = [*mmtf_chain.get_residues()]
        self.check_residues()

    # All residues of both structures
    self.mmcif_res = [*mmcif_struct.get_residues()]
    self.mmtf_res = [*mmtf_struct.get_residues()]
    self.check_residues()

    # Same number of models
    cif_model_count = len([*mmcif_struct.get_models()])
    mmtf_model_count = len([*mmtf_struct.get_models()])
    self.assertEqual(cif_model_count, mmtf_model_count)
def test_mmtf(self):
    """Parse mmCIF file."""
    cif_parser = MMCIFParser()
    # Construction warnings are silenced only while parsing
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', PDBConstructionWarning)
        parsed = cif_parser.get_structure("MICR", "PDB/1EJG.cif")
    print(parsed)
def test_with_anisotrop(self):
    """Check atoms and anisotropic B factors parsed from PDB/4CUP.cif.

    The file is parsed with both MMCIFParser and FastMMCIFParser and the
    first atoms of each result are compared with fixed reference values.
    """
    structure = MMCIFParser().get_structure("example", "PDB/4CUP.cif")
    f_structure = FastMMCIFParser().get_structure("example", "PDB/4CUP.cif")
    for parsed in (structure, f_structure):
        self.assertEqual(len(parsed), 1)

    s_atoms = list(structure.get_atoms())
    f_atoms = list(f_structure.get_atoms())
    self.assertEqual(len(s_atoms), len(f_atoms))

    atom_names = ['N', 'CA', 'C', 'O', 'CB']
    for atoms in (s_atoms, f_atoms):
        head = atoms[:5]
        # Name, id and full name of the first five atoms all match
        for getter in ("get_name", "get_id", "get_fullname"):
            self.assertSequenceEqual(
                [getattr(atom, getter)() for atom in head], atom_names)
        self.assertSequenceEqual(
            [atom.get_occupancy() for atom in head], [1., 1., 1., 1., 1.])

        first = atoms[0]
        self.assertIsInstance(first.get_coord(), numpy.ndarray)
        expected_coord = numpy.array(
            [50.346, 19.287, 17.288], dtype=numpy.float32)
        numpy.testing.assert_array_equal(first.get_coord(), expected_coord)
        self.assertEqual(first.get_bfactor(), 32.02)
        expected_anisou = numpy.array(
            [0.4738, -0.0309, -0.0231, 0.4524, 0.0036, 0.2904],
            dtype=numpy.float32)
        numpy.testing.assert_array_equal(first.get_anisou(), expected_anisou)

    # One more atom, taken from the fast parser's result
    expected_anisou = numpy.array(
        [1.1242, 0.2942, -0.0995, 1.1240, -0.1088, 0.8221],
        dtype=numpy.float32)
    atom_937 = list(f_structure[0]['A'])[114]['CB']
    numpy.testing.assert_array_equal(atom_937.get_anisou(), expected_anisou)
def test_conversion(self):
    """Parse 1LCD.cif, write it in PDB format, parse again and compare."""
    import os  # local import: only needed for the temp-file cleanup

    cif_parser = MMCIFParser(QUIET=1)
    cif_struct = cif_parser.get_structure("example", "PDB/1LCD.cif")

    pdb_writer = PDBIO()
    pdb_writer.set_structure(cif_struct)

    filenumber, filename = tempfile.mkstemp()
    # mkstemp returns an open descriptor; close it since the file is
    # written and read by name, and remove the file once parsed back
    os.close(filenumber)
    try:
        pdb_writer.save(filename)
        pdb_parser = PDBParser(QUIET=1)
        pdb_struct = pdb_parser.get_structure('example_pdb', filename)
    finally:
        os.remove(filename)

    # comparisons
    self.assertEqual(len(pdb_struct), len(cif_struct))

    pdb_atom_names = [a.name for a in pdb_struct.get_atoms()]
    cif_atom_names = [a.name for a in cif_struct.get_atoms()]
    self.assertEqual(len(pdb_atom_names), len(cif_atom_names))
    self.assertSequenceEqual(pdb_atom_names, cif_atom_names)

    pdb_atom_elems = [a.element for a in pdb_struct.get_atoms()]
    cif_atom_elems = [a.element for a in cif_struct.get_atoms()]
    self.assertSequenceEqual(pdb_atom_elems, cif_atom_elems)
def get_structure(self, *args):
    """Parse a structure from a file name, URL or file handle.

    Accepts either ``(id, fileName)`` or just ``(fileName)``; in the second
    form the id is ``str(fileName)``. A string starting with ``http://`` or
    ``https://`` is downloaded first. Parsing is attempted with
    ``PDBParser`` (string names ending in ``.gz`` are opened with gzip);
    if that raises, the error is printed and ``MMCIFParser`` is tried on
    the same input. When ``self.removeHeteroDuplicated`` is set the result
    is passed through ``self.filterOutDuplicated``.

    Raises:
        ValueError: if not called with one or two positional arguments.
        Exception: if the download does not succeed.
    """
    if len(args) == 2:
        pdbId, fileName = args
    elif len(args) == 1:
        fileName = args[0]
        pdbId = str(fileName)
    else:
        raise ValueError(
            "Error, input should be (id, fileName) or (fileName))")

    # Only strings can be URLs; re.match would raise TypeError on a handle
    if isinstance(fileName, str) and re.match("http(s?)://", fileName):
        r = requests.get(fileName)
        if r.ok:
            fileName = StringIO(r.text)
        else:
            raise Exception("Error downloading pdb")

    try:
        if not isinstance(fileName, str) or not fileName.endswith(".gz"):
            structure = PDBParser.get_structure(self, pdbId, fileName)
        else:
            # Text mode: gzip.open defaults to binary, and the parser
            # works on text lines
            with gzip.open(fileName, "rt") as f:
                structure = PDBParser.get_structure(self, pdbId, f)
    except Exception as e:
        # Best-effort fallback: report the PDB failure and try mmCIF
        print(e)
        structure = MMCIFParser.get_structure(self, pdbId, fileName)

    if self.removeHeteroDuplicated:
        structure = self.filterOutDuplicated(structure)
    return structure
def test_conversion(self):
    """Parse 1LCD.cif, write it in PDB format, parse again and compare."""
    import os  # local import: only needed for the temp-file cleanup

    cif_parser = MMCIFParser(QUIET=1)
    cif_struct = cif_parser.get_structure("example", "PDB/1LCD.cif")

    pdb_writer = PDBIO()
    pdb_writer.set_structure(cif_struct)

    filenumber, filename = tempfile.mkstemp()
    # Close the descriptor returned by mkstemp and delete the file after
    # it has been parsed back, so the test leaves nothing behind
    os.close(filenumber)
    try:
        pdb_writer.save(filename)
        pdb_parser = PDBParser(QUIET=1)
        pdb_struct = pdb_parser.get_structure('example_pdb', filename)
    finally:
        os.remove(filename)

    # comparisons
    self.assertEqual(len(pdb_struct), len(cif_struct))

    # The mmCIF side must come from cif_struct; it was previously built
    # from pdb_struct, which compared the PDB structure with itself
    pdb_atom_names = [a.name for a in pdb_struct.get_atoms()]
    cif_atom_names = [a.name for a in cif_struct.get_atoms()]
    self.assertEqual(len(pdb_atom_names), len(cif_atom_names))
    self.assertSequenceEqual(pdb_atom_names, cif_atom_names)

    pdb_atom_elems = [a.element for a in pdb_struct.get_atoms()]
    cif_atom_elems = [a.element for a in cif_struct.get_atoms()]
    self.assertSequenceEqual(pdb_atom_elems, cif_atom_elems)
def build_all(cls, reslist, reference_site, parent_entry, cif_path,
              annotate=True, redundancy_cutoff=None):
    """Build all sites using as input a list of catalytic residues.

    The file at ``cif_path`` is parsed with ``MMCIFParser`` when
    ``annotate`` is true (keeping the parser's mmCIF dictionary) and with
    ``FastMMCIFParser`` otherwise. The residue list is cleaned up, extended
    through ``PdbSite._get_assembly_residues``, split into seeds, and one
    site is built per seed with ``cls.build``. Redundant sites are then
    removed using ``redundancy_cutoff``, sites are annotated (structure,
    mmCIF dictionary, ligands) when ``annotate`` is true and the structure
    is truthy, and unclustered sites are flagged.

    Returns a list of site objects; the list is empty (after a
    ``RuntimeWarning``) if parsing raises ``TypeError`` or
    ``PDBConstructionException``.
    """
    # Map structure objects in every residue
    sites = []
    mmcif_dict = dict()
    try:
        if annotate:
            parser = MMCIFParser(QUIET=True)
            structure = parser.get_structure('', cif_path)
            mmcif_dict = parser._mmcif_dict
        else:
            parser = FastMMCIFParser(QUIET=True)
            structure = parser.get_structure('', cif_path)
    except (TypeError, PDBConstructionException):
        # The category belongs to warn(), not to format(): previously it
        # was swallowed by str.format and a plain UserWarning was issued
        warnings.warn(
            'Could not parse structure {}'.format(cif_path), RuntimeWarning)
        return sites
    # First reduce redundant residues with multiple function locations
    reslist = PdbSite._cleanup_list(reslist)
    # We want all equivalent residues from identical assembly chains
    reslist = PdbSite._get_assembly_residues(reslist, structure)
    # Get seeds to build active sites
    seeds = PdbSite._get_seeds(reslist)
    # Build a site from each seed
    for seed in seeds:
        sites.append(cls.build(seed, reslist, reference_site, parent_entry))
    # Reduce redundancy
    sites = PdbSite._remove_redundant_sites(sites, cutoff=redundancy_cutoff)
    # Add ligands and annotations
    if annotate and structure:
        for site in sites:
            site.parent_structure = structure
            site.mmcif_dict = mmcif_dict
            site.find_ligands()
    # Flag unclustered sites
    PdbSite._mark_unclustered(sites)
    return sites
def test_header(self):
    """Test if the parser populates header data."""
    cif_parser = MMCIFParser()
    # For each file: the header fields and the values expected for them
    expectations = (
        ("PDB/a_structure.cif", (
            ("idcode", ""),
            ("head", ""),
            ("deposition_date", ""),
            ("structure_method", ""),
            ("resolution", 0.0),
        )),
        ("PDB/1A8O.cif", (
            ("idcode", "1A8O"),
            ("head", "Viral protein"),
            ("deposition_date", ""),
            ("structure_method", "X-RAY DIFFRACTION"),
            ("resolution", 1.7),
        )),
    )
    for cif_path, fields in expectations:
        structure = cif_parser.get_structure("example", cif_path)
        for key, expected in fields:
            self.assertEqual(expected, structure.header[key])
def __init__(self, path):
    """
        Create a PDB_Parser for the structure file at ``path`` (CIF format).
        An example file is included in the repository (7ahl.cif).
        The structure is parsed once here and kept in ``self.structure`` so
        that it does not have to be parsed again and again.
    """
    self.structure = MMCIFParser().get_structure('PHA-L', path)
def __init__(self, path):
    """
        Create a PDB_Parser for the structure file at ``path`` (CIF format).
        An example file is included in the repository (7ahl.cif).
        Tip: the parsed structure is stored on the object instead of being
        parsed again and again.
    """
    cif_parser = MMCIFParser()
    # Parsed once; re-used through self.structure afterwards
    parsed = cif_parser.get_structure('PHA-L', path)
    self.structure = parsed
def test_conversion_not_preserve_numbering(self):
    """Convert mmCIF to PDB and renumber atom serials."""
    import os  # local import: only needed for the temp-file cleanup

    cif_parser = MMCIFParser(QUIET=1)
    cif_struct = cif_parser.get_structure("example", "PDB/a_structure.cif")

    pdb_writer = PDBIO()
    pdb_writer.set_structure(cif_struct)

    filenumber, filename = tempfile.mkstemp()
    # mkstemp hands back an open descriptor and never deletes the file;
    # close the former and remove the latter so the test does not leak
    os.close(filenumber)
    try:
        pdb_writer.save(filename, preserve_atom_numbering=False)
    finally:
        os.remove(filename)
def __init__(self, path):
    """
        Create a PDB_Parser for the structure file at ``path`` (CIF format).
        An example file is included in the repository (7ahl.cif).
        Tip: the parsed structure is kept in ``self.structure`` so it is
        not parsed again and again.
    """
    structure_id = 'some structure string here, e.g. 7AHL'
    # Read the CIF file once; the result is re-used by the other methods
    self.structure = MMCIFParser().get_structure(structure_id, path)
def test_conversion_preserve_numbering(self):
    """Convert mmCIF to PDB and preserve original serial numbering."""
    import os  # local import: only needed for the temp-file cleanup

    cif_parser = MMCIFParser(QUIET=1)
    cif_struct = cif_parser.get_structure("example", "PDB/a_structure.cif")

    pdb_writer = PDBIO()
    pdb_writer.set_structure(cif_struct)

    filenumber, filename = tempfile.mkstemp()
    # Close the descriptor from mkstemp and always delete the temp file
    os.close(filenumber)
    try:
        # Saving with the original numbering is expected to be rejected
        with self.assertRaises(ValueError):
            pdb_writer.save(filename, preserve_atom_numbering=True)
    finally:
        os.remove(filename)
def test_compare_to_mmcif(self):
    """Compare the MMTF and mmCIF parsed structures."""

    def compare_atoms(case):
        """Check that case.mmtf_atoms and case.mmcif_atoms are equivalent."""
        case.assertEqual(len(case.mmcif_atoms), len(case.mmtf_atoms))
        for mmcif_atom, mmtf_atom in zip(case.mmcif_atoms, case.mmtf_atoms):
            # e.g. CA, spaces are removed from atom name
            case.assertEqual(mmtf_atom.name, mmcif_atom.name)
            # e.g. " CA ", spaces included
            case.assertEqual(mmtf_atom.fullname, mmcif_atom.fullname)
            for axis in range(3):
                case.assertAlmostEqual(
                    mmtf_atom.coord[axis], mmcif_atom.coord[axis], places=3)
            case.assertEqual(mmtf_atom.bfactor, mmcif_atom.bfactor)
            case.assertEqual(mmtf_atom.occupancy, mmcif_atom.occupancy)
            case.assertEqual(mmtf_atom.altloc, mmcif_atom.altloc)
            # (structure id, model id, chain id, residue id, atom id)
            case.assertEqual(mmtf_atom.full_id, mmcif_atom.full_id)
            # id of atom is the atom name (e.g. "CA")
            case.assertEqual(mmtf_atom.id, mmcif_atom.name)
            # serial_number is not compared: mmCIF serial number is none

    def compare_residues(case):
        """Check that case.mmcif_res and case.mmtf_res are equivalent."""
        case.assertEqual(len(case.mmcif_res), len(case.mmtf_res))
        for mmcif_r, mmtf_r in zip(case.mmcif_res, case.mmtf_res):
            case.assertEqual(mmtf_r.level, mmcif_r.level)
            case.assertEqual(mmtf_r.disordered, mmcif_r.disordered)
            case.assertEqual(mmtf_r.resname, mmcif_r.resname)
            case.assertEqual(mmtf_r.segid, mmcif_r.segid)
            # Then compare the atoms of this residue pair
            case.mmcif_atoms = list(mmcif_r.get_atom())
            case.mmtf_atoms = list(mmtf_r.get_atom())
            compare_atoms(case)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', PDBConstructionWarning)
        mmtf_struct = MMTFParser.get_structure("PDB/4CUP.mmtf")
    mmcif_struct = MMCIFParser().get_structure("example", "PDB/4CUP.cif")

    # All atoms of both structures
    self.mmcif_atoms = list(mmcif_struct.get_atoms())
    self.mmtf_atoms = list(mmtf_struct.get_atoms())
    compare_atoms(self)

    # Residues chain by chain
    cif_chains = list(mmcif_struct.get_chains())
    mmtf_chains = list(mmtf_struct.get_chains())
    self.assertEqual(len(cif_chains), len(mmtf_chains))
    for cif_chain, mmtf_chain in zip(cif_chains, mmtf_chains):
        self.mmcif_res = list(cif_chain.get_residues())
        self.mmtf_res = list(mmtf_chain.get_residues())
        compare_residues(self)

    # All residues of both structures
    self.mmcif_res = list(mmcif_struct.get_residues())
    self.mmtf_res = list(mmtf_struct.get_residues())
    compare_residues(self)

    # Same number of models
    self.assertEqual(len(list(mmcif_struct.get_models())),
                     len(list(mmtf_struct.get_models())))
def __init__(self, path):
    """
        Initialize every PDB_Parser with a path to a structure-file in CIF
        format. An example file is included in the repository (7ahl.cif).
        Tip: Store the parsed structure in an object variable instead of
        parsing it again & again ...
    """
    # parser object for reading in structure in CIF format
    cif_parser = MMCIFParser()
    # Parse the structure once and re-use it in the other methods.
    # The constructor no longer prints the water count of a hard-coded
    # chain "D" (leftover debug output that queried a chain the given
    # file need not contain), and the unused local counter was dropped.
    self.structure = cif_parser.get_structure("Structure", path)
def __init__(self, path):
    """
        Create a PDB_Parser for the structure file at ``path`` (CIF format).
        An example file is included in the repository (7ahl.cif).

        Stores the parsed structure, its first model, and a lookup table
        from upper-cased three-letter residue names to one-letter codes in
        which 'HOH' maps to the empty string.
    """
    self.structure = MMCIFParser(QUIET=True).get_structure('structure', path)
    self.model = self.structure[0]

    # Upper-case the keys of the three-to-one table, then add 'HOH'
    lookup = {}
    for source in (protein_letters_3to1, {'HOH': ''}):
        for name, letter in source.items():
            lookup[name.upper()] = letter
    self.residue_dict = lookup
def clean_pdb(file_input, file_output, chain_to_keep, parameters):
    """Filter the mmCIF file ``file_input`` and write it to ``file_output``.

    The parsed structure is passed through ``remove_chains`` (with
    ``chain_to_keep``) and then ``remove_extra_atoms`` (with
    ``parameters``) before being saved with ``MMCIFIO``.
    """
    # Structure id: the input name without its last four characters,
    # upper-cased
    structure_id = file_input[:-4].upper()
    parsed = MMCIFParser().get_structure(structure_id, file_input)
    kept = remove_chains(parsed, chain_to_keep)
    cleaned = remove_extra_atoms(kept, parameters)

    writer = MMCIFIO()
    writer.set_structure(cleaned)
    writer.save(file_output)
def test_parser(self):
    """Extract polypeptides from 1A8O.

    Builds polypeptides from the first model with PPBuilder and
    CaPPBuilder, first allowing non-standard amino acids and then in
    strict mode, and checks fragment boundaries and sequences.
    """
    parser = MMCIFParser()
    structure = parser.get_structure("example", "PDB/1A8O.cif")
    self.assertEqual(len(structure), 1)
    for ppbuild in [PPBuilder(), CaPPBuilder()]:
        # ==========================================================
        # Check that serial_num (model column) is stored properly
        self.assertEqual(structure[0].serial_num, 1)
        # First try allowing non-standard amino acids,
        polypeptides = ppbuild.build_peptides(structure[0], False)
        self.assertEqual(len(polypeptides), 1)
        pp = polypeptides[0]
        # Check the start and end positions
        self.assertEqual(pp[0].get_id()[1], 151)
        self.assertEqual(pp[-1].get_id()[1], 220)
        # Check the sequence; assertIsInstance reports the actual type on
        # failure, unlike assertTrue(isinstance(...))
        s = pp.get_sequence()
        self.assertIsInstance(s, Seq)
        self.assertEqual(s.alphabet, generic_protein)
        # Here non-standard MSE are shown as M
        self.assertEqual(
            "MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQ"
            "NANPDCKTILKALGPGATLEEMMTACQG", str(s))
        # ==========================================================
        # Now try strict version with only standard amino acids
        # Should ignore MSE 151 at start, and then break the chain
        # at MSE 185, and MSE 214,215
        polypeptides = ppbuild.build_peptides(structure[0], True)
        self.assertEqual(len(polypeptides), 3)
        # First fragment
        pp = polypeptides[0]
        self.assertEqual(pp[0].get_id()[1], 152)
        self.assertEqual(pp[-1].get_id()[1], 184)
        s = pp.get_sequence()
        self.assertIsInstance(s, Seq)
        self.assertEqual(s.alphabet, generic_protein)
        self.assertEqual("DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW", str(s))
        # Second fragment
        pp = polypeptides[1]
        self.assertEqual(pp[0].get_id()[1], 186)
        self.assertEqual(pp[-1].get_id()[1], 213)
        s = pp.get_sequence()
        self.assertIsInstance(s, Seq)
        self.assertEqual(s.alphabet, generic_protein)
        self.assertEqual("TETLLVQNANPDCKTILKALGPGATLEE", str(s))
        # Third fragment
        pp = polypeptides[2]
        self.assertEqual(pp[0].get_id()[1], 216)
        self.assertEqual(pp[-1].get_id()[1], 220)
        s = pp.get_sequence()
        self.assertIsInstance(s, Seq)
        self.assertEqual(s.alphabet, generic_protein)
        self.assertEqual("TACQG", str(s))
def test_parser(self):
    """Extract polypeptides from 1A8O.

    For PPBuilder and CaPPBuilder, builds polypeptides from the first
    model in permissive and in strict mode and checks the boundaries and
    sequence of every fragment.
    """
    parser = MMCIFParser()
    structure = parser.get_structure("example", "PDB/1A8O.cif")
    self.assertEqual(len(structure), 1)
    for ppbuild in [PPBuilder(), CaPPBuilder()]:
        # ==========================================================
        # Check that serial_num (model column) is stored properly
        self.assertEqual(structure[0].serial_num, 1)
        # First try allowing non-standard amino acids,
        polypeptides = ppbuild.build_peptides(structure[0], False)
        self.assertEqual(len(polypeptides), 1)
        pp = polypeptides[0]
        # Check the start and end positions
        self.assertEqual(pp[0].get_id()[1], 151)
        self.assertEqual(pp[-1].get_id()[1], 220)
        # Check the sequence (assertIsInstance instead of
        # assertTrue(isinstance(...)) for a descriptive failure message)
        s = pp.get_sequence()
        self.assertIsInstance(s, Seq)
        self.assertEqual(s.alphabet, generic_protein)
        # Here non-standard MSE are shown as M
        self.assertEqual("MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQ"
                         "NANPDCKTILKALGPGATLEEMMTACQG", str(s))
        # ==========================================================
        # Now try strict version with only standard amino acids
        # Should ignore MSE 151 at start, and then break the chain
        # at MSE 185, and MSE 214,215
        polypeptides = ppbuild.build_peptides(structure[0], True)
        self.assertEqual(len(polypeptides), 3)
        # First fragment
        pp = polypeptides[0]
        self.assertEqual(pp[0].get_id()[1], 152)
        self.assertEqual(pp[-1].get_id()[1], 184)
        s = pp.get_sequence()
        self.assertIsInstance(s, Seq)
        self.assertEqual(s.alphabet, generic_protein)
        self.assertEqual("DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW", str(s))
        # Second fragment
        pp = polypeptides[1]
        self.assertEqual(pp[0].get_id()[1], 186)
        self.assertEqual(pp[-1].get_id()[1], 213)
        s = pp.get_sequence()
        self.assertIsInstance(s, Seq)
        self.assertEqual(s.alphabet, generic_protein)
        self.assertEqual("TETLLVQNANPDCKTILKALGPGATLEE", str(s))
        # Third fragment
        pp = polypeptides[2]
        self.assertEqual(pp[0].get_id()[1], 216)
        self.assertEqual(pp[-1].get_id()[1], 220)
        s = pp.get_sequence()
        self.assertIsInstance(s, Seq)
        self.assertEqual(s.alphabet, generic_protein)
        self.assertEqual("TACQG", str(s))
def get_atoms(file):
    """Collect residue names and CA vectors from an mmCIF file.

    Reads the first model of ``file`` and returns one list per chain, each
    holding a ``(residue name, CA vector)`` tuple for every residue of that
    chain that has a CA atom.
    """
    # The structure id is the part of the name before the first dot
    structure_id = file.split('.')[0]
    first_model = MMCIFParser().get_structure(structure_id, file)[0]
    return [
        [
            (residue.get_resname(), residue['CA'].get_vector())
            for residue in chain
            if residue.has_id('CA')
        ]
        for chain in first_model
    ]
def test_header(self):
    """Test if the parser populates header data."""
    cif_parser = MMCIFParser(QUIET=1)

    def header_of(cif_path):
        """Parse ``cif_path`` and return the header of the structure."""
        return cif_parser.get_structure("example", cif_path).header

    # test default values
    header = header_of("PDB/a_structure.cif")
    for key in ("idcode", "head", "deposition_date", "structure_method"):
        self.assertEqual("", header[key])
    self.assertIsNone(header["resolution"])

    # test extracting fields
    header = header_of("PDB/1A8O.cif")
    extracted = (
        ("idcode", "1A8O"),
        ("head", "Viral protein"),
        ("deposition_date", ""),
        ("structure_method", "X-RAY DIFFRACTION"),
        ("resolution", 1.7),
    )
    for key, expected in extracted:
        self.assertEqual(expected, header[key])

    # test not confused by '.'
    header = header_of("PDB/1SSU_mod.cif")
    self.assertIsNone(header["resolution"])
def setUp(self):
    """Parse 1LCD from its PDB and mmCIF files into self.pdbo / self.cifo."""
    # Silence!
    warnings.simplefilter('ignore', PDBConstructionWarning)

    # Data files live in the "PDB" directory next to this module
    here = os.path.abspath(os.path.dirname(__file__))
    data_dir = os.path.join(here, "PDB")

    self.pdbo = PDBParser(QUIET=1).get_structure(
        'pdb', os.path.join(data_dir, "1LCD.pdb"))
    self.cifo = MMCIFParser(QUIET=1).get_structure(
        'pdb', os.path.join(data_dir, "1LCD.cif"))
def setUp(self):
    """Load the 1LCD structure from both its PDB and its mmCIF file."""
    # Silence!
    warnings.simplefilter("ignore", PDBConstructionWarning)

    # Both files sit in the "PDB" directory beside this module
    data_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), "PDB")
    sources = (
        ("pdbo", PDBParser(QUIET=1), "1LCD.pdb"),
        ("cifo", MMCIFParser(QUIET=1), "1LCD.cif"),
    )
    for attribute, file_parser, file_name in sources:
        parsed = file_parser.get_structure(
            "pdb", os.path.join(data_dir, file_name))
        setattr(self, attribute, parsed)
def test_write(self):
    """Test a simple structure object is written out correctly to MMTF."""
    structure = MMCIFParser().get_structure("1A8O", "PDB/1A8O.cif")
    writer = MMTFIO()
    writer.set_structure(structure)
    descriptor, path = tempfile.mkstemp()
    os.close(descriptor)
    try:
        writer.save(path)
        # The written file is parsed back; the result itself is not used
        MMTFParser.get_structure(path)
        decoded = mmtf.parse(path)

        # Global counts
        self.assertEqual(decoded.structure_id, "1A8O")
        self.assertEqual(decoded.num_models, 1)
        self.assertEqual(decoded.num_chains, 2)
        self.assertEqual(decoded.num_groups, 158)
        self.assertEqual(decoded.num_atoms, 644)

        # Per-atom lists
        per_atom_lists = ("x_coord_list", "y_coord_list", "z_coord_list",
                          "b_factor_list", "occupancy_list")
        for list_name in per_atom_lists:
            self.assertEqual(len(getattr(decoded, list_name)), 644)
        self.assertEqual(decoded.x_coord_list[5], 20.022)
        self.assertEqual(set(decoded.ins_code_list), {"\x00"})
        self.assertEqual(set(decoded.alt_loc_list), {"\x00"})
        self.assertEqual(list(decoded.atom_id_list), list(range(1, 645)))
        self.assertEqual(list(decoded.sequence_index_list),
                         list(range(70)) + [-1] * 88)

        # Chains and groups
        self.assertEqual(decoded.chain_id_list, ["A", "B"])
        self.assertEqual(decoded.chain_name_list, ["A", "A"])
        self.assertEqual(decoded.chains_per_model, [2])
        self.assertEqual(len(decoded.group_list), 21)
        self.assertEqual(len(decoded.group_id_list), 158)
        self.assertEqual(len(decoded.group_type_list), 158)
        self.assertEqual(decoded.groups_per_chain, [70, 88])

        # Entities
        self.assertEqual(len(decoded.entity_list), 2)
        polymer = decoded.entity_list[0]
        self.assertEqual(polymer["type"], "polymer")
        self.assertEqual(polymer["chainIndexList"], [0])
        self.assertEqual(
            polymer["sequence"],
            "MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPGATLEEMMTACQG",
        )
        water = decoded.entity_list[1]
        self.assertEqual(water["type"], "water")
        self.assertEqual(water["chainIndexList"], [1])
        self.assertEqual(water["sequence"], "")
    finally:
        os.remove(path)
def get_descriptors(file):
    """Collect per-residue descriptors from the first model of an mmCIF file.

    Each residue is described by a dict with key "name" (the residue name)
    and, when the residue has a CA atom, "coord" (the CA vector) and "hse"
    (the first two items of the ``HSExposureCB`` entry for the residue).
    The dicts of all chains are returned as one flat list.

    NOTE(review): the original indentation of this function was lost. It is
    assumed here that a dict is appended for every residue, including those
    without a CA atom -- confirm against the upstream file.
    """
    parser = MMCIFParser()
    # The structure id is the part of the file name before the first dot
    structure = parser.get_structure(file.split('.')[0], file)
    pos = []
    model = structure[0]
    hse = HSExposureCB(model)
    for chain in model:
        pos_c = []
        for residue in chain:
            dic = {}
            dic["name"] = residue.get_resname()
            if residue.has_id('CA'):
                vca = residue['CA'].get_vector()
                dic["coord"] = vca
                # Lookup is keyed by (chain id, residue id)
                hse_ = hse[(chain.id, residue.id)]
                dic["hse"] = (hse_[0], hse_[1])
            pos_c.append(dic)
        # Flatten: chain boundaries are not kept in the result
        pos = pos + pos_c
    return pos
def func1():
    """Extract one chain of a gzipped mmCIF entry into ``chain1.pdb``.

    The entry id is read from ``sys.argv[2]`` and the chain id from
    ``sys.argv[3]``. The archive ``<pathmmcif>/<id[1:3]>/<id>.cif.gz`` is
    decompressed to ``pdbprocess.cif`` in the working directory, parsed,
    and the requested chain of the first model is saved as ``chain1.pdb``.

    Any error on the way (missing argument, missing file, unknown chain,
    parse failure) is reported by printing ``FILE NOT FOUND``.
    """
    import gzip
    import sys

    from Bio.PDB.MMCIFParser import MMCIFParser
    from Bio.PDB.PDBIO import PDBIO

    parser = MMCIFParser(QUIET=True)
    # Root of the local mmCIF mirror
    pathmmcif = "/Volumes/BIOINFO/mmCIF"
    try:
        pdb1 = "{}".format(sys.argv[2])
        # Entries are grouped in folders named after characters 2-3 of the id
        fol = pdb1[1:3]
        c1 = "{}".format(sys.argv[3])
        pdbfile = "{}/{}/{}.cif.gz".format(pathmmcif, fol, pdb1)
        filename = "pdbprocess.cif"
        # Context managers close both files even if decompression fails
        with gzip.open(pdbfile, "rb") as tar, open(filename, "wb") as out:
            out.write(tar.read())
        structure_id = "{}".format(pdb1)
        structure = parser.get_structure(structure_id, filename)
        model = structure[0]
        chain = model["{}".format(c1)]
        io = PDBIO()
        io.set_structure(chain)
        io.save("chain1.pdb")
    except Exception:
        # Deliberately broad best-effort handling, but no longer a bare
        # except: KeyboardInterrupt and SystemExit now propagate
        print("FILE NOT FOUND")
def check_mmtf_vs_cif(self, mmtf_filename, cif_filename):
    """Parse the same entry from MMTF and mmCIF and compare the results.

    Atoms are compared via ``check_atoms``; residues are compared chain by
    chain and then over the whole structure via ``check_residues``; finally
    the number of models must match. The helpers read their inputs from the
    ``mmcif_*`` / ``mmtf_*`` attributes set here.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', PDBConstructionWarning)
        mmtf_struct = MMTFParser.get_structure(mmtf_filename)
    cif_struct = MMCIFParser().get_structure("example", cif_filename)

    # Atom-level comparison.
    self.mmcif_atoms = list(cif_struct.get_atoms())
    self.mmtf_atoms = list(mmtf_struct.get_atoms())
    self.check_atoms()

    # Residue comparison, one pair of chains at a time.
    cif_chains = list(cif_struct.get_chains())
    mmtf_chains = list(mmtf_struct.get_chains())
    self.assertEqual(len(cif_chains), len(mmtf_chains))
    for cif_chain, mmtf_chain in zip(cif_chains, mmtf_chains):
        self.mmcif_res = list(cif_chain.get_residues())
        self.mmtf_res = list(mmtf_chain.get_residues())
        self.check_residues()

    # Residue comparison across the whole structure.
    self.mmcif_res = list(cif_struct.get_residues())
    self.mmtf_res = list(mmtf_struct.get_residues())
    self.check_residues()

    # Model count.
    cif_model_count = len(list(cif_struct.get_models()))
    mmtf_model_count = len(list(mmtf_struct.get_models()))
    self.assertEqual(cif_model_count, mmtf_model_count)
def testModels(self):
    """Test file with multiple models.

    Parses PDB/1LCD.cif, checks that it holds three models with serial
    numbers 1..3, and checks the polypeptide built from the first model by
    each builder, both with and without non-standard residues allowed.
    """
    parser = MMCIFParser()
    structure = parser.get_structure("example", "PDB/1LCD.cif")
    self.assertEqual(len(structure), 3)
    for ppbuild in [PPBuilder(), CaPPBuilder()]:
        # ==========================================================
        # Check that serial_num (model column) is stored properly
        self.assertEqual(structure[0].serial_num, 1)
        self.assertEqual(structure[1].serial_num, 2)
        self.assertEqual(structure[2].serial_num, 3)
        # First try allowing non-standard amino acids,
        polypeptides = ppbuild.build_peptides(structure[0], False)
        self.assertEqual(len(polypeptides), 1)
        pp = polypeptides[0]
        # Check the start and end positions
        self.assertEqual(pp[0].get_id()[1], 1)
        self.assertEqual(pp[-1].get_id()[1], 51)
        # Check the sequence
        s = pp.get_sequence()
        # assertIsInstance reports the actual type on failure.
        self.assertIsInstance(s, Seq)
        self.assertEqual(s.alphabet, generic_protein)
        # Here non-standard MSE are shown as M
        self.assertEqual(
            "MKPVTLYDVAEYAGVSYQTVSRVVNQASHVSAKTREKVEAAMAELNYIPNR", str(s))
        # ==========================================================
        # Now try strict version with only standard amino acids
        polypeptides = ppbuild.build_peptides(structure[0], True)
        self.assertEqual(len(polypeptides), 1)
        pp = polypeptides[0]
        # Check the start and end positions
        self.assertEqual(pp[0].get_id()[1], 1)
        self.assertEqual(pp[-1].get_id()[1], 51)
        # Check the sequence
        s = pp.get_sequence()
        self.assertIsInstance(s, Seq)
        self.assertEqual(s.alphabet, generic_protein)
        self.assertEqual(
            "MKPVTLYDVAEYAGVSYQTVSRVVNQASHVSAKTREKVEAAMAELNYIPNR", str(s))
def test_with_anisotrop(self):
    """Parse PDB/4CUP.cif with both mmCIF parsers and check the atom data.

    Both parsers must yield a single model with the same number of atoms;
    the first atoms' names, occupancies, coordinates, B-factor and
    anisotropic values are compared with the expected numbers.
    """
    structure = MMCIFParser().get_structure("example", "PDB/4CUP.cif")
    f_structure = FastMMCIFParser().get_structure("example", "PDB/4CUP.cif")
    self.assertEqual(len(structure), 1)
    self.assertEqual(len(f_structure), 1)

    standard_atoms = list(structure.get_atoms())
    fast_atoms = list(f_structure.get_atoms())
    self.assertEqual(len(standard_atoms), len(fast_atoms))

    for atom_list in (standard_atoms, fast_atoms):
        expected_names = ['N', 'CA', 'C', 'O', 'CB']
        leading = atom_list[:5]
        first_atom = atom_list[0]

        self.assertSequenceEqual(
            [atom.get_name() for atom in leading], expected_names)
        self.assertSequenceEqual(
            [atom.get_id() for atom in leading], expected_names)
        self.assertSequenceEqual(
            [atom.get_fullname() for atom in leading], expected_names)
        self.assertSequenceEqual(
            [atom.get_occupancy() for atom in leading],
            [1., 1., 1., 1., 1.])

        self.assertIsInstance(first_atom.get_coord(), numpy.ndarray)
        expected_coord = numpy.array(
            [50.346, 19.287, 17.288], dtype=numpy.float32)
        numpy.testing.assert_array_equal(
            first_atom.get_coord(), expected_coord)
        self.assertEqual(first_atom.get_bfactor(), 32.02)

        expected_anisou = numpy.array(
            [0.4738, -0.0309, -0.0231, 0.4524, 0.0036, 0.2904],
            dtype=numpy.float32)
        numpy.testing.assert_array_equal(
            first_atom.get_anisou(), expected_anisou)

        # CB atom of the 115th residue of chain A, always taken from the
        # fast parser's structure.
        expected_anisou = numpy.array(
            [1.1242, 0.2942, -0.0995, 1.1240, -0.1088, 0.8221],
            dtype=numpy.float32)
        atom_937 = list(f_structure[0]['A'])[114]['CB']
        numpy.testing.assert_array_equal(
            atom_937.get_anisou(), expected_anisou)
def process_monomer_data(PDB_PROTEIN_PATH, PROCESSED_DATA_PATH):
    """Record, for every structure file in a directory, whether it is a monomer.

    A structure counts as a monomer when its first model contains exactly
    one chain. Files that cannot be parsed are skipped entirely, so they do
    not appear in the output.

    Parameters
    ----------
    PDB_PROTEIN_PATH : str
        location of the directory with full pdb files (with or without a
        trailing path separator)
    PROCESSED_DATA_PATH : str
        output file; written as CSV with the columns ``id`` and ``monomer``
    """
    protein_files = [
        f for f in listdir(PDB_PROTEIN_PATH)
        if isfile(join(PDB_PROTEIN_PATH, f))
    ]
    results = {'id': [], 'monomer': []}

    for protein_file in protein_files:
        # Drop the last four characters of the name (e.g. ".cif").
        protein_id = protein_file[:-4]
        # join() also works when the directory has no trailing separator;
        # plain concatenation produced a non-existent path in that case.
        pdb_protein_file = join(PDB_PROTEIN_PATH, protein_file)

        # if only 1 chain, pdb = monomer
        try:
            p = MMCIFParser(QUIET=1)
            structure = p.get_structure(protein_id, pdb_protein_file)
            is_monomer = len(structure[0]) == 1
        except Exception:
            # Best effort: skip files that fail to parse or have no model.
            continue

        # Record both columns together, only after everything succeeded, so
        # the two lists can never end up with different lengths.
        results['id'].append(protein_id)
        results['monomer'].append(is_monomer)

    df = pd.DataFrame(results)
    df.to_csv(PROCESSED_DATA_PATH, index=False)
def get_pdb():
    """Extract one chain of a gzipped mmCIF entry and save it as ``WT.pdb``.

    The entry id is read from ``sys.argv[1]`` and the chain id from
    ``sys.argv[2]``. The archive ``<root>/<id[1:3]>/<id>.cif.gz`` is
    decompressed to ``pdbprocess.cif`` in the current directory, parsed, and
    the requested chain of the first model is written to ``WT.pdb``.

    Errors (missing argument, missing archive, unknown chain, ...) propagate
    to the caller.
    """
    import gzip
    import shutil
    import sys

    from Bio.PDB.MMCIFParser import MMCIFParser
    from Bio.PDB.PDBIO import PDBIO

    parser = MMCIFParser(QUIET=True)

    # Machine-specific root of the local mmCIF mirror.
    pathmmcif = "/Volumes/BIOINFO/mmCIF"

    pdb = sys.argv[1]
    C = sys.argv[2]  # CHAIN
    fol = pdb[1:3]
    pdbfile = "{}/{}/{}.cif.gz".format(pathmmcif, fol, pdb)
    filename = "pdbprocess.cif"

    # Context managers close both handles even when decompression fails;
    # copyfileobj streams the data instead of loading it all at once.
    with gzip.open(pdbfile, "rb") as tar, open(filename, "wb") as out:
        shutil.copyfileobj(tar, out)

    structure = parser.get_structure("{}".format(pdb), filename)
    model = structure[0]
    chain = model["{}".format(C)]

    io = PDBIO()
    io.set_structure(chain)
    io.save("WT.pdb")
def test_parsers(self):
    """Extract polypeptides from 1A8O.

    Parses PDB/1A8O.cif with both MMCIFParser and FastMMCIFParser, compares
    the polypeptides built from each, checks the fragments obtained when
    only standard amino acids are allowed, and finally checks the atom data
    of both structures.
    """
    parser = MMCIFParser()
    fast_parser = FastMMCIFParser()
    structure = parser.get_structure("example", "PDB/1A8O.cif")
    f_structure = fast_parser.get_structure("example", "PDB/1A8O.cif")
    self.assertEqual(len(structure), 1)
    self.assertEqual(len(f_structure), 1)

    for ppbuild in [PPBuilder(), CaPPBuilder()]:
        # ==========================================================
        # Check that serial_num (model column) is stored properly
        self.assertEqual(structure[0].serial_num, 1)
        self.assertEqual(f_structure[0].serial_num, structure[0].serial_num)

        # First try allowing non-standard amino acids,
        polypeptides = ppbuild.build_peptides(structure[0], False)
        f_polypeptides = ppbuild.build_peptides(f_structure[0], False)
        self.assertEqual(len(polypeptides), 1)
        self.assertEqual(len(f_polypeptides), 1)
        pp = polypeptides[0]
        f_pp = f_polypeptides[0]

        # Check the start and end positions
        self.assertEqual(pp[0].get_id()[1], 151)
        self.assertEqual(pp[-1].get_id()[1], 220)
        self.assertEqual(f_pp[0].get_id()[1], 151)
        self.assertEqual(f_pp[-1].get_id()[1], 220)

        # Check the sequence
        s = pp.get_sequence()
        f_s = f_pp.get_sequence()
        self.assertEqual(s, f_s)  # enough to test this
        # assertIsInstance reports the actual type on failure and matches
        # the style used for the coordinate check below.
        self.assertIsInstance(s, Seq)
        self.assertEqual(s.alphabet, generic_protein)
        # Here non-standard MSE are shown as M
        self.assertEqual("MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQ"
                         "NANPDCKTILKALGPGATLEEMMTACQG", str(s))

        # ==========================================================
        # Now try strict version with only standard amino acids
        # Should ignore MSE 151 at start, and then break the chain
        # at MSE 185, and MSE 214,215
        polypeptides = ppbuild.build_peptides(structure[0], True)
        self.assertEqual(len(polypeptides), 3)

        # First fragment
        pp = polypeptides[0]
        self.assertEqual(pp[0].get_id()[1], 152)
        self.assertEqual(pp[-1].get_id()[1], 184)
        s = pp.get_sequence()
        self.assertIsInstance(s, Seq)
        self.assertEqual(s.alphabet, generic_protein)
        self.assertEqual("DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW", str(s))

        # Second fragment
        pp = polypeptides[1]
        self.assertEqual(pp[0].get_id()[1], 186)
        self.assertEqual(pp[-1].get_id()[1], 213)
        s = pp.get_sequence()
        self.assertIsInstance(s, Seq)
        self.assertEqual(s.alphabet, generic_protein)
        self.assertEqual("TETLLVQNANPDCKTILKALGPGATLEE", str(s))

        # Third fragment
        pp = polypeptides[2]
        self.assertEqual(pp[0].get_id()[1], 216)
        self.assertEqual(pp[-1].get_id()[1], 220)
        s = pp.get_sequence()
        self.assertIsInstance(s, Seq)
        self.assertEqual(s.alphabet, generic_protein)
        self.assertEqual("TACQG", str(s))

    s_atoms = list(structure.get_atoms())
    f_atoms = list(f_structure.get_atoms())

    for atoms in [s_atoms, f_atoms]:
        self.assertEqual(len(atoms), 644)
        atom_names = ['N', 'CA', 'C', 'O', 'CB']
        self.assertSequenceEqual([a.get_name() for a in atoms[:5]], atom_names)
        self.assertSequenceEqual([a.get_id() for a in atoms[:5]], atom_names)
        self.assertSequenceEqual([a.get_fullname() for a in atoms[:5]], atom_names)
        self.assertSequenceEqual([a.get_occupancy() for a in atoms[:5]], [1., 1., 1., 1., 1.])
        self.assertIsInstance(atoms[0].get_coord(), numpy.ndarray)
        coord = numpy.array([19.594, 32.367, 28.012], dtype=numpy.float32)
        numpy.testing.assert_array_equal(atoms[0].get_coord(), coord)
        self.assertEqual(atoms[0].get_bfactor(), 18.03)
        # This entry carries no anisotropic data for any atom.
        for atom in atoms:
            self.assertIsNone(atom.get_anisou())