def test_pdb_to_xyzr(self):
    """Compare computed united-atom radii with the stored xyzr reference file."""
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", PDBConstructionWarning)
        parser = PDBParser(PERMISSIVE=1)
        structure = parser.get_structure("example", "PDB/1A8O.pdb")

    # Reference radii produced with the original shell script; the radius is
    # the fourth whitespace-separated field of every line.
    with open("PDB/1A8O.xyzr") as handle:
        msms_radii = [float(line.split()[3]) for line in handle]

    # Radii computed for every atom of the first model, in iteration order.
    biopy_radii = [
        _get_atom_radius(atom, rtype="united")
        for atom in structure[0].get_atoms()
    ]

    self.assertEqual(msms_radii, biopy_radii)
def aa_seq_from_coords(pdb_file):
    """
    Build the one-letter sequence of every chain found in a PDB file.

    The residue names of each chain are concatenated in iteration order and
    converted with ``seq1`` using an empty ``undef_code``.

    :param pdb_file: path of the PDB file to parse.
    :return: A dictionary mapping each chain id to its one-letter sequence.
    :rtype: dict
    """
    structure = PDBParser().get_structure(
        filename_no_extension(pdb_file), pdb_file)

    seqs = {}
    for chain in structure.get_chains():
        three_letter = ''.join(
            residue.get_resname() for residue in chain.get_residues())
        seqs[chain.get_id()] = seq1(three_letter, undef_code='')
    return seqs
def load_pdb(fileName, ident=None):
    """
    Parse a PDB file into a structure object.

    Parameters
    ----------
    fileName : str
        the path to the file
    ident : str (optional)
        the desired identity of the structure object; when omitted (or
        otherwise falsy) the file path itself is used as the identity

    Returns
    -------
    values : :class:`paramagpy.protein.CustomStructure`
        a structure object containing the atomic coordinates
    """
    structure_id = ident or fileName
    parser = PDBParser(structure_builder=CustomStructureBuilder())
    return parser.get_structure(structure_id, fileName)
def GetUniqueChains(self, pdir, pdbID, chains_to_check):
    """
    Group the chains of one PDB entry by their C-alpha derived sequence.

    Structure based, not sequence based: a chain's sequence is built from
    the residues that actually carry a ``CA`` atom.

    Parameters
    ----------
    pdir : str
        Directory prefix that is concatenated directly with the file name
        ``pdb<ID>.ent`` (no separator is inserted).
    pdbID : str
        Not read by this method; ``self.pdbID`` is used instead.
    chains_to_check : iterable of str
        Chain identifiers to look up in the first model.

    Returns
    -------
    tuple(list, list)
        ``(representatives, groups)``: the sorted first member of every
        group, and the sorted list of groups, where a group is the list of
        ``<ID>_<chain>`` labels sharing one sequence.
    """
    ent_name = 'pdb' + self.pdbID + '.ent'
    bio_parser = PDBParser(PERMISSIVE=True, QUIET=True)
    bio_structure = bio_parser.get_structure(self.pdbID, pdir + ent_name)
    bio_model = bio_structure[0]

    # label -> sequence; the first sequence seen for a label is kept.
    chain_seqs = {}
    for chain_id in chains_to_check:
        # Characters 3..6 of the file name are the leading ID characters.
        label = ent_name[3:7] + '_' + chain_id
        letters = []
        for residue in bio_model[chain_id]:
            for atom in residue:
                if atom.name == 'CA':
                    letters.append(amino_dict.replace_all(
                        residue.resname, amino_dict.one_letter))
        # Keep only letters that amino_dict recognises as amino acids.
        sequence = "".join(x for x in letters if x in amino_dict.amino)
        chain_seqs.setdefault(label, sequence)

    # sequence -> labels of all chains that share it.
    groups = {}
    for label, sequence in chain_seqs.items():
        groups.setdefault(sequence, []).append(label)

    list_of_matches = list(groups.values())
    req_matches = [members[0] for members in list_of_matches]
    return sorted(req_matches), sorted(list_of_matches)
def main():
    """Extract the x, y coordinates for a list of residues from the given
    PDB file on the command line.

    The residues are specified by their residue sequence positions in a file
    given on the command line. One position per line; blank lines are
    ignored.

    A PDB file from which to extract coordinates is required on the command
    line.

    The extracted x, y coordinates of the alpha carbons are written to the
    output file in the CSV format. One pair of coordinates per line.
    Residues without a ``CA`` atom are skipped, and a warning is logged for
    every requested position that yields no coordinates.
    """
    # parse command-line arguments
    logging.info('Parsing command-line arguments ...')
    args = parse_cmd_args()

    # read in the list of residue IDs
    logging.info(
        'Reading in the list of residue sequence positions from %s',
        args.input)
    with open(args.input, 'rt') as ipf:
        # A blank line (typically a trailing newline) would make int() fail.
        res_ids = [int(line) for line in ipf if line.strip()]

    # parse the PDB file
    logging.info('Parsing the PDB file %s', args.pdb)
    pdb_parser = PDBParser()
    pdb_id = os.path.basename(args.pdb).split('.')[0]
    structure = pdb_parser.get_structure(id=pdb_id, file=args.pdb)
    model = structure[0]

    # Index the residues once by sequence position instead of rescanning the
    # model for every requested position. Residues lacking a CA atom (e.g.
    # waters or ligands that share a sequence number) are left out so that
    # they cannot raise KeyError below.
    residues_by_pos = {}
    for residue in model.get_residues():
        if 'CA' in residue:
            residues_by_pos.setdefault(residue.get_id()[1], []).append(residue)

    # get x, y coordinates of alpha carbon for each residue
    logging.info('Extracting x, y coordinates ...')
    xy_coords = []
    for pos in res_ids:
        matches = residues_by_pos.get(pos)
        if not matches:
            logging.warning(
                'No residue with an alpha carbon at position %d', pos)
            continue
        xy_coords.extend(residue['CA'].coord[:2] for residue in matches)

    # write to file
    logging.info('Writing extracted coordinates to %s', args.output)
    with open(args.output, 'wt') as opf:
        all_coords_str = ['%.2f, %.2f' % tuple(coord) for coord in xy_coords]
        opf.write('\n'.join(all_coords_str))
def __init__(self, structure, name='structure', path='.'):
    """Wrap a structure given either as a file name or as a Bio.PDB entity.

    Parameters
    ----------
    structure : str or Bio.PDB.Entity.Entity
        A file name (joined with ``path``) ending in ``pdb``/``ent`` or
        ``cif``, or an already parsed entity that is stored as-is.
    name : str
        Identifier passed to the parser and stored on the instance.
    path : str
        Directory joined in front of the file name.

    Raises
    ------
    ModuleNotFoundError
        If BioPython cannot be imported.
    ValueError
        If the file extension or the type of ``structure`` is unsupported.
    """
    try:
        from Bio.PDB import PDBParser, MMCIFParser
        from Bio.PDB.Entity import Entity
    except ModuleNotFoundError:
        raise ModuleNotFoundError(
            "BioPython is a required dependency for structure-related functions!"
        )

    if isinstance(structure, str):
        # The text after the last dot decides which parser is used.
        extension = str(structure).split('.')[-1].lower()
        if extension in ('pdb', 'ent'):
            loader = PDBParser(PERMISSIVE=1, QUIET=True)
        elif extension == 'cif':
            loader = MMCIFParser(QUIET=True)
        else:
            raise ValueError(
                "Unknown filetype for structure file name: {}".format(
                    structure))
        self.structure = loader.get_structure(
            name, os.path.join(path, structure))
    elif isinstance(structure, Entity):
        # use structure as-is
        self.structure = structure
    else:
        raise ValueError(
            "Unknown type for input argument 'structure': {}".format(
                str(structure)))

    # properties
    self.name = name

    # cachable properties
    self.cache = {}
    self._atom_KDTree = None
    self._atom_list = None
    self._surface_residues = None
def dlSortedStrucs(prots: pd.DataFrame) -> None:
    '''
    Downloads the structures listed in column ``0`` of ``prots`` into a
    fresh, timestamped directory and renames them to ``<id>.pdb``.

    The directory is named ``ds_<year>_<month>_<day>_<hour>_<minute>_<second>``
    (values are not zero padded) and is created relative to the current
    working directory. Nothing is done when ``prots`` is empty.

    :param prots: data frame whose column ``0`` holds the PDB codes.
    :return: None
    '''
    # check if the prots df is empty, if it is exit the function
    if prots.empty:
        return

    now = datetime.datetime.now()
    timestamp = (f"{now.year}_{now.month}_{now.day}_"
                 f"{now.hour}_{now.minute}_{now.second}")
    PDB_dl_dir = "ds_" + timestamp

    pdbl = PDBList()

    # Download every listed structure in PDB format into the new directory.
    for pdbid in prots[0]:  # index the zeroth col
        pdbl.retrieve_pdb_file(
            pdb_code=pdbid, file_format='pdb', pdir=PDB_dl_dir)

    print('\n#############~DOWNLOAD COMPLETE~#############\n')

    # Collect the entries first: renaming files while the directory is still
    # being scanned has unspecified results.
    with os.scandir(PDB_dl_dir) as entries:
        downloaded = [
            entry for entry in entries
            if entry.is_file() and entry.path.endswith(".ent")
        ]

    for entry in downloaded:
        stem = entry.name[:-len(".ent")]
        # Strip only the leading "pdb" prefix; a blanket replace would also
        # eat a "pdb" that is part of the ID itself.
        if stem.startswith("pdb"):
            stem = stem[len("pdb"):]
        os.rename(entry.path, os.path.join(PDB_dl_dir, stem + ".pdb"))

    return
def build_complex(file_1, file_2):
    """
    Try to add one chain of a pairwise interaction to an existing complex.

    The structure in ``file_2`` is superimposed onto the structure in
    ``file_1`` using the atom lists returned by ``Compute_equal_chain``. The
    chain of ``file_2`` that was *not* used for the superposition is then
    passed to ``check_clash``; when that check is truthy, ``file_1`` is
    overwritten with all chains of the complex plus the moved chain and
    ``rename_complex_chains`` is run on it.

    @ Input - Two file paths for PDB interactions.
    @ Output - True if ``file_1`` was rewritten with the extra chain,
               False if the chain could not be added.
    """
    parser = PDBParser(PERMISSIVE=1)
    structure_1 = parser.get_structure('Complex', file_1)
    structure_2 = parser.get_structure('Complex', file_2)
    sup = Superimposer()
    io = PDBIO()

    atoms_fixed, atoms_moving = Compute_equal_chain(structure_1, structure_2)
    try:
        sup.set_atoms(atoms_fixed, atoms_moving)
    except Exception:
        # Superposition impossible (e.g. atom lists of different length).
        return False

    moving_atoms = list(atoms_moving)
    if not moving_atoms:
        # Nothing in common: there is no chain to anchor the superposition.
        return False

    sup.apply(list(structure_2.get_atoms()))

    # Chain id of the atoms used for the superposition; the chain to add is
    # the (last) chain of structure_2 with a different id.
    anchor_chain_id = moving_atoms[0].get_full_id()[2]
    moved_chain = None
    for chain in structure_2[0].get_chains():
        if chain.id != anchor_chain_id:
            moved_chain = chain
    if moved_chain is None:
        return False

    if check_clash(structure_1, moved_chain):
        with open(file_1, "wt") as out_file:
            for chain in list(structure_1.get_chains()) + [moved_chain]:
                io.set_structure(chain)
                io.save(out_file)
        rename_complex_chains(file_1)
        return True
    return False
def test_model_numbering(self):
    """Preserve model serial numbers during I/O."""

    def confirm_numbering(struct):
        """Check that the structure has 20 consecutively numbered models."""
        self.assertEqual(len(struct), 20)
        for idx, model in enumerate(struct):
            # assertEqual, not assertTrue: assertTrue(a, b) would treat b
            # as the failure message and pass for any non-zero serial_num.
            self.assertEqual(model.serial_num, idx + 1)
            self.assertEqual(model.serial_num, model.id + 1)

    parser = PDBParser()
    struct1 = parser.get_structure("1mot", "PDB/1MOT.pdb")
    confirm_numbering(struct1)

    # Round trip: serialize and parse again
    io = PDBIO()
    io.set_structure(struct1)
    filenumber, filename = tempfile.mkstemp()
    os.close(filenumber)
    try:
        io.save(filename)
        struct2 = parser.get_structure("1mot", filename)
        confirm_numbering(struct2)
    finally:
        os.remove(filename)
def pdb2fasta(pdb_file, num_chains=None):
    """Render the chains of a PDB file as one FASTA formatted string.

    Each chain contributes a header line ``>ID:CHAIN<TAB>LENGTH`` followed by
    its one-letter sequence wrapped at 80 characters. When ``num_chains`` is
    given and differs from the number of chains found, a warning is printed
    and an empty string is returned.
    """
    pdb_id = basename(pdb_file).split('.')[0]
    structure = PDBParser().get_structure(pdb_id, pdb_file)

    real_num_chains = sum(1 for _ in structure.get_chains())
    if num_chains is not None and num_chains != real_num_chains:
        print('WARNING: Skipping {}. Expected {} chains, got {}'.format(
            pdb_file, num_chains, real_num_chains))
        return ''

    max_line_length = 80
    pieces = []
    for chain in structure.get_chains():
        seq = seq1(''.join(residue.resname for residue in chain))
        pieces.append('>{}:{}\t{}\n'.format(pdb_id, chain.id, len(seq)))
        pieces.extend(
            seq[start:start + max_line_length] + '\n'
            for start in range(0, len(seq), max_line_length))
    return ''.join(pieces)
def test_2(self):
    """Check header fields parsed from a second PDB file (2BEG)."""
    struct = PDBParser().get_structure("2BEG", "PDB/2BEG.pdb")

    # (header key, expected text) pairs; comparison is case-insensitive.
    expected_header = (
        ("author", "T.Luhrs,C.Ritter,M.Adrian,D.Riek-Loher,B.Bohrmann,H.Dobeli,D.Schubert,R.Riek"),
        ("deposition_date", "2005-10-24"),
        ("head", "protein fibril"),
        ("journal", "AUTH T.LUHRS,C.RITTER,M.ADRIAN,D.RIEK-LOHER,B.BOHRMANN,AUTH 2 H.DOBELI,D.SCHUBERT,R.RIEKTITL 3D STRUCTURE OF ALZHEIMER'S AMYLOID-{BETA}(1-42)TITL 2 FIBRILS.REF PROC.NATL.ACAD.SCI.USA V. 102 17342 2005REFN ISSN 0027-8424PMID 16293696DOI 10.1073/PNAS.0506723102"),
        ("journal_reference", "t.luhrs,c.ritter,m.adrian,d.riek-loher,b.bohrmann, h.dobeli,d.schubert,r.riek 3d structure of alzheimer's amyloid-{beta}(1-42) fibrils. proc.natl.acad.sci.usa v. 102 17342 2005 issn 0027-8424 16293696 10.1073/pnas.0506723102 "),
        ("keywords", "alzheimer's, fibril, protofilament, beta-sandwich, quenched hydrogen/deuterium exchange, pairwise mutagenesis, protein fibril"),
        ("name", "3d structure of alzheimer's abeta(1-42) fibrils"),
        ("release_date", "2005-11-22"),
        ("structure_method", "solution nmr"),
    )

    for field, wanted in expected_header:
        self.assertEqual(struct.header[field].lower(), wanted.lower())
def extract_structures(self, infolder):
    """
    Clean the PDB files in ``infolder`` and load them into self.structures.

    The external script ``Modules/Trans/Pyry_cleanPDB.py`` is run on the
    folder first; afterwards every ``*.pdb`` file directly inside the folder
    is parsed, wrapped in an ``InStructure`` together with its file name and
    appended to ``self.structures``.

    Raises PyRy3D_IG_Error when the folder holds no ``.pdb`` file or when a
    parsed file contains no residues.
    """
    import subprocess

    # Argument list instead of a shell string, so that folder names with
    # spaces or shell metacharacters reach the script unchanged.
    subprocess.call(
        ["python", "Modules/Trans/Pyry_cleanPDB.py", "-q", "-d",
         str(infolder)])

    pdb_files = glob.glob(str(infolder) + '/*.pdb')
    if len(pdb_files) == 0:
        raise PyRy3D_IG_Error("The files you provided are not pdb files")

    parser = PDBParser(PERMISSIVE=False, QUIET=True)
    for pdbfile in pdb_files:
        structure = parser.get_structure(str(pdbfile), pdbfile)
        print(pdbfile)
        # basename works for every separator and folder depth; splitting on
        # a backslash and taking element 1 only worked for a single-level
        # folder on Windows.
        filename = os.path.basename(pdbfile)
        struc = InStructure(structure, filename)
        if len(list(structure.get_residues())) == 0:
            raise PyRy3D_IG_Error(
                "The file you provided for structure %s is not a valid pdb file"
                % (structure.id))
        self.structures.append(struc)
def create(self, pdb):
    """Build the amino acid network with biographs and relabel its nodes.

    Every node label is rewritten as ``<residue code><label[1:]>:<label[0]>``
    where the residue code is the ``three2one`` translation of the residue
    name, or the residue name itself when no translation exists. Residues
    and nodes are paired purely by iteration order.
    """
    self.net = bg.Pmolecule(pdb).network(cutoff=self.cutoff, weight=True)
    self.structure = PDBParser().get_structure('X', pdb)[0]

    residue_codes = [
        self.three2one[res.resname]
        if res.resname in self.three2one else res.resname
        for res in self.structure.get_residues()
    ]

    original_nodes = self.net.nodes
    mapping = {
        node: code + node[1:] + ':' + node[0]
        for code, node in zip(residue_codes, original_nodes)
    }
    self.net = nx.relabel_nodes(self.net, mapping)
    return self.net
def split_pdb_by_chain(pdb_id):
    """Write every chain of a PDB entry to its own file and collect sequences.

    The entry is read from ``ent_files/pdb<id>.ent``. Each chain of each
    model is saved as ``pdb_chains/<ID>/<ID>-<model number>_<chain>.pdb``
    unless that file already exists. The returned dict maps the output file
    name to the chain sequence assembled from its polypeptides.
    """
    entry_dir = "pdb_chains/" + pdb_id.upper()
    if not os.path.isdir(entry_dir):
        os.mkdir(entry_dir)

    structure = PDBParser().get_structure(
        pdb_id, "ent_files/pdb" + pdb_id.lower() + ".ent")

    sequences = dict()
    for model in structure:
        # Model ids are zero based; file names count from one.
        model_number = str(model.get_id() + 1)
        for chain in model:
            outfilename = (pdb_id.upper() + "-" + model_number + "_"
                           + str(chain.get_id()) + ".pdb")
            chain_path = entry_dir + "/" + outfilename
            if not os.path.isfile(chain_path):
                writer = PDBIO()
                writer.set_structure(chain)
                writer.save(chain_path)

            chain_seq = Seq("", generic_protein)
            for peptide in PPBuilder().build_peptides(chain):
                chain_seq += peptide.get_sequence()
            sequences[outfilename] = chain_seq
    return sequences
def read(self, f_handle, f_id="pdb", distance_cutoff=8, atom_type='CB'):
    """Parse a PDB structure and hand it to ``self._read``

    Parameters
    ----------
    f_handle
       Open file handle [read permissions]
    f_id : str, optional
       Unique contact file identifier
    distance_cutoff : int, optional
       Distance cutoff for which to determine contacts [default: 8]
    atom_type : str, optional
       Atom type between which distances are calculated [default: CB]

    Returns
    -------
    :obj:`ContactFile <conkit.core.contactfile.ContactFile>`

    """
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("pdb", f_handle)
    return self._read(structure, f_id, distance_cutoff, atom_type)
def get_mol_from_ligandpdb(ligand):
    """Collect atom names, indices and elements of a ligand's ideal PDB file.

    Reads ``./pdb_files/<ligand>_ideal.pdb`` and inspects every residue
    whose name equals ``ligand``.

    Returns a tuple ``(names, name_to_idx, name_to_element)`` where ``names``
    lists atom ids in file order, ``name_to_idx`` maps an atom id to its
    serial number minus one and ``name_to_element`` maps it to the element.
    ``(None, None, None)`` is returned when the file is missing or no
    matching atom was found.
    """
    pdb_path = './pdb_files/'+ligand+'_ideal.pdb'
    if not os.path.exists(pdb_path):
        return None, None, None

    name_order_list = []
    name_to_idx_dict = {}
    name_to_element_dict = {}

    structure = PDBParser().get_structure(ligand, pdb_path)
    for model in structure:
        for chain in model:
            for res in chain:
                if res.get_resname() != ligand:
                    continue
                for atom in res:
                    atom_name = atom.get_id()
                    name_order_list.append(atom_name)
                    name_to_element_dict[atom_name] = atom.element
                    name_to_idx_dict[atom_name] = atom.get_serial_number()-1

    if not name_to_idx_dict:
        return None, None, None
    return name_order_list, name_to_idx_dict, name_to_element_dict
def setUp(self):
    """Load model 1 of the test structure, drop chain B and sanity-check chain A."""
    pdb_filename = "PDB/a_structure.pdb"
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", PDBConstructionWarning)
        structure = PDBParser(PERMISSIVE=True).get_structure(
            "X", pdb_filename)

    model = structure[1]
    model.detach_child("B")
    self.model = model

    # Look at first chain only
    a_residues = list(model["A"].child_list)
    self.assertEqual(86, len(a_residues))

    # Spot-check both ends of the chain.
    leading = [residue.get_resname() for residue in a_residues[:4]]
    self.assertEqual(leading, ["CYS", "ARG", "CYS", "GLY"])
    trailing = [residue.get_resname() for residue in a_residues[-3:]]
    self.assertEqual(trailing, ["TYR", "ARG", "CYS"])

    self.a_residues = a_residues
    self.radius = 13.0
def get_atoms(pdb_file):
    """Return a list with the names of the atoms in the given pdb file.

    This function relies on the Biopython module, so it will not work if
    that module is not installed.

    The file is parsed with ``PDBParser`` (the structure id is the file name
    without its last four characters) and the name of every atom yielded by
    ``structure.get_atoms()`` is collected.

    Raises NoProtError, after printing the same message, when no atom is
    found.
    """
    structure = PDBParser().get_structure(pdb_file[:-4], pdb_file)
    atoms = [atom.get_name() for atom in structure.get_atoms()]
    if not atoms:
        print('404 - No Atoms Found!!!')
        raise NoProtError('404 - No Atoms Found!!!')
    return atoms
def get_bonds(dir, atoms_list):
    """Measure the bond between two alternating atom types in every chain.

    Every entry matched by ``<dir>/*`` is parsed as a PDB file. Within each
    chain of the first model the atoms are scanned in order, alternately
    looking for ``atoms_list[0]`` and ``atoms_list[1]``; the coordinates of
    the hits are grouped into consecutive pairs and each pair is passed to
    ``bond_length``.

    :param dir: directory holding the structure files.
    :param atoms_list: the two atom ids forming the bond, in search order.
    :return: list with one ``bond_length`` result per atom pair.
    """
    import os

    parser = PDBParser()
    points = []
    for struct in glob.glob(dir + '/*', recursive=True):
        print(struct)
        # File name up to its first dot; independent of the path separator
        # and of how many components ``dir`` has.
        structure_id = os.path.basename(struct).split('.')[0]
        structure = parser.get_structure(structure_id, struct)
        for chain in structure[0]:  # model1
            wanted = 0  # index into atoms_list of the atom searched next
            coords = []
            for residue in chain:
                for atom in residue:
                    if atom.get_id() == atoms_list[wanted]:
                        coords.append(list(atom.get_vector()))
                        wanted = (wanted + 1) % 2
            # Ignore a trailing atom that has no partner.
            paired = len(coords) // 2 * 2
            for start in range(0, paired, 2):
                # Exactly the two coordinates of this bond (the former
                # ``x:x + 100`` slice dragged in up to 98 unrelated ones).
                points.append(bond_length(coords[start:start + 2]))
    return points
def check_NMR():
    """Record the experimental method of every listed PDB entry.

    Reads the ``pdb`` column of ``pdb_no_missing_residue.csv``, makes sure
    ``./pdbs/pdb<code>.ent`` exists for each entry (downloading it when
    necessary), reads ``structure_method`` from the parsed header and writes
    the result to ``pdbs_methods.csv`` with columns ``pdb`` and ``method``.
    Entries whose file cannot be obtained get the method ``'no pdb'``.
    """
    from Bio.PDB import PDBList, PDBParser
    import pandas as pd
    import os

    pdb_list = pd.read_csv('pdb_no_missing_residue.csv')['pdb'].values
    pdbl = PDBList()
    parser = PDBParser()

    method_list = []
    for pdb_id in pdb_list:
        code = pdb_id[:4]
        ent_path = f'./pdbs/pdb{code.lower()}.ent'

        # Download only what is not cached yet. Every entry must still add
        # exactly one item to method_list, otherwise the two columns of the
        # data frame below end up with different lengths.
        if not os.path.exists(ent_path):
            pdbl.retrieve_pdb_file(code, pdir='./pdbs', file_format='pdb')
        if not os.path.exists(ent_path):
            method_list.append('no pdb')
            continue

        structure = parser.get_structure('X', ent_path)
        method_list.append(structure.header['structure_method'])

    df = pd.DataFrame({'pdb': pdb_list, 'method': method_list})
    df.to_csv('pdbs_methods.csv', index=False)
def get_around_residue(listresids, pdb, chainid, cutoff):
    """
    Returns list of residues within cutoff distance of the ones specified,
    and the number of residues in the chain_id specified.

    Only C-alpha atoms of the given chain (first model) are searched, so a
    residue counts as nearby when its CA lies within ``cutoff`` of *any*
    atom of one of the residues in ``listresids``.

    :param listresids: residue keys usable to index the chain.
    :param pdb: PDB file to parse.
    :param chainid: chain identifier (converted with ``str``).
    :param cutoff: search radius (converted with ``float``).
    :return: ``(sorted residue numbers, number of CA atoms in the chain)``.
    """
    structure = PDBParser(QUIET=True).get_structure('X', pdb)
    chain = structure[0][str(chainid)]
    center_residues = [chain[resi] for resi in listresids]

    # The second argument is the target *level* ('A' = atoms), not a chain
    # id; passing the chain id only worked for a chain that is named 'A'.
    center_atoms = Selection.unfold_entities(center_residues, 'A')

    atom_list = [atom for atom in chain.get_atoms() if atom.name == 'CA']
    ns = NeighborSearch(atom_list)
    nearby_residues = {
        res
        for center_atom in center_atoms
        for res in ns.search(center_atom.coord, float(cutoff), 'R')
    }
    return sorted(res.id[1] for res in nearby_residues), len(atom_list)
def check_file_format(pdb_file: Union[str, Path], make_parser: bool = False):
    """Guess the structure file format and entry code from a file name.

    ``pdb<code>.ent`` is reported as format ``'pdb'`` with the ``pdb``
    prefix removed from the code; ``<code>.cif`` is reported as ``'mmcif'``.
    Any other name yields an empty format string and the file stem as code.

    :param pdb_file: path of the structure file (only the name is examined).
    :param make_parser: also create a matching parser object.
    :return: ``(file_format, code)``, or ``(file_format, code, parser)``
        when ``make_parser`` is true.
    :raises ValueError: if a parser is requested for an unrecognised format.
    """
    pdb_file = Path(pdb_file)
    stem = pdb_file.stem
    suffix = pdb_file.suffix

    # TODO: reliable check needs to peek into the file
    if suffix == '.ent' and stem.startswith('pdb'):
        file_format, code = 'pdb', stem[3:]
    elif suffix == '.cif':
        file_format, code = 'mmcif', stem
    else:
        file_format, code = '', stem

    if not make_parser:
        return file_format, code

    if file_format == 'pdb':
        parser = PDBParser(PERMISSIVE=True, QUIET=True)
    elif file_format == 'mmcif':
        parser = MMCIFParser(QUIET=True)
    else:
        raise ValueError(f'parser does not support the file format: {str(pdb_file)}')
    return file_format, code, parser
def extract_beads(pdb_file):
    """
    Convert a PDB file into a per-residue bead table and save it as CSV.

    Reads ``data/dock/pdb/<pdb_file>.pdb``, keeps the residues whose name is
    listed (upper-cased) in column ``AA3C`` of ``data/amino_acids.csv`` and
    writes one row per residue - chain id, residue number, residue name and
    the x/y/z bead center from ``get_bead_center`` - to
    ``data/dock/beads/<pdb_file>_bead.csv``.

    :param pdb_file: file stem of the structure (no directory, no extension).
    :return: None
    """
    amino_acids = pd.read_csv('data/amino_acids.csv')
    vocab_aa = [name.upper() for name in amino_acids.AA3C]

    structure = PDBParser().get_structure(
        'X', f'data/dock/pdb/{pdb_file}.pdb')

    records = []  # (chain id, residue number, residue name)
    centers = []
    for res in Selection.unfold_entities(structure, 'R'):
        res_name = res.get_resname()
        if res_name not in vocab_aa:
            # residues outside the vocabulary are silently skipped
            continue
        records.append((res.parent.id, res.id[1], res_name))
        centers.append(get_bead_center(res))

    g_center = np.vstack(centers)
    df = pd.DataFrame({
        'chain_id': [record[0] for record in records],
        'group_num': [record[1] for record in records],
        'group_name': [record[2] for record in records],
        'x': g_center[:, 0],
        'y': g_center[:, 1],
        'z': g_center[:, 2]
    })
    df.to_csv(f'data/dock/beads/{pdb_file}_bead.csv', index=False)
def testBioPDB(self):
    """Compare a structure built from Bio.PDB with one read directly.

    Does nothing beyond printing a notice when Bio.PDB is not importable.
    """
    try:
        from Bio.PDB import PDBParser
    except ImportError:
        print("Can't import Bio.PDB, tests skipped")
        return

    pdb_path = "lib/tests/data/1a0q.pdb"
    bp_structure = PDBParser(QUIET=True).get_structure("Ubiquitin", pdb_path)
    s1 = structureFromBioPDB(bp_structure)
    s2 = Structure("lib/tests/data/1a0q.pdb")
    self.assertTrue(s1.nAtoms() == s2.nAtoms())

    # make sure we got the insertion code
    self.assertEqual(s1.residueNumber(2286), '82A')

    for i in range(s2.nAtoms()):
        self.assertTrue(s1.radius(i) == s2.radius(i))

        # there can be tiny errors here
        for axis in range(3):
            self.assertTrue(
                math.fabs(s1.coord(i)[axis] - s2.coord(i)[axis]) < 1e-5)

        # whitespace won't match
        self.assertIn(s1.residueNumber(i), s2.residueNumber(i))

    def near(value, reference):
        # because Bio.PDB structures will have slightly different
        # coordinates (due to rounding errors) we set the
        # tolerance as high as 1e-3
        return math.fabs(value - reference) < 1e-3

    result = calc(s1, Parameters({'algorithm' : LeeRichards,
                                  'n-slices' : 20}))
    self.assertTrue(near(result.totalArea(), 18923.280586))
    sasa_classes = classifyResults(result, s1)
    self.assertTrue(near(sasa_classes['Polar'], 9143.066411))
    self.assertTrue(near(sasa_classes['Apolar'], 9780.2141746))

    result, sasa_classes = calcBioPDB(
        bp_structure, Parameters({'algorithm' : ShrakeRupley}))
    self.assertTrue(near(result.totalArea(), 18908.900192))
    self.assertTrue(near(sasa_classes['Polar'], 9120.7423269))
    self.assertTrue(near(sasa_classes['Apolar'], 9788.157865))
def get_dfrominterface(pdb_fh):
    """
    Estimate each residue's distance from the interface of a dimer.

    Chains ``A`` and ``B`` of the first model are filtered down to amino
    acid residues and paired by position. For every pair that carries the
    same residue number, half of the CA-CA distance between the two
    residues is stored under that residue number.

    NOTE(review): pairing is positional, so a residue missing from only one
    chain shifts all later pairs and they are dropped by the residue number
    check - confirm this is acceptable for the inputs used.

    :param pdb_fh: path to .pdb file.
    :returns dinter: pandas table with distances from dimer interface,
        indexed by residue number (index name ``refi``).
    :raises ValueError: if the first model does not hold exactly two chains.
    """
    junk_residues = [
        "HOH", " MG", "CA", " NA", "SO4", "IOD", "NA", "CL", "GOL", "PO4"
    ]
    pdb_parser = PDBParser()
    pdb_data = pdb_parser.get_structure("pdb_name", pdb_fh)
    model = pdb_data[0]

    # Without this guard a non-dimer input failed further down with an
    # unrelated "referenced before assignment" error.
    if len(model.child_dict) != 2:
        raise ValueError(
            "expected exactly 2 chains in the first model of %s, found %d"
            % (pdb_fh, len(model.child_dict)))
    chainA = model["A"]  # only a chain
    chainB = model["B"]  # only a chain

    def get_resobjs(chain):
        """Return the amino acid residues of ``chain`` (junk removed)."""
        return [
            residue for residue in chain
            if residue.get_resname() not in junk_residues
            and residue.get_resname() in aas_21_3letter  # only aas
        ]

    chainA_resobjs = get_resobjs(chainA)
    chainB_resobjs = get_resobjs(chainB)

    dfrominter = pd.DataFrame(columns=['Distance from dimer interface'])
    for resA, resB in zip(chainA_resobjs, chainB_resobjs):
        if resA.get_id()[1] == resB.get_id()[1]:
            dfrominter.loc[resA.get_id()[1], 'Distance from dimer interface'] = \
                (resA['CA'] - resB['CA']) / 2
    dfrominter.index.name = 'refi'
    return dfrominter
def protein_dist_angle_matrix(pdb_file, mask=None):
    """Stack the distance and angle matrices computed for a PDB file.

    Coordinates of CB (falling back to CA), CA, CB and N are gathered for
    every residue; a missing atom is represented by ``[0, 0, 0]``. Residues
    whose CB coordinates sum to zero are masked out of the three angle
    matrices.

    :param pdb_file: path of the PDB file.
    :param mask: optional per-residue byte mask; all ones when omitted.
    :return: ``torch.stack`` of the distance matrix, the CB-CB dihedral,
        the CA-CB dihedral and the CA-CB-CB planar matrices, in that order.
    """
    parser = PDBParser()
    structure = parser.get_structure(
        splitext(basename(pdb_file))[0], pdb_file)
    residues = list(structure.get_residues())

    def first_coord(residue, *atom_types):
        """Coordinate of the first listed atom present, else the origin."""
        for atom_type in atom_types:
            if atom_type in residue:
                return residue[atom_type].get_coord()
        return [0, 0, 0]

    cb_ca_coords = torch.tensor([first_coord(r, 'CB', 'CA') for r in residues])
    ca_coords = torch.tensor([first_coord(r, 'CA') for r in residues])
    cb_coords = torch.tensor([first_coord(r, 'CB') for r in residues])
    n_coords = torch.tensor([first_coord(r, 'N') for r in residues])

    cb_mask = torch.ByteTensor(
        [1 if sum(xyz) != 0 else 0 for xyz in cb_coords])
    if mask is None:
        mask = torch.ByteTensor([1] * len(cb_coords))
    angle_mask = mask & cb_mask

    layers = [
        generate_dist_matrix(cb_ca_coords, mask=mask),
        generate_cb_cb_dihedral(ca_coords, cb_coords, mask=angle_mask),
        generate_ca_cb_dihedral(ca_coords, cb_coords, n_coords,
                                mask=angle_mask),
        generate_ca_cb_cb_planar(ca_coords, cb_coords, mask=angle_mask),
    ]
    return torch.stack(layers)
def test_1_warnings(self):
    """Check warnings: Parse a flawed PDB file in permissive mode.

    The warnings are collected with ``warnings.catch_warnings(record=True)``
    so that both the ``'always'`` filter and the capture hook are undone
    when the block exits, whatever happens inside it.
    """
    expected_messages = [
        # Expected warning messages:
        "Used element 'N' for Atom (name=N) with given element ''",
        "Used element 'C' for Atom (name=CA) with given element ''",
        "Atom names ' CA ' and 'CA ' differ only in spaces at line 17.",
        "Used element 'CA' for Atom (name=CA ) with given element ''",
        'Atom N defined twice in residue <Residue ARG het= resseq=2 icode= > at line 21.',
        'disordered atom found with blank altloc before line 33.',
        "Residue (' ', 4, ' ') redefined at line 43.",
        "Blank altlocs in duplicate residue SER (' ', 4, ' ') at line 43.",
        "Residue (' ', 10, ' ') redefined at line 75.",
        "Residue (' ', 14, ' ') redefined at line 106.",
        "Residue (' ', 16, ' ') redefined at line 135.",
        "Residue (' ', 80, ' ') redefined at line 633.",
        "Residue (' ', 81, ' ') redefined at line 646.",
        'Atom O defined twice in residue <Residue HOH het=W resseq=67 icode= > at line 822.'
    ]

    with warnings.catch_warnings(record=True) as all_warns:
        warnings.simplefilter('always', PDBConstructionWarning)
        # Trigger warnings
        p = PDBParser(PERMISSIVE=True)
        p.get_structure("example", "PDB/a_structure.pdb")

    self.assertEqual(len(all_warns), 14)
    for wrn, msg in zip(all_warns, expected_messages):
        # Recorded entries wrap the warning; its text is in ``.message``.
        self.assertTrue(msg in str(wrn.message), str(wrn.message))
def pdb_sequence(pdb_file, id=None, method="order"):
    """Extract the sequence of every chain of a PDB file.

    :param pdb_file: path of the PDB file.
    :param id: structure id; derived from the file name when ``None``.
    :param method: ``"order"`` translates every residue of the chain in
        file order and records a ``pdb_seq_spec`` per residue;
        ``"distance"`` concatenates the polypeptides built by
        ``CaPPBuilder`` and provides no per-residue spec.
    :return: ``pdb_seqs`` holding the per-chain records as a list
        (``chains``) and keyed by chain id (``chains_map``).
    :raises ValueError: for any other ``method``.
    """
    from Bio.PDB import PDBParser, CaPPBuilder
    from Bio.PDB.Polypeptide import three_to_one

    if id is None:
        id = util.make_id_from_file_name(pdb_file)

    structure = PDBParser().get_structure(id, pdb_file)

    seq_chains = []
    for chain in structure.get_chains():
        id_chain = chain.get_id()

        if method == "distance":
            peptides = CaPPBuilder().build_peptides(chain)
            seq = sum((peptide.get_sequence() for peptide in peptides),
                      Seq("", IUPAC.protein))
            seq_spec = None  # TODO: implement
        elif method == "order":
            letters = []
            seq_spec = []
            for res in chain.get_residues():
                letters.append(three_to_one(res.get_resname()))
                # from Bio docs, res.get_full_id() returns:
                # ("1abc", 0, "A", (" ", 10, "A"))
                full_id = res.get_full_id()
                residue_id = full_id[-1]
                seq_spec.append(
                    pdb_seq_spec(chain=full_id[-2].strip(),
                                 resn=res.get_resname(),
                                 resi=residue_id[-2],
                                 ins=residue_id[-1].strip()))
            seq = Seq("".join(letters), IUPAC.protein)
        else:
            raise ValueError("Unknown method: {}".format(method))

        record = SeqRecord(seq,
                           id="{}_{}".format(id, id_chain),
                           description="")
        seq_chains.append(
            dict(id_chain=id_chain, seq_rec=record, seq_spec=seq_spec))

    chains_map = {entry["id_chain"]: entry for entry in seq_chains}
    return pdb_seqs(id=id, chains=seq_chains, chains_map=chains_map)
def SecStr(pdb_id, chain_id, start, stop):
    """Collect DSSP secondary structure and phi/psi angles for a region.

    The structure is read from ``/home/m.pak/pdb/pdb<pdb_id>.pdb`` (the id
    is lower-cased first) and DSSP is run on its first model.

    :param pdb_id: PDB identifier.
    :param chain_id: chain to query.
    :param start: first residue number of the region.
    :param stop: last residue number of the region (inclusive).
    :return: ``(sec_str, phi_lst, psi_lst)`` - the secondary structure
        string and the lists of phi and psi angles for every residue found,
        or ``(None, None, None)`` when the file is missing or DSSP fails.
        Residues absent from the DSSP result are reported and skipped;
        hetero flags and insertion codes are not supported.
    """
    # Change pdb_id to lower cases - as in local pdb db.
    pdb_id = pdb_id.lower()
    pdb_path = f'/home/m.pak/pdb/pdb{pdb_id}.pdb'

    # Read pdb structure if it exists.
    parser = PDBParser()
    try:
        structure = parser.get_structure(pdb_id, pdb_path)
    except FileNotFoundError:
        print(f'File not found, proceed... {pdb_id}')
        return None, None, None
    model = structure[0]

    # Run DSSP. Any failure of the external tool is treated as "no data",
    # but interpreter-level signals such as KeyboardInterrupt now propagate.
    try:
        dssp = DSSP(model, pdb_path)
    except Exception:
        print(f'DSSP unable to process the structure {pdb_id}, proceed...')
        return None, None, None

    # Keep annotation of secondary structure elements, Phi and Psi angles
    # for the defined region of the structure.
    sec_str_parts = []
    phi_lst = []
    psi_lst = []

    # INCLUDES STOP!!!!
    for num in range(start, stop + 1):
        # Can not deal with hetero-flag and insertion code
        res_key = (chain_id, (' ', num, ' '))
        try:
            res = dssp[res_key]
        except KeyError:
            print(f'{res_key} not found in {pdb_id}, proceed...')
            continue
        sec_str_parts.append(res[2])
        phi_lst.append(res[4])
        psi_lst.append(res[5])

    return ''.join(sec_str_parts), phi_lst, psi_lst
def test_StructAlign(self):
    """Tests on module to align two proteins according to a FASTA alignment file."""
    p = PDBParser(QUIET=1)

    al_file = "PDB/alignment_file.fa"
    with open(al_file) as handle:
        records = AlignIO.read(handle, "fasta")

    s1 = p.get_structure("1", "PDB/2XHE.pdb")
    s2 = p.get_structure("2", "PDB/1A8O.pdb")
    m1 = s1[0]
    m2 = s2[0]
    al = StructureAlignment(records, m1, m2)

    self.assertNotEqual(al.map12, al.map21)
    # assertEqual, not assertTrue: assertTrue(x, 566) only checks that x is
    # truthy and uses 566 as the failure message.
    # NOTE(review): the expected sizes were never actually enforced before;
    # confirm 566 and 70 against the test data.
    self.assertEqual(len(al.map12), 566)
    self.assertEqual(len(al.map21), 70)

    chain1_A = m1["A"]
    chain2_A = m2["A"]
    self.assertEqual(chain1_A[202].get_resname(), "ILE")
    self.assertEqual(chain2_A[202].get_resname(), "LEU")
    self.assertEqual(chain1_A[291].get_resname(), chain2_A[180].get_resname())
    self.assertNotEqual(chain1_A[291].get_resname(), chain2_A[181].get_resname())