def retrieve_ca_model(structure):
    """Build a reduced copy of *structure* with one main-chain atom per residue.

    Only the first model (``structure[0]``) is read.  For every residue the
    first atom named ``CA``, ``C4'`` or ``C4*`` is kept and stored under the
    name ``CA`` in a fresh residue that reuses the original sequence number
    and residue name.  Residues without such an atom are left out.

    Returns:
        A new ``Structure`` with id ``'clustering_model'`` holding a single
        model ``0`` with one chain per input chain.
    """
    main_chain_names = ("CA", "C4'", "C4*")

    reduced_struct = Structure('clustering_model')
    reduced_model = Model(0)
    reduced_struct.add(reduced_model)

    for ch in structure[0]:
        my_chain = Chain(ch.id)
        reduced_model.add(my_chain)
        for resi in ch:
            for atom in resi:
                if atom.get_name() not in main_chain_names:
                    continue
                my_residue = Residue((' ', resi.id[1], ' '), resi.get_resname(), ' ')
                ca_atom = Atom('CA', atom.coord, 0, ' ', ' ', 'CA',
                               atom.get_serial_number())
                my_chain.add(my_residue)
                my_residue.add(ca_atom)
                # One representative atom per residue: a second match would
                # try to add a residue with the same id to the chain again.
                break
    return reduced_struct
def renumber_windowed_model(self, structure: Structure, alphafold_mmCIF_dict: Dict) -> Structure:
    """Return a copy of *structure* renumbered to the AlphaFold window range.

    Residues of every chain are numbered consecutively, starting at the
    ``seq_db_align_begin`` value found in *alphafold_mmCIF_dict*.  All
    disorder flags on the copied residues and their atoms are cleared.

    Raises:
        AssertionError: if, for some chain, the number following the last
            assigned one is not ``seq_db_align_end + 1``.
    """
    # AlphaFold dictionary entries that describe the residue range covered by the structure
    window_start = int(alphafold_mmCIF_dict['_ma_target_ref_db_details.seq_db_align_begin'][0])
    window_end = int(alphafold_mmCIF_dict['_ma_target_ref_db_details.seq_db_align_end'][0])

    # Rebuild the hierarchy from scratch, top down
    result = Structure(structure.id)
    for model in structure:
        new_model = Model(model.id)
        for chain in model:
            new_chain = Chain(chain.id)
            position = window_start
            for residue in chain:
                clone = residue.copy()
                clone.id = (' ', position, ' ')
                # The copy routines fail to copy disorder properly, so
                # every notion of disorder is simply wiped out.
                for atom in clone:
                    atom.disordered_flag = 0
                clone.disordered = 0
                new_chain.add(clone)
                position += 1
            assert position == window_end + 1
            new_model.add(new_chain)
        result.add(new_model)
    return result
def create_new_chain(self, old_struct):
    """Return an empty Structure/Model/Chain skeleton named after ``old_struct.chain``.

    The structure id and the id of its only chain (inside model ``0``) are
    both taken from ``old_struct.chain``.  No residues are copied.
    """
    skeleton = Structure(old_struct.chain)
    model = Model(0)
    skeleton.add(model)
    # TODO: what if there are more chains in one component?
    model.add(Chain(old_struct.chain))
    return skeleton
def create_sphere_representation(self):
    """Represent the component by a single pseudo-atom at its centre.

    Calls ``self.calculate_centre_of_complex`` on ``self.fa_struct.struct``
    and stores the second and third returned values in ``self.molmass`` and
    ``self.radius``.  The first returned value becomes the coordinates of one
    ``CA`` atom in an ``ALA`` residue numbered 1.  The resulting structure is
    stored in ``self.cg_struct`` and handed to ``self.save_pdb``.
    """
    sphere_struct = Structure('sphrere')
    sphere_model = Model(0)
    sphere_struct.add(sphere_model)

    sphere_chain = Chain(self.fa_struct.chain)
    sphere_model.add(sphere_chain)

    centre, self.molmass, self.radius = self.calculate_centre_of_complex(
        self.fa_struct.struct)

    pseudo_residue = Residue((' ', 1, ' '), "ALA", ' ')
    pseudo_atom = Atom('CA', array(centre, 'f'), 0, 0, ' ', ' CA', 1)
    sphere_chain.add(pseudo_residue)
    pseudo_residue.add(pseudo_atom)

    self.cg_struct = sphere_struct
    self.save_pdb(sphere_struct, "dddd" + self.fa_struct.chain)
def splitOnePDB(fname, outPath):
    """Split the first model of a PDB file into ligand/receptor file pairs.

    Chains whose length falls outside ``MIN_SEQ_LEN``..``MAX_SEQ_LEN`` are
    detached first.  The length is the number of residues containing a ``CA``
    atom minus the number of residues that are not standard amino acids.
    Each remaining chain then acts as ligand against all other chains,
    except that a two-chain pairing is not written a second time with the
    roles swapped.  Files are written to *outPath* as
    ``<prefix>-<i>_l_u.pdb`` and ``<prefix>-<i>_r_u.pdb``.

    Returns:
        ``0`` when the file cannot be parsed, the model is unusable or fewer
        than two chains remain; ``None`` otherwise.
    """
    try:
        s = parser.get_structure(fname, fname)
    except Exception:
        print("Error loading pdb")
        return 0

    rejected_ids = []
    try:
        for chain in s[0]:
            n_bad = sum(1 for res in chain.get_list()
                        if not is_aa(res, standard=True))
            n_good = sum(1 for res in chain if "CA" in res) - n_bad
            if n_good < MIN_SEQ_LEN or n_good > MAX_SEQ_LEN:
                print(n_good)
                rejected_ids.append(chain.get_id())
    except KeyError:
        print("Not good model")
        return 0

    first_model = s[0]
    for rejected in rejected_ids:
        first_model.detach_child(rejected)

    if len(first_model.get_list()) < 2:
        print(s)
        print(first_model.get_list())
        print("Not enough good chains")
        return 0

    ligandChainList, receptorChainList = [], []
    for candidate in first_model:
        partners = [other for other in first_model if candidate != other]
        # With exactly two chains, skip the pairing that only swaps roles.
        if len(partners) > 1 or partners[0] not in ligandChainList:
            ligandChainList.append(candidate)
            receptorChainList.append(partners)

    prefix = os.path.basename(fname).split(".")[0]

    def _dump(chains, struct_id, out_name):
        # Re-parent the chains into a fresh single-model structure and save it.
        struct = Structure(struct_id)
        struct.add(Model(0))
        for chain in chains:
            chain.set_parent(struct[0])
            struct[0].add(chain)
        writer = PDBIO()
        writer.set_structure(struct)
        writer.save(os.path.join(outPath, out_name))

    for i, (ligandChain, receptorChains) in enumerate(
            zip(ligandChainList, receptorChainList)):
        _dump([ligandChain], prefix + "ligand",
              prefix + "-" + str(i) + "_l_u.pdb")
        _dump(receptorChains, prefix + "receptor",
              prefix + "-" + str(i) + "_r_u.pdb")
        print("ligand:", ligandChain, "receptor:", receptorChains)
def calculate_BSA(self):
    """Compute the buried surface area of the interface using NACCESS.

    The first two entries yielded by ``self.get_chains()`` are looked up in
    ``self.model`` and each is placed in its own temporary single-model
    structure.  Solvent accessible surface areas are computed for the whole
    model and for each isolated chain.

    Returns:
        ``[bsa, sas_A, sas_B, sas_tot]`` with ``bsa = sas_A + sas_B - sas_tot``.
    """
    # Extract list of chains in the interface only
    chains = list(self.get_chains())

    # Create temporary structures to feed NACCESS.
    # NOTE(review): adding the chains to the temporary models re-parents the
    # very same chain objects that live in self.model (no copy is made).
    structure_A = Structure("chainA")
    structure_B = Structure("chainB")
    mA = Model(0)
    mB = Model(0)
    mA.add(self.model[chains[0]])
    mB.add(self.model[chains[1]])
    structure_A.add(mA)
    structure_B.add(mB)

    # Each SASA is read immediately after its own NACCESS run.  The temporary
    # models share their residues/atoms with self.model, so -- presumably --
    # the per-chain runs overwrite the annotations left by the run on the
    # complex; reading the complex total last would then yield the
    # isolated-chain values.
    NACCESS_atomic(self.model)
    sas_tot = _get_atomic_SASA(self.model)

    NACCESS_atomic(structure_A[0])
    sas_A = _get_atomic_SASA(structure_A)

    NACCESS_atomic(structure_B[0])
    sas_B = _get_atomic_SASA(structure_B)

    # Calculate BSA
    bsa = sas_A + sas_B - sas_tot
    return [bsa, sas_A, sas_B, sas_tot]
def getStructFromFasta(self, fname, chainType):
    '''
    Builds a Bio.PDB.Structure object from the sequence stored in a fasta file.
    The hierarchy is filled from Structure down to Residue only: residues carry
    no atoms, hence no coordinates are available. Residues are numbered from 0
    and letters unknown to one_to_three become "UNK".

    :param fname: str. path to fasta file
    :param chainType: str. "l" or "r"; its upper-case form is used as chain id
    :return: Bio.PDB.Structure whose id is the prefix derived from fname
    '''
    # inputNumber is only used to report which partner fails in case of error
    partnerNumber = "1" if chainType == "l" else "2"
    seq = self.parseFasta(fname, inputNumber=partnerNumber)
    prefix = self.splitExtendedPrefix(self.getExtendedPrefix(fname))[0]

    struct = Structure(prefix)
    model = Model(0)
    chain = Chain(chainType.upper())
    struct.add(model)
    model.add(chain)

    for position, letter in enumerate(seq):
        try:
            resname = one_to_three(letter)
        except KeyError:
            resname = "UNK"
        chain.add(Residue((' ', position, ' '), resname, prefix))
    return struct
def slice(cls, obj, selection, name='slice'):
    """Create a new object 'S2' from a slice of the existing one, 'S1'.

    <selection> defines which descendants of 'S1' (``obj``) are stored in
    'S2'.  Every item of the selection is indexed as ``item[1]`` (model id),
    ``item[2]`` (chain id) and ``item[3]`` (residue id); the matching residue
    of ``obj`` is copied into the slice.

    Models and chains are created in the order in which they first appear in
    <selection>, and <selection> is traversed only once, so it may be any
    iterable (including a generator).

    Returns ``cls(ent, name=name)`` where ``ent`` is the new Biopython
    structure.
    """
    from Bio.PDB.Structure import Structure
    from Bio.PDB.Model import Model
    from Bio.PDB.Chain import Chain

    ent = Structure(name)  # Biopython structure object

    for item in selection:
        mid, cid, rid = item[1], item[2], item[3]

        # Create the model/chain containers lazily, on first use, so their
        # order follows the selection instead of set iteration order.
        if not ent.has_id(mid):
            ent.add(Model(mid))
        model = ent[mid]
        if not model.has_id(cid):
            model.add(Chain(cid))

        model[cid].add(obj[mid][cid][rid].copy())

    return cls(ent, name=name)
def retrieve_sphere_model(structure):  #, score):
    """Build a model in which each chain is a single pseudo-atom.

    For every chain of *structure* the point returned by
    ``calculate_centre_of_complex(chain)`` becomes the coordinates of one
    ``CA`` atom.  The atom sits in a residue named after the chain id and
    numbered by the chain's position (starting at 0).

    Returns:
        A new ``Structure`` with id ``'clustering_model'`` and one model ``0``.
    """
    sphere_struct = Structure('clustering_model')
    sphere_model = Model(0)
    sphere_struct.add(sphere_model)

    # NOTE: the numbering will be changed
    for index, chain in enumerate(structure.get_chains()):
        sphere_chain = Chain(chain.id)
        sphere_model.add(sphere_chain)

        centre = calculate_centre_of_complex(chain)
        pseudo_residue = Residue((' ', index, ' '), chain.id, ' ')
        pseudo_atom = Atom('CA', array(centre, 'f'), 0, 0, ' ', 'CA', 1)

        sphere_chain.add(pseudo_residue)
        pseudo_residue.add(pseudo_atom)

    return sphere_struct
def _rsa_calculation(self, model, chain_list, rsa_threshold):
    """Find residues whose relative accessibility drops on complex formation.

    NACCESS is run on the whole *model* and on each of the two chains named
    by ``chain_list[0]`` and ``chain_list[1]`` in isolation.  A standard
    residue (hetero flag ``' '``) of one of those two chains is reported when
    its ``all_atoms_rel`` value in the isolated chain exceeds the value in
    the complex by more than *rsa_threshold*.

    As a side effect ``self.interface.accessibility`` is set to
    ``[bsa, sas_A, sas_B, sas_tot]``.

    Returns:
        The list of selected residues, in the order they occur in *model*.
    """
    def _standard_residues(entity):
        # Residues with a blank hetero flag only.
        return [r for r in entity.get_residues() if r.id[0] == ' ']

    def _relative_asa(res):
        return float(res.xtra['EXP_NACCESS']['all_atoms_rel'])

    # Create temporary structures to feed NACCESS
    structure_A = Structure("chainA")
    structure_B = Structure("chainB")
    mA = Model(0)
    mB = Model(0)
    mA.add(model[chain_list[0]])
    mB.add(model[chain_list[1]])
    structure_A.add(mA)
    structure_B.add(mB)

    # Accessibility in the complex.  These values have to be collected before
    # the per-chain runs: the temporary models hold the same residue objects.
    NACCESS(model)
    res_list = _standard_residues(model)
    complex_rsa = [_relative_asa(res) for res in res_list]
    sas_tot = self._get_residue_SASA(model)

    # Accessibility of each chain on its own
    NACCESS(structure_A[0])
    NACCESS(structure_B[0])

    # Key the isolated values by residue object identity so every residue is
    # compared with its own value, whatever the chain order or the number of
    # chains present in the model.
    isolated_rsa = {}
    for res in _standard_residues(structure_A[0]) + _standard_residues(structure_B[0]):
        isolated_rsa[id(res)] = _relative_asa(res)

    pairs = []
    for res, bound_value in zip(res_list, complex_rsa):
        free_value = isolated_rsa.get(id(res))
        if free_value is not None and (free_value - bound_value) > rsa_threshold:
            pairs.append(res)

    sas_A = self._get_residue_SASA(structure_A)
    sas_B = self._get_residue_SASA(structure_B)

    # Calculate BSA
    bsa = sas_A + sas_B - sas_tot
    self.interface.accessibility = [bsa, sas_A, sas_B, sas_tot]
    return pairs
def save_chain_to(chain, filename: str):
    """Write *chain* to *filename* in PDB format.

    A chain-level entity (``level == "C"``) is wrapped in a model ``0`` before
    it is placed in the temporary structure, so that the writer sees a
    complete Structure/Model/Chain hierarchy.  Any other object is added to
    the structure directly, as before.

    Note that *chain* is re-parented into the temporary hierarchy.
    """
    from Bio.PDB.Model import Model
    from Bio.PDB.PDBIO import PDBIO

    structure = Structure(filename)
    if getattr(chain, "level", None) == "C":
        model = Model(0)
        model.add(chain)
        structure.add(model)
    else:
        # NOTE(review): unknown entity kind -- kept on the original code path.
        structure.add(chain)

    io = PDBIO()
    io.set_structure(structure)
    io.save(filename)
def initialize_res(residue: Union[Geo, str]) -> Structure:
    """Create a new structure that contains a single amino acid.

    *residue* is either a geometry object, used as is, or a single-letter
    amino acid code that is turned into a geometry with ``geometry()``.
    The residue ends up in chain A of model 0 of a structure with id "X".
    For the ``ACE`` cap the central carbon is named ``CH3`` instead of ``CA``.

    Raises:
        ValueError: if *residue* is neither a ``Geo`` nor a ``str``.
    """
    if isinstance(residue, Geo):
        geo = residue
    elif isinstance(residue, str):
        geo = geometry(residue)
    else:
        raise ValueError("Invalid residue argument:", residue)

    # CA at the origin, C along the x axis, N in the xy plane
    n_ca_length = geo.CA_N_length
    angle_rad = geo.N_CA_C_angle * (math.pi / 180.0)
    CA_coord = np.array([0.0, 0.0, 0.0])
    C_coord = np.array([geo.CA_C_length, 0, 0])
    N_coord = np.array([
        n_ca_length * math.cos(angle_rad),
        n_ca_length * math.sin(angle_rad),
        0,
    ])

    # Capped peptides start with an acetyl group whose carbon is called CH3
    if geo.residue_name == "ACE":
        ca_name, ca_fullname = "CH3", " CH3"
    else:
        ca_name, ca_fullname = "CA", " CA"

    N = Atom("N", N_coord, 0.0, 1.0, " ", " N", 0, "N")
    CA = Atom(ca_name, CA_coord, 0.0, 1.0, " ", ca_fullname, 0, "C")
    C = Atom("C", C_coord, 0.0, 1.0, " ", " C", 0, "C")

    # Carbonyl oxygen (to be moved later)
    carbonyl = calculateCoordinates(
        N, CA, C, geo.C_O_length, geo.CA_C_O_angle, geo.N_CA_C_O_diangle
    )
    O = Atom("O", carbonyl, 0.0, 1.0, " ", " O", 0, "O")

    res = make_res_of_type(1, N, CA, C, O, geo)

    # Wrap the residue bottom-up: chain -> model -> structure
    cha = Chain("A")
    cha.add(res)
    mod = Model(0)
    mod.add(cha)
    struc = Structure("X")
    struc.add(mod)
    return struc
def splitOnePDB(fname, chainIdL, chainIdR, outPath):
    """Write the ligand and receptor chains of a PDB file to separate files.

    Chains of the first model are checked against ``MIN_SEQ_LEN`` and
    ``MAX_SEQ_LEN``.  The length is the number of residues containing ``CA``
    minus the residues that are neither standard amino acids nor ``HOH``.
    Chains outside the range are only counted, not removed.  If fewer than
    two acceptable chains exist nothing is written.  Otherwise
    ``findNeigChains`` selects the ligand and receptor chains, which are saved
    in *outPath* as ``<prefix>-<chainIdL><chainIdR>_l_u.pdb`` and
    ``..._r_u.pdb``.

    Returns:
        ``0`` on any of the failure conditions above, ``None`` otherwise.
    """
    base_name = os.path.basename(fname)
    print(base_name)
    try:
        s = parser.get_structure(base_name, fname)
    except Exception:
        print("Error loading pdb")
        return 0

    rejected_ids = []
    try:
        for chain in s[0]:
            n_bad = sum(
                1 for res in chain.get_list()
                if not is_aa(res, standard=True) and res.resname != "HOH"
            )
            n_good = sum(1 for res in chain if "CA" in res) - n_bad
            if n_good < MIN_SEQ_LEN or n_good > MAX_SEQ_LEN:
                print(chain, n_good)
                rejected_ids.append(chain.get_id())
    except KeyError:
        print("Not good model")
        return 0

    if len(s[0].get_list()) - len(rejected_ids) < 2:
        print(s)
        print(s[0].get_list())
        print("Not enough good chains")
        return 0

    ligandChains, receptorChains = findNeigChains(s, chainIdL, chainIdR)
    print("ligand:", ligandChains, "receptor:", receptorChains)

    prefix = os.path.basename(fname).split(".")[0]

    def _dump(chains, struct_id, suffix):
        # Re-parent the chains into a fresh single-model structure and save it.
        writer = PDBIO()
        struct = Structure(struct_id)
        struct.add(Model(0))
        for chain in chains:
            chain.set_parent(struct[0])
            struct[0].add(chain)
        writer.set_structure(struct)
        writer.save(
            os.path.join(outPath, prefix + "-" + chainIdL + chainIdR + suffix))

    _dump(ligandChains, prefix + "ligand", "_l_u.pdb")
    _dump(receptorChains, prefix + "receptor", "_r_u.pdb")
def get_structure(self, name='RNA chain'):
    """Return this chain wrapped in a PDB.Structure object.

    The structure is called *name* and holds model 0 with a single chain
    whose id is ``self.chain_name``.  Every residue obtained by iterating
    over ``self`` is added to that chain.
    """
    struc = Structure(name)
    struc.add(Model(0))
    struc[0].add(Chain(self.chain_name))

    target_chain = struc[0][self.chain_name]
    for resi in self:
        target_chain.add(resi)
    return struc
def single_chain_structure(chain, name='superposition'):
    """Wrap *chain* in a new structure called *name* with a single model 0."""
    from Bio.PDB.Structure import Structure
    from Bio.PDB.Model import Model

    wrapper = Structure(name)
    wrapper.add(Model(0))
    wrapper[0].add(chain)
    return wrapper
def complex_save(given_complex, i, path):
    """Write all component chains of *given_complex* into one PDB file.

    A structure with id *i* and a single model 0 is created; for every
    component the chain ``pyrystruct.struct[0][pyrystruct.chain]`` is added
    to that model.  The structure is saved to *path*.

    Returns:
        *path*, unchanged.
    """
    merged = Structure(i)
    merged_model = Model(0)
    merged.add(merged_model)

    for component in given_complex.components:
        pyry = component.pyrystruct
        merged_model.add(pyry.struct[0][pyry.chain])

    writer = PDBIO()
    writer.set_structure(merged)
    writer.save(path)
    return path
def extract_model(pdb_struct, k):
    """
    Return a new structure that contains only a copy of model *k*.

    The copy gets id 0 and serial number 1; the new structure reuses the id
    of *pdb_struct*.  An AssertionError is raised unless
    ``k < len(pdb_struct)``.
    """
    assert k < len(pdb_struct), 'missing specified model'

    extracted = pdb_struct[k].copy()
    extracted.id = 0
    extracted.serial_num = 1

    wrapper = Structure(pdb_struct.id)
    wrapper.add(extracted)
    return wrapper
def multiply_model(pdb_struct, num_models):
    """
    Return a structure holding *num_models* copies of the only input model.

    Copy number ``i`` (counted from 0) gets id ``i`` and serial number
    ``i + 1``.  An AssertionError is raised unless *pdb_struct* contains
    exactly one model.
    """
    assert len(pdb_struct) == 1, 'single-model PDB file required'

    template = pdb_struct[0]
    multiplied = Structure(pdb_struct.id)
    for model_id in range(num_models):
        clone = template.copy()
        clone.detach_parent()
        clone.id = model_id
        clone.serial_num = model_id + 1
        multiplied.add(clone)
        clone.set_parent(multiplied)
    return multiplied
def initialize_res(residue):
    '''Create a new structure that contains a single amino acid.

    *residue* is either a geometry object, used as is, or anything else
    (e.g. a single-letter amino acid code), which is passed to ``Geo()``.
    The amino acid is placed into chain A of model 0 of a structure
    with id 'X'.'''
    geo = residue if isinstance(residue, Geo) else Geo(residue)

    # CA at the origin, C along the x axis, N in the xy plane
    n_ca_length = geo.CA_N_length
    angle_rad = geo.N_CA_C_angle*(math.pi/180.0)
    CA_coord = np.array([0., 0., 0.])
    C_coord = np.array([geo.CA_C_length, 0, 0])
    N_coord = np.array([n_ca_length*math.cos(angle_rad),
                        n_ca_length*math.sin(angle_rad),
                        0])

    N = Atom("N", N_coord, 0.0, 1.0, " ", " N", 0, "N")
    CA = Atom("CA", CA_coord, 0.0, 1.0, " ", " CA", 0, "C")
    C = Atom("C", C_coord, 0.0, 1.0, " ", " C", 0, "C")

    # Carbonyl oxygen (to be moved later)
    carbonyl = calculateCoordinates(N, CA, C, geo.C_O_length,
                                    geo.CA_C_O_angle, geo.N_CA_C_O_diangle)
    O = Atom("O", carbonyl, 0.0, 1.0, " ", " O", 0, "O")

    res = makeRes(1, N, CA, C, O, geo)

    # Wrap the residue bottom-up: chain -> model -> structure
    cha = Chain('A')
    cha.add(res)
    mod = Model(0)
    mod.add(cha)
    struc = Structure('X')
    struc.add(mod)
    return struc
def save_pdb(self, complex_id, temp = "", name = ""):
    """
    Collects the chains of all complex components and writes them into one
    PDB file (a single model holding one chain per component).

    Parameters:
    ------------
        complex_id  : number of complex from simulation; used as structure
                      id and as part of the file name
        temp        : value appended to the file name; rounded to one
                      decimal when it can be read as a float, otherwise
                      used as text
        name        : optional file name prefix; when empty the last path
                      element of outfolder.outdirname is used instead
    Returns:
    --------
        path of the pdb file written inside outfolder.outdirname
    """
    score = round(self.simulation_score, 4)

    s = Structure(complex_id)
    my_model = Model(0)
    s.add(my_model)
    for component in self.components:
        # TODO: what if there are more chains in one component?
        my_model.add(component.pyrystruct.struct[0][component.pyrystruct.chain])

    out = PDBIO()
    out.set_structure(s)

    outname = outfolder.outdirname.split("/")[-1]

    temp = str(temp)
    try:
        temp = round(float(temp), 1)
    except ValueError:
        # Not a number (e.g. the empty default): keep the text as it is.
        pass

    label = name if name else str(outname)
    fi_name = str(outfolder.outdirname)+'/'+label+'_'+str(score)+'_'+str(complex_id)+"_"+str(temp)+'.pdb'
    out.save(fi_name)

    # Release the chains from the temporary model they were attached to.
    for comp in self.components:
        comp.pyrystruct.struct[0][comp.pyrystruct.chain].detach_parent()
    return fi_name
def __make_structure_from_residues__(self, residues):
    """
    Makes a Structure object from a list of residues.

    Residues are collected into chains named 'c1', 'c2', ...; a new chain
    is started whenever the current one already holds a residue with the
    same id.  All chains go into model 'm' of structure 's'.
    """
    # KR: this probably can be outsourced to another module.
    def _open_chain(number):
        return Chain('c%i' % number)

    model = Model('m')
    chain_no = 1
    current = _open_chain(chain_no)
    for residue in residues:
        if current.has_id(residue.id):
            # Id clash: close the current chain and continue in a new one.
            model.add(current)
            chain_no += 1
            current = _open_chain(chain_no)
        current.add(residue)
    model.add(current)

    struct = Structure('s')
    struct.add(model)
    return struct
def select_structure(selector, structure):
    """Return a new structure holding only what *selector* accepts.

    Models, chains and residues are re-created level by level whenever the
    matching ``accept_model`` / ``accept_chain`` / ``accept_residue`` call
    returns a true value.  Accepted atoms are not copied: the original atom
    objects are added to the new residues.
    """
    selected = Structure(structure.id)
    for model in structure:
        if selector.accept_model(model):
            kept_model = Model(model.id, model.serial_num)
            selected.add(kept_model)
            for chain in model:
                if selector.accept_chain(chain):
                    kept_chain = Chain(chain.id)
                    kept_model.add(kept_chain)
                    for residue in chain:
                        if selector.accept_residue(residue):
                            kept_residue = Residue(residue.id, residue.resname,
                                                   residue.segid)
                            kept_chain.add(kept_residue)
                            for atom in residue:
                                if selector.accept_atom(atom):
                                    kept_residue.add(atom)
    return selected
def create_structure(coords, pdb_type, remove_masked):
    """Build a three-atoms-per-residue structure and save it as a PDB file.

    The sequence, mask and id are read from the module-level ``protein``.
    Residue ``i`` (0-based) is numbered ``i + 1`` and receives the atoms
    N, CA and CB taken from rows ``3*i``, ``3*i + 1`` and ``3*i + 2`` of
    *coords*.  The file is written to
    ``save_dir + protein.id_ + '_' + pdb_type + '.pdb'``.

    Args:
        coords: 3D coordinates of structure, indexed as ``coords[row, :]``
        pdb_type: predict or actual structure (used in the file name)
        remove_masked: a residue is kept when its mask value is 1 or when
            ``remove_masked == False``; otherwise it is skipped.

    Returns:
        structure
    """
    name = protein.id_
    structure = Structure(name)
    model = Model(0)
    chain = Chain('A')

    atom_names = ['N', 'CA', 'CB']
    for i, letter in enumerate(protein.primary):
        resname = AA_LETTERS[letter]
        if not (int(protein.mask[i]) == 1 or remove_masked == False):
            continue
        new_residue = Residue((' ', i + 1, ' '), resname, ' ')
        first_row = 3 * i
        for k, atom_name in enumerate(atom_names):
            new_residue.add(Atom(name=atom_name,
                                 coord=coords[first_row + k, :],
                                 bfactor=0,
                                 occupancy=1,
                                 altloc=' ',
                                 fullname=" {} ".format(atom_name),
                                 serial_number=0))
        chain.add(new_residue)

    model.add(chain)
    structure.add(model)

    writer = PDBIO()
    writer.set_structure(structure)
    writer.save(save_dir + name + '_' + pdb_type + '.pdb')
    return structure
def createPDBFile(self):
    """Create a test CIF file with one CA atom per icosahedron vertex.

    The vertices come from an ``Icosahedron`` with circumscribed radius 100
    and orientation '222r'.  Each vertex becomes an ALA residue holding a
    single CA atom; residues and atom serials are numbered from 1.  Any
    existing file at the target path is removed before writing.

    Returns the path of the written file ("/tmp/out.cif").
    """
    from Bio.PDB.Structure import Structure
    from Bio.PDB.Model import Model
    from Bio.PDB.Chain import Chain
    from Bio.PDB.Residue import Residue
    from Bio.PDB.Atom import Atom
    from Bio.PDB.mmcifio import MMCIFIO
    import os

    CIFFILENAME = "/tmp/out.cif"

    # atom positions with icosahedral symmetry (i222r)
    icosahedron = Icosahedron(circumscribed_radius=100, orientation='222r')
    vertices = icosahedron.getVertices()

    # Biopython hierarchy: structure -> model -> chain
    structure = Structure('result')  # structure_id
    model = Model(1, 1)  # model_id, serial_num
    structure.add(model)
    chain = Chain('A')  # chain id
    model.add(chain)

    for serial, vertex in enumerate(vertices, 1):
        # a blank first field marks an ATOM record rather than a hetero atom
        residue = Residue((' ', serial, ' '), "ALA", ' ')
        chain.add(residue)
        # Atom(name, coord, bfactor, occupancy, altloc, fullname,
        #      serial_number, element)
        residue.add(Atom('CA', vertex, 0., 1., " ", " CA ", serial, "C"))

    # delete file if exists
    if os.path.exists(CIFFILENAME):
        os.remove(CIFFILENAME)

    writer = MMCIFIO()
    writer.set_structure(structure)
    writer.save(CIFFILENAME)
    return CIFFILENAME
class StructWriter():
    """Collects models parsed from PDB-formatted strings and saves them together."""

    def __init__(self, structId="subset"):
        """Create an empty structure called *structId* and a quiet PDB parser."""
        self.structId = structId
        self.pdbParser = PDBParser(QUIET=True)
        self.structure = Structure(structId)

    def addModel(self, pdb_as_str, modelId):
        """Parse *pdb_as_str* and append its first model under id ``int(modelId)``.

        The model's serial number is set to the same integer.
        """
        # The parser wants a file-like object, so feed it an in-memory buffer.
        buffer = StringIO.StringIO()
        buffer.write(pdb_as_str)
        buffer.flush()
        buffer.seek(0, 0)

        parsed = self.pdbParser.get_structure(str(modelId), buffer)
        print(parsed.child_list)

        model = parsed[0]
        model.detach_parent()
        model.id = int(modelId)
        model.serial_num = model.id
        model.get_full_id()
        self.structure.add(model)
        print("Current struct", self.structure.child_list)

    def saveStruct(self, fname, desiredOrder):
        """Write the collected models to *fname*.

        When *desiredOrder* is not None the structure is rebuilt first so
        that it contains the models with the listed ids, in that order.  An
        id without a matching model raises IndexError.
        """
        writer = PDBIO(use_model_flag=True)
        if desiredOrder is not None:
            previous_models = self.structure.child_list
            self.structure = Structure(self.structId)
            for modelId in desiredOrder:
                matches = [model for model in previous_models
                           if model.id == modelId]
                child = matches[0]
                child.detach_parent()
                self.structure.add(child)
        writer.set_structure(self.structure)
        writer.save(fname)  # , preserve_atom_numbering=True)

    def __len__(self):
        """Return the number of models currently stored."""
        return len(self.structure.child_list)
class StructureBuilder:
    """Deals with constructing the Structure object.

    The StructureBuilder class is used by the PDBParser classes to
    translate a file to a Structure object.

    The builder keeps track of the "current" structure, model, chain,
    segid, residue and atom in the attributes ``structure``, ``model``,
    ``chain``, ``segid``, ``residue`` and ``atom``.  These are only created
    by the corresponding ``init_*`` methods, so a parser has to call them
    from the top of the hierarchy down (``init_seg`` included, since
    ``init_residue`` reads ``self.segid``).
    """

    def __init__(self):
        """Initialize the class with line counter 0 and an empty header."""
        self.line_counter = 0
        self.header = {}

    def _is_completely_disordered(self, residue):
        """Return 1 if all atoms in the residue have a non blank altloc (PRIVATE).

        Returns 0 as soon as one atom with a blank altloc is found.
        """
        atom_list = residue.get_unpacked_list()

        for atom in atom_list:
            altloc = atom.get_altloc()
            if altloc == " ":
                return 0
        return 1

    # Public methods called by the Parser classes

    def set_header(self, header):
        """Set header (attached to the structure by ``get_structure``)."""
        self.header = header

    def set_line_counter(self, line_counter):
        """Tracks line in the PDB file that is being parsed.

        The value is only used in warning messages.

        Arguments:
         - line_counter - int

        """
        self.line_counter = line_counter

    def init_structure(self, structure_id):
        """Initialize a new Structure object with given id.

        Arguments:
         - structure_id - string

        """
        self.structure = Structure(structure_id)

    def init_model(self, model_id, serial_num=None):
        """Create a new Model object with given id and add it to the structure.

        Arguments:
         - model_id - int
         - serial_num - int

        """
        self.model = Model(model_id, serial_num)
        self.structure.add(self.model)

    def init_chain(self, chain_id):
        """Create a new Chain object with given id.

        If the current model already has a chain with this id, that chain
        becomes the current chain again and a PDBConstructionWarning about
        a discontinuous chain is issued.

        Arguments:
         - chain_id - string

        """
        if self.model.has_id(chain_id):
            self.chain = self.model[chain_id]
            warnings.warn(
                "WARNING: Chain %s is discontinuous at line %i."
                % (chain_id, self.line_counter),
                PDBConstructionWarning,
            )
        else:
            self.chain = Chain(chain_id)
            self.model.add(self.chain)

    def init_seg(self, segid):
        """Flag a change in segid.

        The segid is passed to every Residue created afterwards.

        Arguments:
         - segid - string

        """
        self.segid = segid

    def init_residue(self, resname, field, resseq, icode):
        """Create a new Residue object.

        For non-hetero residues whose id already exists in the current
        chain, the duplicate is treated as a point mutation and handled via
        a DisorderedResidue (see the inline comments).

        Arguments:
         - resname - string, e.g. "ASN"
         - field - hetero flag, "W" for waters, "H" for hetero residues,
           otherwise blank.
         - resseq - int, sequence identifier
         - icode - string, insertion code

        Raises:
         - PDBConstructionException - if a duplicate residue with a different
           name contains atoms with blank altlocs; ``self.residue`` is set to
           None in that case.

        """
        if field != " ":
            if field == "H":
                # The hetero field consists of H_ + the residue name (e.g. H_FUC)
                field = "H_" + resname
        res_id = (field, resseq, icode)
        if field == " ":
            if self.chain.has_id(res_id):
                # There already is a residue with the id (field, resseq, icode).
                # This only makes sense in the case of a point mutation.
                warnings.warn(
                    "WARNING: Residue ('%s', %i, '%s') redefined at line %i."
                    % (field, resseq, icode, self.line_counter),
                    PDBConstructionWarning,
                )
                duplicate_residue = self.chain[res_id]
                if duplicate_residue.is_disordered() == 2:
                    # The residue in the chain is a DisorderedResidue object.
                    # So just add the last Residue object.
                    if duplicate_residue.disordered_has_id(resname):
                        # The residue was already made
                        self.residue = duplicate_residue
                        duplicate_residue.disordered_select(resname)
                    else:
                        # Make a new residue and add it to the already
                        # present DisorderedResidue
                        new_residue = Residue(res_id, resname, self.segid)
                        duplicate_residue.disordered_add(new_residue)
                        self.residue = duplicate_residue
                        return
                else:
                    if resname == duplicate_residue.resname:
                        # Same id and same name: keep using the existing residue.
                        warnings.warn(
                            "WARNING: Residue ('%s', %i, '%s','%s') already defined "
                            "with the same name at line %i."
                            % (field, resseq, icode, resname, self.line_counter),
                            PDBConstructionWarning,
                        )
                        self.residue = duplicate_residue
                        return
                    # Make a new DisorderedResidue object and put all
                    # the Residue objects with the id (field, resseq, icode) in it.
                    # These residues each should have non-blank altlocs for all their atoms.
                    # If not, the PDB file probably contains an error.
                    if not self._is_completely_disordered(duplicate_residue):
                        # if this exception is ignored, a residue will be missing
                        self.residue = None
                        raise PDBConstructionException(
                            "Blank altlocs in duplicate residue %s ('%s', %i, '%s')"
                            % (resname, field, resseq, icode)
                        )
                    # Replace the plain residue in the chain by a
                    # DisorderedResidue that holds both the old and the new one.
                    self.chain.detach_child(res_id)
                    new_residue = Residue(res_id, resname, self.segid)
                    disordered_residue = DisorderedResidue(res_id)
                    self.chain.add(disordered_residue)
                    disordered_residue.disordered_add(duplicate_residue)
                    disordered_residue.disordered_add(new_residue)
                    self.residue = disordered_residue
                    return
        self.residue = Residue(res_id, resname, self.segid)
        self.chain.add(self.residue)

    def init_atom(
        self,
        name,
        coord,
        b_factor,
        occupancy,
        altloc,
        fullname,
        serial_number=None,
        element=None,
        pqr_charge=None,
        radius=None,
        is_pqr=False,
    ):
        """Create a new Atom object and add it to the current residue.

        Nothing is done when the current residue is None.  Atoms with a
        non-blank altloc are stored inside a DisorderedAtom and the residue
        is flagged as disordered.

        Arguments:
         - name - string, atom name, e.g. CA, spaces should be stripped
         - coord - Numeric array (Float0, size 3), atomic coordinates
         - b_factor - float, B factor (not passed to the Atom when is_pqr is true)
         - occupancy - float (not passed to the Atom when is_pqr is true)
         - altloc - string, alternative location specifier
         - fullname - string, atom name including spaces, e.g. " CA "
         - serial_number - atom serial number, passed through to the Atom
         - element - string, upper case, e.g. "HG" for mercury
         - pqr_charge - float, atom charge (PQR format)
         - radius - float, atom radius (PQR format)
         - is_pqr - boolean, flag to specify if a .pqr file is being parsed

        """
        residue = self.residue
        # if residue is None, an exception was generated during
        # the construction of the residue
        if residue is None:
            return
        # First check if this atom is already present in the residue.
        # If it is, it might be due to the fact that the two atoms have atom
        # names that differ only in spaces (e.g. "CA.." and ".CA.",
        # where the dots are spaces). If that is so, use all spaces
        # in the atom name of the current atom.
        if residue.has_id(name):
            duplicate_atom = residue[name]
            # atom name with spaces of duplicate atom
            duplicate_fullname = duplicate_atom.get_fullname()
            if duplicate_fullname != fullname:
                # name of current atom now includes spaces
                name = fullname
                warnings.warn(
                    "Atom names %r and %r differ only in spaces at line %i."
                    % (duplicate_fullname, fullname, self.line_counter),
                    PDBConstructionWarning,
                )
        if not is_pqr:
            self.atom = Atom(
                name,
                coord,
                b_factor,
                occupancy,
                altloc,
                fullname,
                serial_number,
                element,
            )
        elif is_pqr:
            # PQR input: B factor and occupancy are passed as None, charge
            # and radius are passed instead.
            self.atom = Atom(
                name,
                coord,
                None,
                None,
                altloc,
                fullname,
                serial_number,
                element,
                pqr_charge,
                radius,
            )
        if altloc != " ":
            # The atom is disordered
            if residue.has_id(name):
                # Residue already contains this atom
                duplicate_atom = residue[name]
                if duplicate_atom.is_disordered() == 2:
                    duplicate_atom.disordered_add(self.atom)
                else:
                    # This is an error in the PDB file:
                    # a disordered atom is found with a blank altloc
                    # Detach the duplicate atom, and put it in a
                    # DisorderedAtom object together with the current
                    # atom.
                    residue.detach_child(name)
                    disordered_atom = DisorderedAtom(name)
                    residue.add(disordered_atom)
                    disordered_atom.disordered_add(self.atom)
                    disordered_atom.disordered_add(duplicate_atom)
                    residue.flag_disordered()
                    warnings.warn(
                        "WARNING: disordered atom found with blank altloc before "
                        "line %i.\n" % self.line_counter,
                        PDBConstructionWarning,
                    )
            else:
                # The residue does not contain this disordered atom
                # so we create a new one.
                disordered_atom = DisorderedAtom(name)
                residue.add(disordered_atom)
                # Add the real atom to the disordered atom, and the
                # disordered atom to the residue
                disordered_atom.disordered_add(self.atom)
                residue.flag_disordered()
        else:
            # The atom is not disordered
            residue.add(self.atom)

    def set_anisou(self, anisou_array):
        """Set anisotropic B factor of current Atom."""
        self.atom.set_anisou(anisou_array)

    def set_siguij(self, siguij_array):
        """Set standard deviation of anisotropic B factor of current Atom."""
        self.atom.set_siguij(siguij_array)

    def set_sigatm(self, sigatm_array):
        """Set standard deviation of atom position of current Atom."""
        self.atom.set_sigatm(sigatm_array)

    def get_structure(self):
        """Return the structure, with the header dict attached to it."""
        # first sort everything
        # self.structure.sort()
        # Add the header dict
        self.structure.header = self.header
        return self.structure

    def set_symmetry(self, spacegroup, cell):
        """Set symmetry (currently a no-op: both arguments are ignored)."""
        pass
class StructureBuilder(object):
    """
    Deals with constructing the Structure object.

    The StructureBuilder class is used by the PDBParser classes to
    translate a file to a Structure object. The parser drives it through
    the ``init_*`` methods (structure, model, chain, segment, residue,
    atom, in that order); each call updates the "current" entity that the
    following calls attach to.
    """

    def __init__(self):
        self.line_counter = 0
        self.header = {}

    def _is_completely_disordered(self, residue):
        """Return True if all atoms in the residue have a non blank altloc."""
        return all(
            atom.get_altloc() != " " for atom in residue.get_unpacked_list()
        )

    # Public methods called by the Parser classes

    def set_header(self, header):
        """Store the header that get_structure() attaches to the structure."""
        self.header = header

    def set_line_counter(self, line_counter):
        """
        The line counter keeps track of the line in the PDB file that
        is being parsed.

        Arguments:
        o line_counter - int
        """
        self.line_counter = line_counter

    def init_structure(self, structure_id):
        """Initiate a new Structure object with given id.

        Arguments:
        o id - string
        """
        self.structure = Structure(structure_id)

    def init_model(self, model_id, serial_num=None):
        """Initiate a new Model object with given id.

        Arguments:
        o id - int
        o serial_num - int
        """
        self.model = Model(model_id, serial_num)
        self.structure.add(self.model)

    def init_chain(self, chain_id):
        """Initiate a new Chain object with given id.

        If the current model already has a chain with this id, that chain
        becomes the current chain again and a PDBConstructionWarning is
        issued.

        Arguments:
        o chain_id - string
        """
        if self.model.has_id(chain_id):
            self.chain = self.model[chain_id]
            warnings.warn("WARNING: Chain %s is discontinuous at line %i."
                          % (chain_id, self.line_counter),
                          PDBConstructionWarning)
        else:
            self.chain = Chain(chain_id)
            self.model.add(self.chain)

    def init_seg(self, segid):
        """Flag a change in segid.

        Arguments:
        o segid - string
        """
        self.segid = segid

    def init_residue(self, resname, field, resseq, icode):
        """
        Initiate a new Residue object.

        Arguments:
        o resname - string, e.g. "ASN"
        o field - hetero flag, "W" for waters, "H" for
            hetero residues, otherwise blank.
        o resseq - int, sequence identifier
        o icode - string, insertion code

        Raises PDBConstructionException when a non-hetero residue id is
        redefined and the residue already stored under that id has atoms
        with a blank altloc; self.residue is set to None in that case.
        """
        if field == "H":
            # The hetero field consists of H_ + the residue name (e.g. H_FUC)
            field = "H_" + resname
        res_id = (field, resseq, icode)
        if field == " " and self.chain.has_id(res_id):
            # There already is a residue with the id (field, resseq, icode).
            # This only makes sense in the case of a point mutation.
            warnings.warn("WARNING: Residue ('%s', %i, '%s') "
                          "redefined at line %i."
                          % (field, resseq, icode, self.line_counter),
                          PDBConstructionWarning)
            duplicate_residue = self.chain[res_id]
            if duplicate_residue.is_disordered() == 2:
                # The residue in the chain is a DisorderedResidue object.
                # So just add the last Residue object.
                if duplicate_residue.disordered_has_id(resname):
                    # The residue was already made
                    duplicate_residue.disordered_select(resname)
                else:
                    # Make a new residue and add it to the already
                    # present DisorderedResidue
                    new_residue = Residue(res_id, resname, self.segid)
                    duplicate_residue.disordered_add(new_residue)
                self.residue = duplicate_residue
                # Return for BOTH branches: falling through would build
                # another Residue and add it to the chain under an id the
                # chain already contains.
                return
            # Make a new DisorderedResidue object and put all
            # the Residue objects with the id (field, resseq, icode) in it.
            # These residues each should have non-blank altlocs for all
            # their atoms.
            # If not, the PDB file probably contains an error.
            if not self._is_completely_disordered(duplicate_residue):
                # if this exception is ignored, a residue will be missing
                self.residue = None
                raise PDBConstructionException(
                    "Blank altlocs in duplicate residue %s ('%s', %i, '%s')"
                    % (resname, field, resseq, icode))
            self.chain.detach_child(res_id)
            new_residue = Residue(res_id, resname, self.segid)
            disordered_residue = DisorderedResidue(res_id)
            self.chain.add(disordered_residue)
            disordered_residue.disordered_add(duplicate_residue)
            disordered_residue.disordered_add(new_residue)
            self.residue = disordered_residue
            return
        residue = Residue(res_id, resname, self.segid)
        self.chain.add(residue)
        self.residue = residue

    def init_atom(self, name, coord, b_factor, occupancy, altloc, fullname,
                  serial_number=None, element=None):
        """
        Initiate a new Atom object.

        Arguments:
        o name - string, atom name, e.g. CA, spaces should be stripped
        o coord - Numeric array (Float0, size 3), atomic coordinates
        o b_factor - float, B factor
        o occupancy - float
        o altloc - string, alternative location specifier
        o fullname - string, atom name including spaces, e.g. " CA "
        o serial_number - passed through unchanged to Atom
        o element - string, upper case, e.g. "HG" for mercury
        """
        residue = self.residue
        # if residue is None, an exception was generated during
        # the construction of the residue
        if residue is None:
            return
        # First check if this atom is already present in the residue.
        # If it is, it might be due to the fact that the two atoms have atom
        # names that differ only in spaces (e.g. "CA.." and ".CA.",
        # where the dots are spaces). If that is so, use all spaces
        # in the atom name of the current atom.
        if residue.has_id(name):
            duplicate_atom = residue[name]
            # atom name with spaces of duplicate atom
            duplicate_fullname = duplicate_atom.get_fullname()
            if duplicate_fullname != fullname:
                # name of current atom now includes spaces
                name = fullname
                warnings.warn("Atom names %r and %r differ "
                              "only in spaces at line %i."
                              % (duplicate_fullname, fullname,
                                 self.line_counter),
                              PDBConstructionWarning)
        atom = self.atom = Atom(name, coord, b_factor, occupancy, altloc,
                                fullname, serial_number, element)
        if altloc == " ":
            # The atom is not disordered
            residue.add(atom)
            return
        # The atom is disordered
        if not residue.has_id(name):
            # The residue does not contain this disordered atom
            # so we create a new one.
            disordered_atom = DisorderedAtom(name)
            residue.add(disordered_atom)
            # Add the real atom to the disordered atom, and the
            # disordered atom to the residue
            disordered_atom.disordered_add(atom)
            residue.flag_disordered()
            return
        # Residue already contains this atom
        duplicate_atom = residue[name]
        if duplicate_atom.is_disordered() == 2:
            duplicate_atom.disordered_add(atom)
        else:
            # This is an error in the PDB file:
            # a disordered atom is found with a blank altloc
            # Detach the duplicate atom, and put it in a
            # DisorderedAtom object together with the current
            # atom.
            residue.detach_child(name)
            disordered_atom = DisorderedAtom(name)
            residue.add(disordered_atom)
            disordered_atom.disordered_add(atom)
            disordered_atom.disordered_add(duplicate_atom)
            residue.flag_disordered()
            warnings.warn("WARNING: disordered atom found "
                          "with blank altloc before line %i.\n"
                          % self.line_counter,
                          PDBConstructionWarning)

    def set_anisou(self, anisou_array):
        """Set anisotropic B factor of current Atom."""
        self.atom.set_anisou(anisou_array)

    def set_siguij(self, siguij_array):
        """Set standard deviation of anisotropic B factor of current Atom."""
        self.atom.set_siguij(siguij_array)

    def set_sigatm(self, sigatm_array):
        """Set standard deviation of atom position of current Atom."""
        self.atom.set_sigatm(sigatm_array)

    def get_structure(self):
        """Return the structure, after attaching the stored header to it."""
        # Add the header dict
        self.structure.header = self.header
        return self.structure

    def set_symmetry(self, spacegroup, cell):
        """Accept symmetry information; this builder ignores it (no-op)."""
        pass
class PdbSite: """M-CSA PDB catalytic site. Contains lists of PdbResidues and mapped UniProt catalytic sites (UniSite objects), a 3D structure (Biopython Structure) built from individual PdbResidue structures (Biopython Residue), a parent structure (Biopython Structure), all and close to the site hetero components (possible ligands according to their chemical similarity to the cognate ligand and their centrality in the active site) as Het objects (containing a Biopython Residue structure), as well as a dictionary of annotations extracted from the parent mmCIF assembly structure file and SIFTS""" def __init__(self): self.parent_entry = None self.residues = [] self.residues_dict = {} self.mapped_unisites = [] self.reference_site = None self.parent_structure = None self.structure = None self.ligands = [] self.mmcif_dict = dict() self.is_sane = None def __str__(self): """Show as pseudo-sequence in one-letter code""" return self.sequence def __len__(self): """Return size of site (residue count)""" return self.size def __iter__(self): """Iterate over residues""" yield from self.residues def __eq__(self, other): """Check if sites contain the same residues (same IDs)""" if len(self) == len(other): for res in other: if res.full_id not in self.residues_dict: return False return True return False def __contains__(self, residue): """Check if residue is there""" return residue.full_id in self.residues_dict def __getitem__(self, full_id): """Return the residue with given ID.""" return self.residues_dict[full_id] # Alternative constructors @classmethod def from_list(cls, reslist, cif_path, parent_entry, annotate=True): """Construct PdbSite object directly from residue list""" mmcif_dict = dict() # First reduce redundant residues with multiple function locations reslist = PdbSite._cleanup_list(reslist) site = cls() site.parent_entry = parent_entry try: if annotate: parser = MMCIFParser(QUIET=True) structure = parser.get_structure('', cif_path) mmcif_dict = parser._mmcif_dict else: 
parser = FastMMCIFParser(QUIET=True) structure = parser.get_structure('', cif_path) except (TypeError, PDBConstructionException): warnings.warn( 'Could not build site from residue list. Check entry', RuntimeWarning) return for res in reslist: if structure: res.add_structure(structure) site.add(res) if annotate: site.parent_structure = structure site.mmcif_dict = mmcif_dict site.find_ligands() return site @classmethod def build_reference(cls, reslist, parent_entry, cif_path, annotate=True): """Builds reference active site from a list of PDB catalytic residues. Assumes that the list only contains one active site, so use it only if it is a list of manually annotated catalytic residues""" ref = PdbSite.from_list(reslist, cif_path, parent_entry, annotate) ref.reference_site = ref ref.is_sane = True return ref @classmethod def build(cls, seed, reslist, reference_site, parent_entry): """Builds active site from a list of catalytic residues that may form multiple active sites (e.g. all residues annotated as catalytic in a PDB structure). Using a residue as seed, it starts building an active site by checking the euclidean distances of all residues that have the same resid and name. In the end, it maps the site to the reference defined in the args""" site = cls() if seed.structure is None: return for res in reslist: candidate = PdbSite._get_nearest_equivalent( res, seed, reslist, site) if candidate is None: continue if candidate not in site: site.add(candidate) site.reference_site = reference_site site.parent_entry = parent_entry site._map_reference_residues() return site @classmethod def build_all(cls, reslist, reference_site, parent_entry, cif_path, annotate=True, redundancy_cutoff=None): """Builds all sites in using as input a list of catalytic residues. 
Returns a list of PdbSite objects""" # Map structure objects in every residue sites = [] mmcif_dict = dict() try: if annotate: parser = MMCIFParser(QUIET=True) structure = parser.get_structure('', cif_path) mmcif_dict = parser._mmcif_dict else: parser = FastMMCIFParser(QUIET=True) structure = parser.get_structure('', cif_path) except (TypeError, PDBConstructionException): warnings.warn('Could not parse structure {}'.format( cif_path, RuntimeWarning)) return sites # First reduce redundant residues with multiple function locations reslist = PdbSite._cleanup_list(reslist) # We want all equivalent residues from identical assembly chains reslist = PdbSite._get_assembly_residues(reslist, structure) # Get seeds to build active sites seeds = PdbSite._get_seeds(reslist) # Build a site from each seed for seed in seeds: sites.append(cls.build(seed, reslist, reference_site, parent_entry)) # Reduce redundancy sites = PdbSite._remove_redundant_sites(sites, cutoff=redundancy_cutoff) # Add ligands and annotations if annotate and structure: for site in sites: site.parent_structure = structure site.mmcif_dict = mmcif_dict site.find_ligands() # Flag unclustered sites PdbSite._mark_unclustered(sites) return sites # Properties @property def mcsa_id(self): """Get M-CSA ID of catalytic residues.""" for res in self.residues: if res.mcsa_id: return res.mcsa_id return @property def pdb_id(self): """Get PDB ID of catalytic residues. 
Not a unique site ID""" for res in self.residues: if res.pdb_id: return res.pdb_id return @property def uniprot_id(self): """Get UniProt ID of the chain of the first residue""" for res in self.residues: if res.chain: try: return PDB2UNI[(self.pdb_id, res.chain[0])] except KeyError: continue return @property def ec(self): """Get EC number from SIFTS""" for res in self.residues: if res.chain: try: return PDB2EC[(self.pdb_id, res.chain[0])] except KeyError: try: return PDB2EC[(self.pdb_id, res.alt_chain[0])] except KeyError: continue return @property def sequence(self): """Show as pseudo-sequence in one-letter code""" return ''.join([ AA_3TO1[res.resname] if (res.is_standard or res.is_gap) else 'X' for res in self ]) @property def title(self): """Return title of PDB entry""" try: return self.mmcif_dict['_struct.title'][0] except KeyError: return @property def enzyme(self): """Return enzyme name""" try: return self.mmcif_dict['_struct.pdbx_descriptor'][0] except KeyError: return @property def assembly_id(self): """Return PDB assembly ID""" try: return int(self.mmcif_dict['_entity_poly.assembly_id'][0][-1]) except (TypeError, KeyError): return @property def experimental_method(self): """Return structure determination method""" try: return self.mmcif_dict['_exptl.method'][0] except KeyError: return @property def resolution(self): """Return resolution in Angstrom""" try: if 'nmr' in self.experimental_method.lower(): return elif 'microscopy' in self.experimental_method.lower(): return float( self.mmcif_dict['_em_3d_reconstruction.resolution'][0]) else: return float(self.mmcif_dict['_refine.ls_d_res_high'][0]) except (TypeError, KeyError, AttributeError): return @property def organism_name(self): """Return name of organism of origin""" try: return self.mmcif_dict['_entity_src_nat.pdbx_organism_scientific'][ 0] except KeyError: try: return self.mmcif_dict[ '_entity_src_gen.pdbx_gene_src_scientific_name'][0] except KeyError: return @property def organism_id(self): """Return 
id of organism of origin""" try: return self.mmcif_dict['_entity_src_nat.pdbx_ncbi_taxonomy_id'][0] except KeyError: try: return self.mmcif_dict[ '_entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id'][0] except KeyError: return @property def id(self): """Unique ID of the active site. Consists of PDB ID and a string of chain IDs of all residues""" return '{}_{}'.format(self.pdb_id, '-'.join(res.chain for res in self.residues)) @property def size(self): """Get site size in residue count""" return len(self.residues) @property def acts_on_polymer(self): """Check if it belongs to a family of enzymes whose substrate is a polymer (protein or nucleic)""" return self.parent_entry.info['reaction']['is_polymeric'] @property def is_reference(self): """Check if site is reference site""" if self.size > 0: return self.residues[0].is_reference return False @property def is_conserved(self): """Check if all residues are conserved by comparing to the reference""" if self.is_reference: return True return str(self) == str(self.reference_site) @property def is_conservative_mutation(self, ignore_funcloc_main=True): """Checks if the mutations in the site are conservative. Option to ignore residues that function via main chain""" result = False for res in self.residues: if ignore_funcloc_main: if res.has_main_chain_function or res.has_double_funcloc: result = True continue if not res.is_conserved and not res.is_conservative_mutation: return False if res.is_conservative_mutation: result = True return result @property def has_missing_functional_atoms(self): """Checks if there are missing functional atoms from the residue structures or site is empty""" try: gaps = set(self.get_gaps()) for i, res in enumerate(self): if i in gaps: continue func_atoms, _ = res.get_func_atoms() if len(func_atoms) != 3: return True return False except (TypeError, ValueError): return True # Methods def copy(self, include_structure=True): """Returns a copy of the site. 
If include_structure is False, then the structure is not copied""" site = copy(self) if include_structure: site.structure = self.structure.copy() return site def add(self, residue): """Add PdbResidue object to site (in the residues list and dict)""" residue = residue.copy(include_structure=True) if type(residue) == PdbResidue: self.residues.append(residue) self.residues_dict[residue.full_id] = residue residue.parent_site = self if type(residue) == Het: self.ligands.append(residue) residue.parent_site = self if residue.is_polymer: if residue.chain in self.structure[0]: for r in residue.structure: self.structure[0][residue.chain].add(r) return True self.structure[0].add(residue.structure) return True if residue.structure: # Initialize structure if empty if self.structure is None: self.structure = Structure(self.id) self.structure.add(Model(0)) chain_id = residue.structure.get_parent().get_id() if chain_id not in self.structure[0]: self.structure[0].add(Chain(chain_id)) # Add residue structure to site structure if residue.structure.get_id() not in self.structure[0][chain_id]: self.structure[0][chain_id].add(residue.structure) return True def get_distances(self, kind='com'): """Calculates all intra-site residue distances and returns a numpy array""" dists = [] seen = set() for p in self.residues: for q in self.residues: if p == q or (q.full_id, p.full_id) in seen: continue if p.is_gap or q.is_gap: dists.append(np.nan) else: dists.append(p.get_distance(q, kind)) seen.add((p.full_id, q.full_id)) return np.array(dists) def get_residues(self): """To iterate over catalytic residues""" yield from self.residues def get_gaps(self): """Returns an index of the gap positions (non-aligned residues)""" gaps = [] for i, res in enumerate(self.residues): if res.is_gap: gaps.append(i) return gaps def contains_equivalent(self, res): """Checks if the site contains a catalytic residue of the basic info (name, resid, auth_resid), and either the same chiral_id or chain""" for sres in self: 
if sres.is_equivalent(res, by_chiral_id=True) or \ sres.is_equivalent(res, by_chiral_id=False, by_chain=True): return True return False def has_identical_residues(self, other): """Checks if two sites have the same residues, although their order might be different. Used to cleanup redundant symmetrical active sites like HIV-protease""" for res in other: if not self.contains_equivalent(res): return False return True def get_chiral_residues(self): """Gets chiral residues from the site if there are any (residues that have the same resname, resid, auth_resid but different chains)""" identicals = set() for i, p in enumerate(self): for j, q in enumerate(self): if p == q or q.is_gap or q.is_gap: continue if p.is_equivalent(q, by_chiral_id=False, by_chain=False): if (j, i) not in identicals: identicals.add((i, j)) return identicals def find_ligands(self, radius=3): """ Searches the parent structure for hetero components close to the catalytic residues, by searching around the atoms of catalytic residues and the dummy atoms between distant residues. Populates the ligands list with Het objects. 
Args: radius: the search space (in Å) around the atoms of the catalytic residues """ if type(self.parent_structure) != Structure: return # Get centers of search centers = self._get_ligand_search_centers(radius) # Initialize KD tree query_atoms = Bio.PDB.Selection.unfold_entities( self.parent_structure, 'A') ns = NeighborSearch(query_atoms) # Search for ligands around each center polymers = defaultdict(list) site_chains = set([res.chain for res in self]) seen = set() added = set() # Search for components close to catalytic residues for center in centers: hits = ns.search(center, radius, level='R') for res in hits: if res.get_full_id() in seen: continue seen.add(res.get_full_id()) restype = res.get_id()[0][0] chain = res.get_parent().get_id() # Ignore waters if restype == 'W': continue # HET components if restype == 'H': self.add( Het(self.mcsa_id, self.pdb_id, res.get_resname(), res.get_id()[1], chain, structure=res, parent_site=self)) added.add(res.get_full_id()) # Protein/nucleic polymer components if restype == ' ' and chain not in site_chains: polymers[chain].append(res) # Build polymers if self.acts_on_polymer: for chain, reslist in polymers.items(): self.add( Het.polymer(reslist, self.mcsa_id, self.pdb_id, chain, self)) # Find distal co-factor-like or substrate-like molecules hits = ns.search(self.structure.center_of_mass(geometric=True), 30, level='R') for res in hits: restype = res.get_id()[0][0] if restype == 'H' and res.get_full_id() not in added: ligand = Het(self.mcsa_id, self.pdb_id, res.get_resname(), res.get_id()[1], res.get_parent().get_id(), structure=res, parent_site=self) if ligand.type in ('Substrate (non-polymer)', 'Co-factor (non-ion)'): ligand.is_distal = True self.add(ligand) return def write_pdb(self, outdir=None, outfile=None, write_hets=False, func_atoms_only=False, include_dummy_atoms=False): """ Writes site coordinates in PDB format Args: write_hets: Include coordinates of ligands. 
outdir: Directory to save the .pdb file outfile: If unspecified, name is formatted to include info on M-CSA ID, chain of each catalytic residue, annotation if the site is a reference site and an annotation about the conservation, relatively to the reference (c: conserved, m: mutated, cm: has only conservative mutation) """ if not outdir: outdir = '.' if not outfile: conservation = 'm' if self.is_conservative_mutation: conservation = 'cm' if self.is_conserved: conservation = 'c' if func_atoms_only: atms = 'func' else: atms = 'all' if self.is_sane: sanity = 'sane' else: sanity = 'insane' outfile = '{}/mcsa_{}.{}.{}.{}.{}.{}.pdb'.format( outdir.rstrip('/'), str(self.mcsa_id).zfill(4), self.id, 'reference' if self.is_reference else 'cat_site', conservation, atms, sanity) with open(outfile, 'w') as o: if bool(self.mmcif_dict): ligands = ','.join( '{0.resname};{0.resid};{0.chain};{0.similarity};{0.centrality};{0.type}' .format(h) for h in self.ligands) remarks = ( 'REMARK CATALYTIC SITE\n' 'REMARK ID {0.id}\n' 'REMARK PDB_ID {0.pdb_id}\n' 'REMARK ASSEMBLY_ID {0.assembly_id}\n' 'REMARK UNIPROT_ID {0.uniprot_id}\n' 'REMARK EC {0.ec}\n' 'REMARK TITLE {0.title}\n' 'REMARK ENZYME {0.enzyme}\n' 'REMARK EXPERIMENTAL_METHOD {0.experimental_method}\n' 'REMARK RESOLUTION {0.resolution}\n' 'REMARK ORGANISM_NAME {0.organism_name}\n' 'REMARK ORGANISM_ID {0.organism_id}\n' 'REMARK NEARBY_LIGANDS {1}'.format(self, ligands)) print(remarks, file=o) residues = self.residues.copy() if write_hets: residues += self.ligands for res in residues: if not include_dummy_atoms and res.is_gap: continue structure = res.structure if res.dummy_structure: structure = res.dummy_structure if structure is not None: for atom in structure.get_atoms(): resname = res.resname.upper() if res.has_main_chain_function or not res.is_standard: resname = 'ANY' funcstring = '{}.{}'.format(resname, atom.get_id().upper()) if func_atoms_only and type( res ) == PdbResidue and funcstring not in RESIDUE_DEFINITIONS: continue 
pdb_line = '{:6}{:5d} {:<4}{}{:>3}{:>2}{:>4}{:>12.3f}' \ '{:>8.3f}{:>8.3f} {:6}'.format( 'HETATM' if (atom.get_parent().get_id()[0] != ' ' or type(res) == Het) else 'ATOM', int(atom.get_serial_number()) if atom.get_serial_number() else 0, atom.name if len(atom.name) == 4 else ' {}'.format(atom.name), 'Z' if funcstring in RESIDUE_DEFINITIONS else atom.get_altloc(), atom.get_parent().get_resname(), atom.get_parent().get_parent().get_id(), atom.get_parent().get_id()[1], atom.get_coord()[0], atom.get_coord()[1], atom.get_coord()[2], atom.get_occupancy() if atom.get_occupancy() else '') print(pdb_line, file=o) print('END', file=o) def fit(self, other, weighted=False, cycles=1, cutoff=999, scaling_factor=None, transform=False, mutate=True, reorder=True, allow_symmetrics=True, exclude=None, get_array=False): """Iteratively fits two catalytic sites (self: fixed site, other: mobile site) using the Kabsch algorithm from the rmsd module (https://github.com/charnley/rmsd). Can also find the optimal atom alignment in each residue, considering symmetrical atoms and functionally similar residues, using the Hungarian algorithm. Args: other: mobile active site to fit weighted: to perform weighted superposition in the last iteration cycles: Number of fitting iterations to exclude outlying atoms transform: Also transforms the mobile site's coordinates mutate: If the two active sites do not have the same residues, make pseudo-mutations to the mobile site to facilitate atom correspondence reorder: Find the optimal atom correspondence (within a residue) between the two sites, taking into account conservative mutations and symmetrical atoms (optional). See and definitions in residue_definitions.py module. 
allow_symmetrics: Allows flipping of side chains if atoms are equivalent or symmetrical Returns: rot, tran, rms, rms_all rot: Rotation matrix to transform mobile site into the fixed site tran: Translation vector to transform mobile site into the fixed site rms: RMSD after fitting, excluding outliers rms_all: RMSD over all atoms, including outliers Raises: Exception: If number of functions atoms in the two sites is not the same (e.g. if there are missing atoms from the parent structure) """ # In case gaps are present, exclude those positions gaps = set(self.get_gaps() + other.get_gaps()) # If we want to exclude residues from fitting if exclude is not None: if type(exclude) not in (list, tuple, set): exclude = [exclude] for i in exclude: gaps.add(i) # Get atom identifier strings and coords as numpy arrays p_atoms, p_coords = self._get_func_atoms(allow_symmetrics, omit=gaps) q_atoms, q_coords = other._get_func_atoms(allow_symmetrics, omit=gaps) if p_atoms is None or q_atoms is None: return None, None, None, None if len(p_atoms) != len(q_atoms): raise Exception('Atom number mismatch in sites {} and {}'.format( self.id, other.id)) # Initial crude superposition rot, tran, rms, _ = PdbSite._super(p_coords, q_coords, cycles=1) q_trans = PdbSite._transform(q_coords, rot, tran) # In case of non-conservative mutations, make pseudo-mutations to facilitate superposition if mutate: for i, (p_atom, q_atom) in enumerate(zip(p_atoms, q_atoms)): if p_atom != q_atom: #q_atoms[i] = p_atom q_atoms[i] = '{}.MUT'.format(q_atoms[i].split('.')[0]) p_atoms[i] = '{}.MUT'.format(p_atoms[i].split('.')[0]) # Reorder atoms using the Hungarian algorithm from rmsd package if reorder: q_review = reorder_hungarian(p_atoms, q_atoms, p_coords, q_trans) q_coords = q_coords[q_review] # Iterative superposition. 
Get rotation matrix, translation vector and RMSD rot, tran, rms, rms_all = PdbSite._super(p_coords, q_coords, cycles, cutoff, weighted, scaling_factor) if transform: other.structure.transform(rot, tran) if get_array: q_trans = np.dot(q_coords, rot) + tran return rot, tran, rms, rms_all, p_coords, q_trans return rot, tran, rms, rms_all def per_residue_rms(self, other, rot=None, tran=None, transform=False): """Calculates the RMSD of each residue in two superimposed sites. If superposition rotation matrix and translation vector are not given, RMSD is calculated without transformation. Otherwise, fitting is performed automatically, using weighted superposition to compensate for bias caused by slightly outlying residues.""" rmsds = [] if rot is None or tran is None: rot, tran, _, _ = self.fit(other, weighted=True, transform=False) for i, (p, q) in enumerate(zip(self, other)): if p.is_gap or q.is_gap: rmsds.append(np.nan) continue # Get functional atoms p_atoms, p_coords = p.get_func_atoms() q_atoms, q_coords = q.get_func_atoms() # Mutate if there are mismatches for i, (p_atom, q_atom) in enumerate(zip(p_atoms, q_atoms)): if p_atom != q_atom: p_atoms[i] = 'MUT' q_atoms[i] = 'MUT' # Transform functional atoms if transform: q_coords = PdbSite._transform(q_coords, rot, tran) # Reorder q_review = reorder_hungarian(p_atoms, q_atoms, p_coords, q_coords) q_coords = q_coords[q_review] # Calculate RMSD rms = PdbSite._rmsd(p_coords, q_coords) rmsds.append(np.round(rms, 3)) return np.array(rmsds) # Private methods def _map_reference_residues(self): """Puts each residue in the site in the correct order, according to the reference site, using the individual residue mapping to a reference residue. 
Wherever a mapping cannot be found, an empty residue is assigned to that position""" if self.reference_site is None: return for reference_residue in self.reference_site: found = False for res in self: if reference_residue == res.reference_residue: found = True if not found: gap = PdbResidue(mcsa_id=self.mcsa_id, pdb_id=self.pdb_id, chiral_id=reference_residue.chiral_id, dummy_structure=True) gap.reference_residue = reference_residue self.add(gap) self._reorder() return def _get_func_atoms(self, allow_symmetrics=True, omit=None): """Gets atoms and coordinates for superposition and atom reordering calculations Args: allow_symmetrics: If True, equivalent residues and atoms get the same id string, according to the definitions in residue_definitions.py (EQUIVALENT_ATOMS) omit: Residues to exclude Returns: atoms: A NumPy array of atom identifier strings of type 'N.RES.AT' where N is the residue serial number in the .pdb file (consistent among all sites), RES is the residue name and AT is the atom name coords: A NumPy array of the atomic coordinates """ atoms = [] coords = [] for i, res in enumerate(self): if omit: if i in omit: continue if not res.structure: return np.array(atoms), np.array(coords) for atom in res.structure: resname = res.resname.upper() if allow_symmetrics: if res.has_main_chain_function: resname = 'ANY' if not res.is_standard: resname = 'PTM' atmid = '{}.{}'.format(resname, atom.name) if atmid in RESIDUE_DEFINITIONS: if allow_symmetrics: if atmid in EQUIVALENT_ATOMS: atmid = EQUIVALENT_ATOMS[atmid] atoms.append('{}.{}'.format(i, atmid)) coords.append(atom.get_coord()) try: atoms = np.array(atoms, dtype=object) coords = np.stack(coords, axis=0) except ValueError: return None, None return atoms, coords def _reorder(self): """Residue reordering routine for _map_reference_residues""" if self.reference_site is None: return reorder = [] for i, reference_residue in enumerate(self.reference_site): for j, res in enumerate(self): if i == j and reference_residue 
== res.reference_residue: reorder.append(i) elif i != j and reference_residue == res.reference_residue: reorder.append(j) self.residues = [self.residues[i] for i in reorder] # If site contains chiral residues, reorder them by chain chiral = self.get_chiral_residues() if chiral: for pair in chiral: p = self.residues[pair[0]] q = self.residues[pair[1]] if q.chain < p.chain: self.residues[pair[0]], self.residues[pair[1]] = \ self.residues[pair[1]], self.residues[pair[0]] return def _get_ligand_search_centers(self, radius=4): """Gets atom coordinates from catalytic residues, and interpolates the empty space between distant residues, by calculating the center of geometry of the two residues. Radius is used to identify distant residues in between which an extra center will be added.""" centers = [] seen = set() for p in self: if p.is_gap or p.structure is None: continue for atom in p.structure.get_unpacked_list(): centers.append(atom.get_coord()) p_centroid = p.structure.center_of_mass(geometric=True) for q in self: if p is q or (q.id, p.id) in seen or q.is_gap or q.structure is None: continue seen.add((p.id, q.id)) q_centroid = q.structure.center_of_mass(geometric=True) dist = p.get_distance(q, kind='min') if 2 * radius <= dist <= 4 * radius: dummy_coords = np.mean([p_centroid, q_centroid], axis=0) centers.append(dummy_coords) return centers @staticmethod def _cleanup_list(reslist): """Finds duplicate residues of different funclocs and makes a single one with two funclocs. 
Returns a new list without redundant residues""" new_reslist = [] seen = set() ignore = set() for p in reslist: for q in reslist: if p == q or (q.full_id, p.full_id) in seen: continue if p.is_equivalent(q, by_chiral_id=False, by_chain=True): if p.funclocs != q.funclocs: new_res = p.copy(include_structure=True) new_res.funclocs = [p.funclocs[0], q.funclocs[0]] new_reslist.append(new_res) ignore.add(p.full_id) ignore.add(q.full_id) seen.add((p.full_id, q.full_id)) for p in reslist: if p.full_id not in ignore and p not in new_reslist: new_reslist.append(p) return new_reslist @staticmethod def _get_assembly_residues(reslist, parent_structure): """ Makes a new residue list of all equivalent residues found in identical assembly chains. Also applies an auth_resid correction where residues in identical chains might have a different auth_resid (usually of 1xxx or 2xxx for chains A and B Args: reslist: The residue list to be enriched parent_structure: BioPython Structure object of the parent structure Returns: An enriched list of residues with mapped structures. 
""" new_reslist = [] for res in reslist: res_structure = None for chain in parent_structure[0]: if res.chain != chain.get_id()[0]: continue # If we have a standard residue if res.is_standard: try: res_structure = chain[res.auth_resid] except KeyError: try: res_structure = chain[res.corrected_auth_resid] except KeyError: try: res_structure = chain[res.resid] except KeyError: continue if res_structure.resname != res.resname.upper(): continue # If we have a modified residue else: for _res in chain: if _res.get_id()[1] == res.auth_resid: res_structure = _res new_res = res.copy(include_structure=False) new_res.chain = chain.get_id() new_res.structure = res_structure new_reslist.append(new_res) return new_reslist @staticmethod def _get_seeds(reslist): """Finds residues in a list that can be used as seeds when building multiple active sites""" seeds = [] # Set a residue as reference ref = None for res in reslist: if res.auth_resid is None or res.structure is None: continue # Check if residue has any close neighbours -- If not, skip it skip = True for other in reslist: if res == other: continue try: if res.get_distance(other, kind='min') < 8: skip = False break except TypeError: continue if skip: continue ref = res break if ref is None: return seeds # Get all equivalents of ref residue and make them seeds for res in reslist: if res.is_equivalent(ref): if res.structure is None or res in seeds: continue seeds.append(res) return seeds @staticmethod def _get_nearest_equivalent(self, other, reslist, site): """Gets the closest equivalent of 'other' to 'self', if there are multiple equivalents in the residue list""" equivalents = [] for res in reslist: if res.structure is None: continue if res.is_equivalent(self): equivalents.append(res) result = None min_dist = 999 for eq in equivalents: #Check if the same residue is already in the site if site.contains_equivalent(eq): continue dist = eq.get_distance(other, kind='min') if dist < min_dist: result = eq min_dist = dist return 
result @staticmethod def _remove_redundant_sites(sitelist, cutoff=0): """Cleans a list of sites by removing duplicates or similar ones according to an RMSD cutoff""" seen = set() reject = set() for p in sitelist: if p.has_missing_functional_atoms or len(p) != len( p.reference_site): reject.add(p.id) continue for q in sitelist: if q.has_missing_functional_atoms or len(q) != len( q.reference_site): reject.add(q.id) continue if p.id == q.id or (q.id, p.id) in seen: continue seen.add((p.id, q.id)) _, _, _, rms = p.fit(q) if (p.has_identical_residues(q) and rms < 0.01) or rms < cutoff: reject.add(q.id) nr = [] for site in sitelist: if site.id not in reject and site not in nr: nr.append(site) return nr @staticmethod def _mark_unclustered(sitelist): """Cleans the list of catalytic sites from the same PDB by rejecting sites that might have insanely outlying residues""" try: ref_dists = sitelist[0].reference_site.get_distances(kind='min') ref_dists = np.nan_to_num(ref_dists, nan=999) ref_dists = np.where(ref_dists < 8, 8, ref_dists) except IndexError: return False for p in sitelist: p.is_sane = True p_dists = np.nan_to_num(p.get_distances(kind='ca'), nan=0) if not np.all((p_dists < 3 * ref_dists)): p.is_sane = False continue else: for q in sitelist: if p.id == q.id or q.is_sane == False: continue q_dists = np.nan_to_num(q.get_distances(kind='ca'), nan=999) q_dists = np.where(q_dists < 8, 8, q_dists) if not np.all((p_dists < 1.3 * q_dists)): p.is_sane = False return True @staticmethod def _super(p_coords, q_coords, cycles=1, cutoff=999, weighted=False, scaling_factor=None): sup = Superimposer() sup.set(p_coords, q_coords, cycles, cutoff, scaling_factor) if weighted: sup.run_weighted() else: sup.run_unweighted() return sup.rot, sup.tran, np.round(sup.rms, 3), np.round(sup.rms_all, 3) @staticmethod def _rmsd(p_coords, q_coords): """Calculates rmsd on two coordinate sets (NumPy arrays) WITHOUT transformation and minimization""" diff = np.square(np.linalg.norm(p_coords - 
q_coords, axis=1)) return np.sqrt(np.sum(diff) / diff.size) @staticmethod def _transform(coords, rot, tran): """Rotates and translates a set of coordinates (NxD NumPy array)""" return np.dot(coords, rot) + tran
class Disordered_Fragment(object):
    """
    Region of a component that has no atom coordinates assigned.

    The region is described by its residue range and sequence and is modelled
    with pseudoresidues, each receiving one pseudoatom (CA for proteins,
    C4' otherwise).
    """

    def __init__(self, start_pos=None, stop_pos=None, sequence=None):
        """
        Parameters:
        -----------
            start_pos : residue number of first disordered residue
            stop_pos  : residue number of last disordered residue
            sequence  : sequence of disordered fragment

        Falsy arguments (None, 0, "") fall back to 0, 0 and "".
        """
        self.start_pos = start_pos or 0
        self.stop_pos = stop_pos or 0
        self.sequence = sequence or ""
        self.radius = 1.0  # pseudoatoms radius
        self.max_sphere_radius = 10.0  # radius of sphere defining volume simulation area
        self.pseudoresidues = []  # list of residues objects
        self.fragment_type = "internal"  # cterm/nterm/internal/simulated_volume
        # lattice for disordered region structure (only residues, no atoms)
        self.fragment_lattice = None

    def __str__(self):
        return "%s %s %s %s %s" % (self.start_pos, self.stop_pos,
                                   self.sequence, self.radius,
                                   self.fragment_type)

    def add_component_structure(self, struct):
        """
        stores the structure representing the whole component
        """
        self.structure = struct

    def add_fragment_structure(self, fragment):
        """
        stores the piece of structure representing the disordered region
        """
        self.fragment_lattice = fragment

    def add_pseudoatoms_to_structure(self, pseudoatoms, moltype):
        """
        attaches the i-th pseudoatom to the i-th residue of the fragment lattice

        Raises IndexError when there are more pseudoatoms than residues.
        """
        residues = None
        for index, pa in enumerate(pseudoatoms):
            if residues is None:
                # Built once, on first use, instead of once per pseudoatom.
                residues = list(self.fragment_lattice.get_residues())
            self.add_pa_to_structure(pa, residues[index], moltype)

    def add_pa_to_structure(self, pa, resi, moltype):
        """
        builds an atom at the pseudoatom position (pa.x, pa.y, pa.z) and adds
        it to resi; the atom is named CA for "protein", C4' for anything else
        """
        coord = array([pa.x, pa.y, pa.z])
        if moltype == "protein":
            new_atom = PyryAtom('CA', coord, 0, 1, ' ', ' CA', 1)
        else:
            new_atom = PyryAtom("C4'", coord, 0, 1, ' ', " C4'", 1)
        new_atom.assign_vdw()
        new_atom.assign_molweight()
        resi.add(new_atom)

    #@TODO needs testing!!!!!
    #@TODO need to renumber residues somehow!!
    def add_fragment_to_original_structure(self, component, structure, res_nr,
                                           fr_type):
        """
        moves the fragment residues into the first chain of structure (or of
        component.pyrystruct.struct when structure is falsy), renumbering
        them consecutively starting at res_nr

        for fr_type == "nterm" the residues are processed in descending
        order of their current residue number, otherwise in lattice order
        """
        #@TODO will have to be changed when multichain components come!!!
        if structure:
            chain = list(structure.get_chains())[0]
        else:
            chain = list(component.pyrystruct.struct.get_chains())[0]
        residues = list(self.fragment_lattice.get_residues())
        if fr_type == "nterm":
            # Local name chosen so the lambda does not shadow the Residue class.
            residues.sort(key=lambda res: res.id[1], reverse=True)
        for resi in residues:
            resi.id = (" ", res_nr, " ")
            res_nr += 1
            chain.add(resi)
            self.add_pseudoresidue(resi)

    def add_pseudoresidue(self, pr):
        """
        adds pseudoresidue object to list of pseudoresidues
        """
        self.pseudoresidues.append(pr)

    def build_structure(self, sequence, start_index, moltype):
        """
        adds one empty residue per sequence letter to the first chain of the
        fragment lattice, numbered consecutively from start_index

        residue names come from AMINOACIDS for "protein", from NUCLEOTIDES
        otherwise
        """
        new_chain = list(self.fragment_lattice.get_chains())[0]
        for resi in sequence:
            resi_id = (" ", start_index, " ")
            if moltype == "protein":
                resi_name = AMINOACIDS[resi.upper()]
            else:
                resi_name = NUCLEOTIDES[resi.upper()]
            new_chain.add(Residue(resi_id, resi_name, " "))
            start_index += 1

    def clean_fragment(self):
        """
        forgets the current fragment lattice
        """
        self.fragment_lattice = None

    def remove_pseudoresidues(self, structure):
        """
        removes pseudoresidues simulated during previous mutation in order
        to prepare conditions for new simulated pseudoresidues to be attached
        and scored

        Residues are matched by residue number in the first chain of
        structure. Nothing is detached for "simulated_volume" fragments or
        when structure is falsy; the pseudoresidue list is emptied in any case.
        """
        #@TODO must be changed when hybrids components will be considered
        if self.fragment_type != "simulated_volume" and structure:
            chain = list(structure.get_chains())[0]
            pseudo_numbers = set(r.id[1] for r in self.pseudoresidues)
            # Collect first, detach afterwards: the chain must not change
            # while it is being iterated.
            remove = [resi for resi in chain if resi.id[1] in pseudo_numbers]
            for resi in remove:
                chain.detach_child(resi.id)
        self.pseudoresidues = []

    def create_new_chain(self, id):
        """
        replaces the fragment lattice with a new structure named id holding
        model 0 and a single empty chain named id
        """
        self.fragment_lattice = Structure(id)
        my_model = Model(0)
        self.fragment_lattice.add(my_model)
        my_chain = Chain(id)
        my_model.add(my_chain)  # what if more chains in one component?

    def create_simulated_volume(self, start_pos, fasta_seq, struct):
        """
        configures this fragment to represent regions with not assigned
        atom coordinates as a simulated volume
        """
        self.set_fragment_type("simulated_volume")
        self.create_new_chain(struct.chain)
        self.build_structure(fasta_seq, start_pos, struct.moltype)
        self.set_fragment_sequence(fasta_seq)
        self.get_pseudoatom_radius(struct)
        ##@TODO-CHECK: how to assess radius of simulation sphere??
        self.calculate_max_sphere_radius(struct)

    #@TODO-CHECK: wouldn't one fragment type be enough??
    def create_simulated_fragment(self, struct, fasta_seq):
        """
        sets radius, fragment type and simulation sphere radius of this
        fragment
        """
        #@TODO-CHECK: calculate max sphere radius according to moltype and resi number!!
        self.get_pseudoatom_radius(struct)
        self.__check_fragment_type(fasta_seq)
        self.calculate_max_sphere_radius(struct)

    def __check_fragment_type(self, fasta_seq):
        """
        classifies the fragment as cterm, internal or nterm from its
        position within fasta_seq

        internal fragments spanning 30 or more residues are treated as cterm
        (simulated as a "grape"); when no rule matches the current type is
        kept
        """
        seq_length = len(fasta_seq)
        if self.stop_pos == seq_length:
            self.fragment_type = "cterm"
        elif self.stop_pos < seq_length - 1 and self.start_pos > 1:
            self.fragment_type = "internal"
            if self.stop_pos - self.start_pos >= 30:
                self.fragment_type = "cterm"
                # Single pre-formatted argument: same output on Python 2 and 3.
                print("internal fragment simulated as GRAPE %s %s" %
                      (self.start_pos, self.stop_pos))
        elif self.start_pos == 1:
            self.fragment_type = "nterm"

    def get_pseudoatom_radius(self, struct):
        """
        radius is averaged distance between CA or C4' atoms in ideal helix/2
        to represent volume of single CA/C4' atom

        struct.moltype is matched case-insensitively; unknown types leave
        the radius unchanged
        """
        if struct.moltype.lower() == "protein":
            self.radius = 1.9  # or 1.72 as C atom
        elif struct.moltype.upper() == "DNA":
            self.radius = 3.6
        elif struct.moltype.upper() == "RNA":
            self.radius = 3.36

    def calculate_max_sphere_radius(self, struct):
        """
        sets the simulation sphere radius from the per-residue distance of
        the component type (in Angstroms)

        radius given (3.8, 7.2, 6.72) is averaged distance between
        CA or C4' atoms in ideal helix
        """
        if struct.moltype.lower() == "protein":
            self.__set_max_sphere_radius_for_moltype(3.8)
        elif struct.moltype.upper() == "DNA":
            self.__set_max_sphere_radius_for_moltype(7.2)
        elif struct.moltype.upper() == "RNA":
            self.__set_max_sphere_radius_for_moltype(6.72)

    def __set_max_sphere_radius_for_moltype(self, mol_radius):
        """
        terminal fragments get half of sequence length * mol_radius,
        simulated volumes the full product; other fragment types keep
        their current value
        """
        if self.fragment_type == "cterm" or self.fragment_type == "nterm":
            self.max_sphere_radius = (len(self.sequence) * mol_radius) / 2
        elif self.fragment_type == "simulated_volume":
            self.max_sphere_radius = (len(self.sequence) * mol_radius)

    def set_max_sphere_radius(self, area_radius):
        """
        sets radius of simulation area for Volume Simulator
        """
        self.max_sphere_radius = area_radius

    def set_anchor_residues(self, resi1, resi2=None):
        """
        defines first residue for volume simulator - here simulation should start
        if given defines last residue for volume simulator - here simulation should finish
        """
        self.start_resi = resi1
        self.end_resi = resi2

    def set_modeling_disordered_fragment(self, component, struct, chain):
        """
        removes old pseudoresidues from struct (or from
        component.pyrystruct.struct when struct is falsy) and rebuilds an
        empty fragment lattice for the fragment sequence

        #@TODO: split into 2 separate functions: fragment modelling and volume modelling
        #@TODO: tidy up simulate fragments!! add simulation for internal and N-terminal fragments
        #@TODO: check the numbering of residues added by the simulator
        """
        # The "simulated_volume" and regular paths performed exactly the
        # same steps, so a single code path serves both.
        target = struct if struct else component.pyrystruct.struct
        self.remove_pseudoresidues(target)  # remove old Pseudoatoms positions
        self.create_new_chain(chain)
        self.build_structure(self.sequence, self.start_pos,
                             component.pyrystruct.moltype)

    def set_fragment_sequence(self, seq):
        """
        sequence of disordered fragment
        """
        self.sequence = seq

    def set_pseudoatom_radius(self, radius):
        """
        sets pseudoatom radius; different for nucleotide and amino acids; in Angstroms
        """
        self.radius = radius

    def set_fragment_type(self, frag_type):
        """
        sets fragment type. Can be internal when disordered region is
        inside the component's structure or terminal when it is located
        on C or N termini
        """
        self.fragment_type = frag_type
class PyRyStructure(object):
    """
    class represents structure as entity (very wide definition)
    used for storing information about structures, creating Bio.PDB
    structures, saving structure files etc.
    """

    def __init__(self, structure=None):
        # A falsy structure argument is stored as None.
        self.struct = structure if structure else None
        self.sequence = ''  # sequence taken from structure
        #----------------will decide on one of these 3 ------------------------
        self.center_of_mass = []  # [x,y,z] coords of center of mass
        self.geometric_center = []  # geometric centre
        self.center = None  # actual center of given complex component
        #----------------------------------------------------------------------
        self.chain = ''  # chain name from structure file
        self.moltype = ''  # protein, DNA, RNA

    def __str__(self):
        return "%s %s %s %s %s" % (self.struct, self.chain,
                                   self.center_of_mass, self.moltype,
                                   self.sequence)

    def add_chain_to_struct(self, chain_id):
        """
        adds a new, empty chain to the first model of self.struct
        Parameters:
        -----------
            chain_id    : chain name
        """
        chain = Chain(chain_id)
        self.struct[0].add(chain)

    def add_residues_to_structure(self, struct, chain_id, chain2_id):
        """
        adds residues from struct to a given structure (self.struct)
        Parameters:
        -----------
            struct      : template structure object with residues which will
                          be added to self.struct
            chain_id    : name of template chain
            chain2_id   : name of the receiving chain in self.struct
        """
        target_chain = self.struct[0][chain2_id]
        for res in struct[0][chain_id].child_list:
            target_chain.add(res)

    def calculate_atom_atom_distance(self, atom1, atom2):
        """
        calculates distance between two atoms
        Parameters:
        -----------
            atom1, atom2    : Bio.PDB.Atom entities
        Returns:
        ---------
            atom1 - atom2
        Raises:
        -------
            PyRyStructureError if self.struct is not a Bio.PDB entity
        """
        # is_structure is a method; calling it as a bare name raised NameError.
        self.is_structure()
        return atom1 - atom2

    def calculate_centre_of_mass(self, entity=None, geometric=False):
        """
        calculates centre of mass for given structure
        Returns gravitic or geometric center of mass.
        Geometric assumes all masses are equal (geometric=True)
        Defaults to Gravitic.
        Parameters:
        -----------
            entity      : iterable of atoms, used only when self.struct is
                          not a Bio.PDB entity
            geometric   : optional; when True every atom weighs 1.0
        Returns:
        ---------
            centre of mass coordinates as an [x,y,z] array (also stored in
            self.center_of_mass)
        Raises:
        -------
            ValueError  : if wrong object is given as a target or the total
                          mass is zero (e.g. no atoms)
        """
        wrong_target = ('Center of Mass can only be calculated from \n'
                        'the following objects:Structure, Model, Chain, '
                        'Residue, list of Atoms.')
        if isinstance(self.struct, Entity.Entity):
            # Structure, Model, Chain, Residue
            atom_list = self.struct.get_atoms()
        elif hasattr(entity, '__iter__'):
            # List of Atoms; materialised so the check cannot exhaust it.
            atom_list = list(entity)
            if not any(x.level == 'A' for x in atom_list):
                raise ValueError(wrong_target)
        else:
            # Some other weirdo object
            raise ValueError(wrong_target)

        # Must be an array: "+=" on a plain list would extend the list with
        # the coordinates instead of summing them.
        new_centre = array([0., 0., 0.])
        whole_mass = 0.0
        for atom in atom_list:
            weight = 1.0 if geometric else atom.molweight
            atom_centre = array([float(atom.coord[0]),
                                 float(atom.coord[1]),
                                 float(atom.coord[2])])
            whole_mass += weight
            new_centre += atom_centre * weight
        if whole_mass == 0:
            raise ValueError('Center of Mass cannot be calculated: '
                             'total mass of the atoms is zero.')
        new_centre /= whole_mass
        self.center_of_mass = new_centre
        return self.center_of_mass

    def create_PDB_obj(self, id, filename):
        """
        creates Bio.PDB object from pdb file
        Parameters:
        -----------
            id          : name of structure
            filename    : file name
        """
        parser = PDBParser()
        self.struct = parser.get_structure(str(id), filename)

    def create_new_structure(self, name, chain_id):
        """
        creates new Bio.PDB structure object with model 0 and one chain
        Parameters:
        -----------
            name        : structure name
            chain_id    : chain name (e.g. A, B, C)
        """
        self.struct = Structure(name)
        my_model = Model(0)
        my_chain = Chain(chain_id)
        self.struct.add(my_model)
        self.struct[0].add(my_chain)

    def get_chainname(self):
        """
        stores the name of the first chain of self.struct in self.chain
        """
        self.chain = list(self.struct.get_chains())[0].id

    def get_mol_sequence(self):
        """
        retrieves struct sequence as one letter code
        Returns:
        ---------
            self.sequence : sequence of given structure in one letter code

        Letters are appended to the current self.sequence.
        """
        ##----must be included in tests!!!--------------------
        for resi in self.struct.get_residues():
            resi_name = resi.resname.strip().upper()
            # add one letter nucleotide names
            if len(resi_name) == 1 and resi_name in RESNAMES.values():
                self.sequence += resi_name
            # add hetatms with modifications
            elif resi_name in to_one_letter_code:
                self.sequence += to_one_letter_code[resi_name]
            # do not add ions and ligands into sequence
            elif resi_name in LIGANDS:
                pass
            # if anything else appeared include as X
            else:
                self.sequence += "X"
        return self.sequence

    def get_moltype(self):
        """
        based on the first residue determines if a certain component is
        DNA, RNA or protein
        Returns:
        ---------
            self.moltype
        Raises:
        -------
            PyRyStructureError if a three letter residue name is neither in
            AMINOACIDS values nor in RESNAMES keys
        """
        res = list(self.struct.get_residues())[0]
        resname = res.resname.strip()
        if len(resname) == 3:
            if resname in AMINOACIDS.values():
                self.moltype = 'protein'
            elif resname not in RESNAMES:
                raise PyRyStructureError("Wrong 3letter name", resname)
        else:
            for at in res:
                atom_name = at.fullname.strip()
                if atom_name == "CA":
                    self.moltype = 'protein'
                    break
                elif atom_name == "C4'" or atom_name == "C4*":
                    for atom in res.child_list:
                        # The 2' hydroxyl oxygen marks RNA; the old-style
                        # "*" spelling is accepted just as it is for C4.
                        if atom.fullname.strip() in ("O2'", "O2*"):
                            self.moltype = "RNA"
                            break
                    if self.moltype == "":
                        self.moltype = "DNA"
        return self.moltype

    def is_structure(self):
        """
        checks if a given structure is Bio.PDB structure object
        Returns:
        ---------
            True
        Raises:
        ------
            PyRyStructureError : if self.struct is not Bio.PDB object
        """
        if isinstance(self.struct, Entity.Entity):
            # Structure, Model, Chain, Residue
            return True
        # One-element tuple keeps "%" safe when self.struct is itself a tuple.
        raise PyRyStructureError('%s should be one of\n'
                                 'the following objects:Structure, Model, '
                                 'Chain, Residue, \n'
                                 'list of Atoms.' % (self.struct,))

    def set_chain_name(self, chain):
        """sets chain name"""
        self.chain = chain

    def set_moltype(self, moltype):
        """sets molecule type (protein, DNA, RNA)"""
        self.moltype = moltype

    def set_structure(self, struct):
        """sets the wrapped structure object"""
        self.struct = struct

    def set_pyrystructure(self, structure=None):
        """
        sets structure as PyRyStructure attribute and derives sequence (only
        when none is stored yet), chain name and molecule type from it
        Parameters:
        -----------
            structure   : Bio.PDB structure object, used only when no
                          structure is stored yet
        """
        if self.struct is None:
            self.struct = structure
        if self.sequence == '':
            self.get_mol_sequence()
        self.get_chainname()
        self.get_moltype()

    def set_sequence(self, seq):
        """sets sequence"""
        self.sequence = seq

    def write_structure(self, filename):
        """
        Writing structure to the pdb_file, saving changed coordinates
        Parameters:
        -----------
            filename    : final name of structure file
        """
        out = PDBIO()
        out.set_structure(self.struct)
        out.save(filename)
chain = Chain("A") structure = Structure("ref") num_count = 0 for i in range(0,shape(points)[0]): num_count = num_count +1 res_id = (' ',num_count,' ') residue = Residue(res_id,'ALA',' ') cur_coord = tuple(points[i]) bfactor = bfactors[i] atom = Atom('CA',cur_coord,bfactor,0,' ','CA',num_count,'C') residue.add(atom) chain.add(residue) model.add(chain) structure.add(model) # -------------------------------------------------------------------- io=PDBIO() io.set_structure(structure) if ( args['dst'] is None): fn = sys.stdout io.save(fn) if ( args['link'] ): for i in range(1,shape(points)[0]): fn.write( "CONECT%5d%5d\n" % (i, i+1)) else: fn = args['dst'] io.save(fn) fout = open(fn,"a") if (args['link'] ): for i in range(1,shape(points)[0]):
def normalize_structure(input_path: str,
                        pdb_id: str,
                        model_id: int,
                        chain_id: str,
                        primary: str,
                        mask: str,
                        save=True,
                        verbose=True):
    """Normalize one chain of a PDB file and check it against a sequence.

    The chain is passed through ``normalize_chain`` and its one-letter
    sequence must equal the residues of ``primary`` whose ``mask`` symbol
    is ``'+'``.

    Args:
        input_path: Path of the PDB file to parse.
        pdb_id: Identifier given to the parsed structure.
        model_id: Requested model id. When absent, the nearest lower model
            id present in the structure is used instead.
        chain_id: Id of the chain to normalize.
        primary: Expected primary sequence.
        mask: Per-residue mask of ``'+'`` (known) and ``'-'`` (skipped).
        save: Write the result to ``input_path + '.norm'`` when true.
        verbose: Print a note when a substitute model is used.

    Returns:
        The output path when ``save`` is true, otherwise the new structure
        holding the selected model with only the normalized chain.

    Raises:
        ValueError: Model or chain not found, or a residue differs from the
            expected one.
        ChainLengthError: Normalized and masked sequences differ in length.
        AssertionError: Empty ``primary`` or ``mask``, or a mask symbol
            other than ``'+'`` / ``'-'``.
    """
    assert primary
    assert mask

    with warnings.catch_warnings(record=True):
        warnings.simplefilter("ignore", PDBConstructionWarning)
        parser = PDBParser()
        structure = parser.get_structure(pdb_id, input_path)

    if model_id in structure.child_dict:
        model = structure.child_dict[model_id]
    else:
        model = None
        # Walk downwards and stop at the first hit, so the substitute is the
        # closest lower model and the message names the model actually used.
        for try_model_id in range(model_id - 1, -1, -1):
            if try_model_id in structure.child_dict:
                model = structure.child_dict[try_model_id]
                if verbose:
                    print('Supposing model {} is {}...'.format(
                        try_model_id, model_id))
                break
        if model is None:
            raise ValueError(
                'model "{}" not found in "{}", options are {}'.format(
                    model_id, pdb_id, list(structure.child_dict.keys())))

    if chain_id not in model.child_dict:
        raise ValueError(
            'chain "{}" not found in "{}" model "{}", options are {}'.
            format(chain_id, pdb_id, model_id, list(model.child_dict.keys())))
    chain = model.child_dict[chain_id]

    new_chain = normalize_chain(chain)

    # one-letter sequence of the normalized chain; residues with unknown
    # names are skipped
    letters = []
    for residue in new_chain:
        try:
            letters.append(resname_to_abbrev(residue.resname))
        except UnknownResnameError:
            continue
    normalized = ''.join(letters)

    # extract the known primary sequence using the mask
    known = []
    for r, m in zip(primary, mask):
        if m == '-':
            continue
        assert m == '+'
        known.append(r)
    masked_primary = ''.join(known)

    # ensure the sequence lengths match
    if len(normalized) != len(masked_primary):
        raise ChainLengthError(len(normalized), len(masked_primary))

    # ensure residue identities match
    for i, (got, expected) in enumerate(zip(normalized, masked_primary)):
        if got != expected:
            raise ValueError(
                'mismatch residue at position {} (got {}, expected {})'.
                format(i, got, expected))

    new_model = Model(model.id)
    new_model.add(new_chain)
    new_structure = Structure(structure.id)
    new_structure.add(new_model)

    if not save:
        return new_structure

    out_path = input_path + '.norm'
    io = PDBIO()
    io.set_structure(new_structure)
    io.save(out_path)
    return out_path
if atom.coord[0] < atom2.coord[0]: atom3.coord[0] += xDistancePerStep elif atom.coord[0] > atom2.coord[0]: atom3.coord[0] -= xDistancePerStep if atom.coord[1] < atom2.coord[1]: atom3.coord[1] += yDistancePerStep elif atom.coord[1] > atom2.coord[1]: atom3.coord[1] -= yDistancePerStep if atom.coord[2] < atom2.coord[2]: atom3.coord[2] += zDistancePerStep elif atom.coord[2] > atom2.coord[2]: atom3.coord[2] -= zDistancePerStep yield newModel if startEndInclusive: final.id = steps + 1 yield final modelFrame = 0 for model in interpolate(structure[0], structure[1], 10, True): result = Structure('result') result.add(model) io = PDBIO() io.set_structure(result) io.save('frames/out_' + str(modelFrame) + '.pdb') modelFrame += 1
}, { 'name': 'C5', 'coord': PDB.Atom.array([66.402, 44.364, 11.291], 'f'), 'bfactor': 44.20, 'occupancy': 1.0, 'altloc': ' ', 'fullname': 'C5', 'serial_number': 7 }, { 'name': 'C6', 'coord': PDB.Atom.array([65.095, 44.589, 11.192], 'f'), 'bfactor': 44.33, 'occupancy': 1.0, 'altloc': ' ', 'fullname': 'C6', 'serial_number': 8 }] my_structure.add(my_model) my_model.add(my_chain) my_chain.add(my_residue) for atom in atoms: my_atom = Atom(atom['name'], atom['coord'], atom['bfactor'], atom['occupancy'], atom['altloc'], atom['fullname'], atom['serial_number']) my_residue.add(my_atom) out = PDBIO() out.set_structure(my_structure) out.save('my_new_structure.pdb')
structure = Structure(refid) model_ref = Model(1) chain_ref = Chain("A") points_ref = ReadXYZ(ref_ptsfilename,scale) num_count = 0 for i in range(0,shape(points_ref[IndexList])[0]): num_count = num_count +1 res_id = (' ',num_count,' ') residue = Residue(res_id,'ALA',' ') cur_coord = tuple(points_ref[IndexList[i]]) atom = Atom('CA',cur_coord,0,0,' ',num_count,num_count,'C') residue.add(atom) chain_ref.add(residue) model_ref.add(chain_ref) structure.add(model_ref) #-------------------------------------------------------------------- altid = "alt" structure_alt = Structure(refid) model_alt = Model(2) chain_alt = Chain("A") points_alt = ReadXYZ(alt_ptsfilename,scale) num_count = 0 for i in range(0,shape(points_alt[IndexList])[0]): num_count = num_count +1 res_id = (' ',num_count,' ') residue = Residue(res_id,'ALA',' ') cur_coord = tuple(points_alt[IndexList[i]]) atom = Atom('CA',cur_coord,0,0,' ',num_count,num_count,'C')
if atom.coord[0] < atom2.coord[0]: atom3.coord[0] += xDistancePerStep elif atom.coord[0] > atom2.coord[0]: atom3.coord[0] -= xDistancePerStep if atom.coord[1] < atom2.coord[1]: atom3.coord[1] += yDistancePerStep elif atom.coord[1] > atom2.coord[1]: atom3.coord[1] -= yDistancePerStep if atom.coord[2] < atom2.coord[2]: atom3.coord[2] += zDistancePerStep elif atom.coord[2] > atom2.coord[2]: atom3.coord[2] -= zDistancePerStep yield newModel if startEndInclusive: final.id = steps + 1 yield final for model in interpolate(initial[0], final[0], 10, True): result.add(deepcopy(model)) io = PDBIO() io.set_structure(result) io.save("out.pdb")
def visualize_2DA(apo_2DA, holo_2DA, paper_apo_spans):
    """ Writes superimposed holo structure to a file, prints Pymol script which can be directly pasted in pymol.

    Printed Pymol script will:
    1) automatically load both structures (superimposed holo from filesystem, apo from the internet)
    2) create objects and selections for domains, and the two-domain arrangements
    3) color the selections by domain, apo/holo and paper/ours
        - colors - ours more saturation, paper faded
            - red, yellow apo (first and second domain respectively)
            - green, blue holo
    4) provide example usage in the last script paragraph

    The holo structure is superimposed onto apo using the first domain and
    saved as mmCIF under OUTPUT_DIR. paper_apo_spans is indexed as
    [domain1_spans, domain2_spans], each a sequence of (from, to) pairs that
    are translated to label seq ids through the apo chain mapping.
    """
    # load the structure from file
    a = parse_mmcif(apo_2DA.pdb_code)
    h = parse_mmcif(holo_2DA.pdb_code)
    apo = a.structure
    holo = h.structure

    ###### inserted from main
    apo_mapping = a.bio_to_mmcif_mappings[0][apo_2DA.d1.chain_id]
    holo_mapping = h.bio_to_mmcif_mappings[0][holo_2DA.d1.chain_id]

    # crop polypeptides to longest common substring
    c1_common_seq, c2_common_seq = get_longest_common_polypeptide(
        a.poly_seqs[apo_mapping.entity_poly_id],
        h.poly_seqs[holo_mapping.entity_poly_id])
    c1_label_seq_ids = list(c1_common_seq.keys())
    c2_label_seq_ids = list(c2_common_seq.keys())

    # computed once; used below to carry the paper domains over to holo
    label_seq_id_offset = c2_label_seq_ids[0] - c1_label_seq_ids[0]
    ###### end of inserted part

    # get residues of the first domain, in both apo and holo structures
    apo_d1 = DomainResidues.from_domain(apo_2DA.d1, apo[0], apo_mapping)
    holo_d1 = DomainResidues.from_domain(holo_2DA.d1, holo[0], holo_mapping)

    # superimpose holo onto apo, using the first domain
    superimposed_holo_model = superimpose_structure(holo[0], holo_d1, apo_d1)

    # save the structure
    name = holo.id + f'_{holo_d1.domain_id}onto_{apo_d1.domain_id}'
    io = MMCIFIO()
    superimposed_holo = Structure(name)
    superimposed_holo.add(superimposed_holo_model)
    io.set_structure(superimposed_holo)
    sholo_file_path = Path(OUTPUT_DIR, name + '.cif')
    io.save(str(sholo_file_path), preserve_atom_numbering=True)

    def get_resi_selection(spans):
        # "(resi a-b or resi c-d ...)" for a list of (from, to) spans
        selection = [f'resi {from_}-{to}' for from_, to in spans]
        return '(' + ' or '.join(selection) + ')'

    # convert paper spans to label seqs, so we can show them in Pymol
    def get_paper_domain(d: DomainResidueMapping, paper_spans, residue_id_mapping):
        # translate spans to label seq ids and return a domain object
        spans = np.array(paper_spans)
        segment_beginnings = list(map(residue_id_mapping.find_label_seq, spans[:, 0].tolist()))
        segment_ends = list(map(residue_id_mapping.find_label_seq, spans[:, 1].tolist()))

        logger.debug(segment_beginnings)
        logger.debug(segment_ends)
        return DomainResidueMapping(d.domain_id, d.chain_id, segment_beginnings, segment_ends)

    logger.debug(paper_apo_spans)
    # [d1, d2] where d1 [(), (),...]
    paper_apo_drm1 = get_paper_domain(apo_2DA.d1, paper_apo_spans[0], apo_mapping)
    paper_apo_drm2 = get_paper_domain(apo_2DA.d2, paper_apo_spans[1], apo_mapping)

    paper_holo_drm1 = DomainResidueMapping.from_domain_on_another_chain(
        paper_apo_drm1, holo_d1.chain_id, label_seq_id_offset)
    # same chain, for now, as in d1
    paper_holo_drm2 = DomainResidueMapping.from_domain_on_another_chain(
        paper_apo_drm2, holo_d1.chain_id, label_seq_id_offset)

    # create highlight script (by the spans, or just create multiple selections)
    # copy the 2 structures to 4 (paper spans vs our spans), so we can color them differently
    # select only the domains (2), and make only them visible
    sholo = superimposed_holo

    pymol_script = f"""
fetch {apo.id}
load {sholo_file_path.absolute()}
sele apo_d1, {apo.id} and chain {apo_2DA.d1.chain_id} and {get_resi_selection(apo_2DA.d1.get_spans())}
sele apo_d2, {apo.id} and chain {apo_2DA.d2.chain_id} and {get_resi_selection(apo_2DA.d2.get_spans())}
sele apo_2DA, apo_d1 or apo_d2
sele holo_d1, {sholo.id} and chain {holo_2DA.d1.chain_id} and {get_resi_selection(holo_2DA.d1.get_spans())}
sele holo_d2, {sholo.id} and chain {holo_2DA.d2.chain_id} and {get_resi_selection(holo_2DA.d2.get_spans())}
sele holo_2DA, holo_d1 or holo_d2
# copy objects, so we can color them differently
copy paper_{apo.id}, {apo.id}
copy paper_{sholo.id}, {sholo.id}
sele paper_apo_d1, paper_{apo.id} and chain {apo_2DA.d1.chain_id} and {get_resi_selection(paper_apo_drm1.get_spans())}
sele paper_apo_d2, paper_{apo.id} and chain {apo_2DA.d2.chain_id} and {get_resi_selection(paper_apo_drm2.get_spans())}
sele paper_apo_2DA, paper_apo_d1 or paper_apo_d2
sele paper_holo_d1, paper_{sholo.id} and chain {holo_2DA.d1.chain_id} and {get_resi_selection(paper_holo_drm1.get_spans())}
sele paper_holo_d2, paper_{sholo.id} and chain {holo_2DA.d2.chain_id} and {get_resi_selection(paper_holo_drm2.get_spans())}
sele paper_holo_2DA, paper_holo_d1 or paper_holo_d2
color red, apo_d1
color yellow, apo_d2
color green, holo_d1
color blue, holo_d2
color salmon, paper_apo_d1
color paleyellow, paper_apo_d2
color palegreen, paper_holo_d1
color lightblue, paper_holo_d2
# example usage:
hide; show surface, apo_2DA
hide; show surface, paper_apo_2DA
hide; show surface, holo_2DA
hide; show surface, paper_holo_2DA
hide; show surface, apo_2DA or holo_2DA or paper_apo_2DA or paper_holo_2DA
"""
    print(pymol_script)