def get_structure(self):
    """Build freesasa structures for the whole complex and for each chain.

    The complex structure and its freesasa result are stored in
    ``self.complex`` / ``self.result_complex``.  One structure and one result
    per label in ``self.chains_label`` are stored in ``self.chains`` and
    ``self.result_chains``.
    """
    columns = 'name,resName,resSeq,chainID,x,y,z'

    def fill(target, rows):
        # Only the first character of the atom name is forwarded to freesasa,
        # right-aligned in a two-character field.
        for name, res_name, res_number, chain, x, y, z in rows:
            target.addAtom('{:>2}'.format(name[0]), res_name, res_number,
                           chain, x, y, z)

    # pdb_data is either a str (handed straight to freesasa) or something
    # else, in which case the atoms are read through self.sql.
    if isinstance(self.pdb_data, str):
        self.complex = freesasa.Structure(self.pdb_data)
    else:
        self.complex = freesasa.Structure()
        fill(self.complex, self.sql.get(columns))
    self.result_complex = freesasa.calc(self.complex)

    self.chains = {}
    self.result_chains = {}
    for label in self.chains_label:
        chain_structure = freesasa.Structure()
        self.chains[label] = chain_structure
        fill(chain_structure, self.sql.get(columns, chainID=label))
        self.result_chains[label] = freesasa.calc(chain_structure)
def featurize(structure: Structure) -> list[Any]:
    """Calculate 3D ML features from ``structure``.

    Returns ``[total_area, polar_apolar, protein_length, seq_length, angle]``
    where ``total_area``, ``polar_apolar`` and ``seq_length`` are lists.
    """
    # NOTE(review): the SASA part reads the module-level name ``pdbpath``
    # rather than the ``structure`` argument -- confirm this is intended.
    sasa_structure = freesasa.Structure(pdbpath)
    sasa_result = freesasa.calc(sasa_structure)
    area_by_class = freesasa.classifyResults(sasa_result, sasa_structure)

    total_area = [sasa_result.totalArea()]
    polar_apolar = [area_by_class[name] for name in area_by_class]

    # get all the residues
    residue_list = list(structure.get_residues())
    seq_length = [len(residue_list)]

    def ca(index):
        return residue_list[index]["CA"]

    # placeholder 3D features taken from CA atoms near both ends of the chain
    protein_length = ca(1) - ca(-2)
    angle = calc_dihedral(*(ca(i).get_vector() for i in (1, 2, -3, -2)))

    # create the feature vector
    return [total_area, polar_apolar, protein_length, seq_length, angle]
def get_area(this_run, basename):
    """Convert the run's initial geometry to PDB and store its total SASA.

    The xyz file at ``this_run.init_geopath`` is converted with Open Babel to
    ``<pdb_path><basename>.pdb``.  The freesasa total area of that file is
    written to ``this_run.area``.  Nothing is returned.
    """
    pdb_out = setup_paths()["pdb_path"] + basename + '.pdb'
    print('getting area')

    # xyz -> pdb conversion
    converter = openbabel.OBConversion()
    converter.SetInFormat("xyz")
    converter.SetOutFormat("pdb")
    molecule = openbabel.OBMol()
    converter.ReadFile(molecule, this_run.init_geopath)
    converter.WriteFile(molecule, pdb_out)

    # surface area measurement; hetero atoms and hydrogens are kept
    classifier = DerivedClassifierT()
    read_options = {
        'halt-at-unknown': False,
        'hetatm': True,
        'hydrogen': True,
        'join-models': False,
        'skip-unknown': False,
    }
    sasa_structure = freesasa.Structure(pdb_out, classifier=classifier,
                                        options=read_options)
    sasa_structure.setRadiiWithClassifier(classifier)
    this_run.area = freesasa.calc(sasa_structure).totalArea()
def run(self, pdb):
    """Run freesasa on provided PDB file

    Parameters
    ----------
    pdb: str
        Path to input PDB file

    Returns
    -------
    list
        One list per model in the input, holding the SASA value of each
        atom of that model.
    """
    models = freesasa.structureArray(bytes(pdb, 'utf-8'),
                                     options=self.options,
                                     classifier=self.classifier)
    per_model_areas = []
    for model in models:
        print('Computing SASA for each model/frame')
        sasa = freesasa.calc(model)
        per_model_areas.append(list(map(sasa.atomArea, range(model.nAtoms()))))
    return per_model_areas
def sa_calc(polymer_pdb, radius):
    """Compute and report the per-atom normalized SASA of a polymer.

    The mol file ``polymer_pdb`` is read with RDKit, hydrogens are added and
    the result is written to ``out_dir + NAME + '_new.pdb'``.  freesasa is run
    on that file with probe radius ``radius``.  The total area is divided by
    the atom count of the molecule as read (before hydrogens are added); the
    result is appended to ``'Average surface area.txt'`` in ``out_dir`` and
    printed.
    """
    # the mol file is read without hydrogens; add them back with coordinates
    mol_without_hs = Chem.MolFromMolFile(polymer_pdb)
    mol_with_hs = Chem.AddHs(mol_without_hs, addCoords = True)
    new_pdb_path = out_dir+NAME+'_new.pdb'
    Chem.MolToPDBFile(mol_with_hs, new_pdb_path)

    # freesasa drops hydrogens and HETATM records by default; keep them
    option_with_Hs = {
        'hetatm' : True,
        'hydrogen' : True,
        'join-models' : False,
        'skip-unknown' : False,
        'halt-at-unknown' : False
    }

    probe = freesasa.Parameters()
    probe.setProbeRadius(radius)

    sasa_structure = freesasa.Structure(new_pdb_path, options = option_with_Hs)
    decimal = round(freesasa.calc(sasa_structure, probe).totalArea(), 4)
    print (f'Total SASA is {decimal} Å^2 when probe radius is {radius} Å.')

    normalized_sa = round(decimal / mol_without_hs.GetNumAtoms(), 4)

    # append the result to the shared text report
    with open (out_dir + 'Average surface area.txt', 'a+') as report:
        report.write(f'The normalized surface area of {NAME} is ' + str(normalized_sa) + ' Å^2 with the probe size of ' + str(radius) + 'Å.\n' )
    print ('Nomalized solvent accessible surface area is '+ str(normalized_sa) + ' Å^2 with the probe size of ' + str(radius) + 'Å.\n')
def calculate_sasa(pdb_file):
    """Compute the total SASA of a PDB file using the desolvation radii table.

    Parameters
    ----------
    pdb_file:
        Path of the structure file, passed to ``parse_complex_from_file``.

    Returns
    -------
    tuple
        ``(total_area, elapsed)`` where ``elapsed`` is the wall time, in
        seconds, spent inside ``freesasa.calc`` only.

    Raises
    ------
    KeyError
        If a ``"<residue>-<atom>"`` pair is missing from
        ``DesolvationParameters().radius_per_atom``.
    """
    # Read PDB structure
    atoms, residues, chains = parse_complex_from_file(pdb_file)
    molecule = Complex(chains, atoms, structure_file_name=pdb_file)
    parameters = DesolvationParameters()

    # Lightdock structure to freesasa structure, collecting radii in the
    # same pass so both stay in atom order.
    structure = Structure()
    atom_radius = []
    for atom in molecule.atoms:
        structure.addAtom(atom.name, atom.residue_name, atom.residue_number,
                          atom.chain_id, atom.x, atom.y, atom.z)
        # CYX is looked up under the CYS entries of the radii table; the
        # atom itself is left untouched.
        residue_name = 'CYS' if atom.residue_name == 'CYX' else atom.residue_name
        atom_radius.append(parameters.radius_per_atom[residue_name + "-" + atom.name])
    structure.setRadii(atom_radius)

    start_time = timeit.default_timer()
    result = freesasa.calc(structure)
    elapsed = timeit.default_timer() - start_time
    return result.totalArea(), elapsed
def get_area_classes(file):
    """Return the first two class areas and the total SASA of a PDB file.

    The result is ``[first_class_area, second_class_area, total_area]`` where
    "first" and "second" follow the iteration order of the mapping returned
    by ``freesasa.classifyResults``.
    """
    structure = freesasa.Structure(file)
    sasa = freesasa.calc(structure)
    class_areas = list(freesasa.classifyResults(sasa, structure).values())
    first_area = class_areas[0]
    second_area = class_areas[1]
    return [first_area, second_area, sasa.totalArea()]
def execute_freesasa_api(structure):
    """
    Calls freesasa using its Python API and returns per-atom and per-residue
    accessibilities.

    Returns ``(asa_data, rsa_data)``: ``asa_data`` maps
    ``(chain, resname, resid, atname)`` to the atom area, ``rsa_data`` maps
    ``(chain, resname, resid)`` to the summed residue area divided by the
    ``rel_asa['total']`` reference value for that residue name.
    """
    try:
        from freesasa import Classifier, structureFromBioPDB, calc
    except ImportError as err:
        print(
            '[!] The binding affinity prediction tools require the \'freesasa\' Python API',
            file=sys.stderr)
        raise ImportError(err)

    asa_data = {}
    rsa_data = {}
    _rsa = rel_asa['total']

    # FREESASA_PAR overrides the naccess configuration bundled with the package
    bundled_config = pkg_resources.resource_filename('prodigy', 'naccess.config')
    config_path = os.environ.get('FREESASA_PAR', bundled_config)
    classifier = Classifier(config_path)
    pkg_resources.cleanup_resources()

    # stderr is redirected to the null device while freesasa runs
    with stdchannel_redirected(sys.stderr, os.devnull):
        try:
            struct = structureFromBioPDB(structure, classifier)
            result = calc(struct)
        except AssertionError as e:
            error_message = '\n[!] Error when running freesasa: \n[!] {}'.format(
                e)
            print(error_message)
            raise Exception(error_message)

    # walk over all atoms, recording atom areas and accumulating residue totals
    for idx in range(struct.nAtoms()):
        res_uid = (struct.chainLabel(idx), struct.residueName(idx),
                   struct.residueNumber(idx))
        at_uid = res_uid + (struct.atomName(idx),)
        asa = result.atomArea(idx)
        asa_data[at_uid] = asa
        rsa_data[res_uid] = rsa_data.get(res_uid, 0) + asa

    # convert total asa to relative asa (existing keys only, so the dict
    # size does not change while iterating)
    for res_uid, total in rsa_data.items():
        rsa_data[res_uid] = total / _rsa[res_uid[1]]

    return asa_data, rsa_data
def calculate_SAS(temp_dict, pdb_path, seq_len):
    """Add polar/apolar SASA values for ``pdb_path`` to ``temp_dict``.

    ``temp_dict`` is updated in place with the keys ``"Polar"``, ``"Apolar"``
    and ``"SASA Fraction"`` (polar plus apolar area divided by ``seq_len``).
    Nothing is returned.
    """
    structure = freesasa.Structure(str(pdb_path))
    sasa = freesasa.calc(structure)
    by_class = freesasa.classifyResults(sasa, structure)

    polar_area = by_class['Polar']
    apolar_area = by_class['Apolar']
    new_entries = {
        "Polar": polar_area,
        "Apolar": apolar_area,
        "SASA Fraction": (polar_area + apolar_area) / seq_len,
    }
    temp_dict.update(new_entries)
def calcSASA(Latm, selection):
    """Compute the solvent accessible surface (SAS) of a selection.

    A freesasa structure is built from the atoms in ``Latm`` and the area of
    the freesasa selection string ``selection`` is returned.  The selection
    name is taken as the first whitespace-separated token of ``selection``
    with its last character removed.
    """
    freesasa.setVerbosity(1)

    built = freesasa.Structure()
    for atom in Latm:
        x, y, z = atom.traj[0], atom.traj[1], atom.traj[2]
        built.addAtom(atom.ty, atom.resname, atom.resN, atom.chain, x, y, z)

    sasa = freesasa.calc(built)
    areas = freesasa.selectArea((selection, 'all, resn ala'), built, sasa)
    selection_name = selection.split()[0][:-1]
    return areas[selection_name]
def cal_sasa(prot, resilist):
    """Attach a per-residue SASA value to every entry of ``resilist``.

    Each entry must provide ``'resi_seq'`` and ``'chain'``; its ``'SASA'`` key
    is set to the area freesasa reports for that residue number and chain in
    the structure file ``prot``.  The same (mutated) list is returned.
    """
    structure = freesasa.Structure(prot)
    result = freesasa.calc(structure)
    for entry in resilist:
        residue_query = 'we, resi ' + str(entry['resi_seq']) + ' and chain ' + entry['chain']
        areas = freesasa.selectArea(('alanine, resn ala', residue_query),
                                    structure, result)
        entry['SASA'] = areas['we']
    return resilist
def run_freesasa_biopython(pdb_path):
    """Run freesasa on ``pdb_path`` and return ``(result, structure)``.

    If the module-level ``freesasa`` is ``None`` an import is attempted first;
    a failed import is reported as ``RuntimeError``.  stdout and stderr are
    silenced while freesasa runs.
    """
    global freesasa
    if freesasa is None:
        try:
            import freesasa
        except ImportError:
            raise RuntimeError("Cannot use this method. Please save the pdb file and rerun with docker")

    with silence_stdout(), silence_stderr():
        # freesasa drops hydrogens automatically when reading the file
        structure = freesasa.Structure(pdb_path)
        result = freesasa.calc(structure)
    return result, structure
def _compute_asa(df):
    """Compute solvent-accessible surface area for provided structure.

    Rows of ``df`` whose ``resname`` is ``'UNK'`` or whose ``element`` is
    ``'H'`` are skipped; all other rows are added as atoms.  Returns the
    total area reported by freesasa.
    """
    # NOTE(review): the converted object is never used below -- confirm
    # whether the call is still needed.
    bp = dt.df_to_bp(df)

    naccess = freesasa.Classifier.getStandardClassifier('naccess')
    structure = freesasa.Structure(
        classifier=naccess,
        options={'hydrogen': True, 'skip-unknown': True},
    )
    for _, atom in df.iterrows():
        if atom['resname'] == 'UNK' or atom['element'] == 'H':
            continue
        structure.addAtom(atom['name'], atom['resname'], atom['residue'],
                          atom['chain'], atom['x'], atom['y'], atom['z'])
    return freesasa.calc(structure).totalArea()
def _get_sasa(self):
    """Return ``(sasa_result, sasa_structure)`` for ``self.structure``.

    The result is computed once and cached on ``self.sasa`` /
    ``self.sasa_struct``.  The structure is written to a temporary PDB file
    that freesasa reads back; the file is removed even when writing or the
    freesasa run raises.

    Returns ``(None, None)`` when the ``freesasa`` module is unavailable
    (module-level ``freesasa`` is ``None``).
    """
    if freesasa is None:
        # call form prints the same text on Python 2 and Python 3
        print("SASA not installed! SASA will be 0")
        return None, None
    if self.sasa is None:
        pdbfd, tmp_pdb_path = tempfile.mkstemp()
        try:
            with os.fdopen(pdbfd, 'w') as tmp:
                writePDBStream(tmp, self.structure)
            with silence_stdout(), silence_stderr():
                self.sasa_struct = freesasa.Structure(tmp_pdb_path)
                self.sasa = freesasa.calc(self.sasa_struct)
        finally:
            # mkstemp leaves deletion to the caller
            os.remove(tmp_pdb_path)
    return self.sasa, self.sasa_struct
def get_attributes(self):
    """Load the PDB file and run the attribute-building steps in order.

    Reads all lines of ``self.file_path`` into ``self.data``, stores the
    freesasa result in ``self.solvent_access``, then calls ``_clean_data``,
    ``_ca_attributes``, ``_distance_to_others`` and ``_find_in_range``.
    Exceptions from any step propagate to the caller unchanged.
    """
    # read pdb file
    with open(self.file_path, "r") as handle:
        self.data = handle.readlines()

    # calculate solvent access data
    structure = fs.Structure(self.file_path)
    self.solvent_access = fs.calc(structure)

    for step in (self._clean_data, self._ca_attributes,
                 self._distance_to_others, self._find_in_range):
        step()
def run(self, residues: SetOfResidues) -> float:
    """Return the total SASA of ``residues`` as computed by freesasa.

    freesasa's BioPython binding calls ``get_atoms`` on the object it is
    given, so a ``get_atoms`` method yielding every non-hydrogen atom is
    attached to ``residues`` first.
    """
    def get_atoms(self):
        # hydrogens are skipped, otherwise freesasa somehow crashes with:
        # AssertionError: Error: Radius array is <= 0 for the residue: PHE ,atom: H
        for residue in self:
            yield from (atom for atom in residue.get_atoms()
                        if atom.element != 'H')

    # SetOfResidues is a _frozen_ dataclass, hence object.__setattr__
    object.__setattr__(residues, 'get_atoms', get_atoms.__get__(residues))

    # use freesasa to compute SASA
    sasa_structure = freesasa.structureFromBioPDB(residues)
    return freesasa.calc(sasa_structure).totalArea()
def sasa_from_file(file: Union[str, pathlib.Path]) -> Sasa:
    """Return the per-residue areas freesasa computes for a PDB file.

    The file is parsed into a ``freesasa.Structure``, ``freesasa.calc()`` is
    run on it and the ``residueAreas()`` dictionary is returned wrapped in an
    ``ObjDict``.

    Raises:
        TypeError: ``file`` is neither ``str`` nor ``pathlib.Path``.
        FileNotFoundError: the path does not exist.
    """
    if not isinstance(file, (str, pathlib.Path)):
        raise TypeError(
            "Invalid argument type. File should be 'str' or pathlib.Path")

    path = pathlib.Path(file)
    if not path.exists():
        raise FileNotFoundError(
            f"File {path.absolute().as_posix()} does not exist.")

    structure = freesasa.Structure(path.absolute().as_posix())
    return ObjDict(freesasa.calc(structure).residueAreas())
def getAtomSASA(structure, classifier=None, probe_radius=1.4, mi=0, **kwargs):
    """Store the per-atom SASA in ``atom.xtra["sasa"]`` for model ``mi``.

    When ``classifier`` is ``None`` a ``Radius(**kwargs)`` classifier is
    created.  freesasa is run with ``probe_radius`` and every atom area is
    written onto the matching atom of ``structure[mi]``.  Nothing is returned.
    """
    if classifier is None:
        # initialize new classifier
        classifier = Radius(**kwargs)

    fs_structure = getFreeSASAStructureFromModel(structure, classifier=classifier)
    params = freesasa.Parameters({"probe-radius": probe_radius})
    areas = freesasa.calc(fs_structure, params)

    # NOTE(review): nAtoms()/atomName() are called on ``structure`` while the
    # residue number and chain come from the freesasa structure -- confirm
    # ``structure`` really provides both methods.
    for idx in range(structure.nAtoms()):
        atom_area = areas.atomArea(idx)
        res_number = fs_structure.residueNumber(idx).strip()
        chain_id = fs_structure.chainLabel(idx).strip()

        # a trailing non-digit in the residue number is the insertion code
        last_char = res_number[-1]
        if last_char.isdigit():
            ins_code = " "
        else:
            ins_code = last_char
            res_number = res_number[:-1]

        atom_name = structure.atomName(idx).strip()
        residue_key = (' ', int(res_number), ins_code)
        structure[mi][chain_id][residue_key][atom_name].xtra["sasa"] = atom_area
def _get_scores(self, df, pdb_id, pdb_chain):
    """Return one freesasa-derived score per row of ``df`` for a PDB chain.

    Returns ``None`` when no SIFTS alignment is available.  Otherwise ``df``
    is left-merged with the alignment and the attribute ``self.metric`` of
    each aligned residue's freesasa area record is stored; rows without a
    PDB position, or whose residue is missing from the freesasa result,
    stay ``nan``.

    Raises:
        LookupError: the PDB file is absent and downloading is disabled.
    """
    sifts = get_sifts_alignment_for_chain(pdb_id, pdb_chain,
                                          self.sifts_directory,
                                          self.download_sifts)
    if sifts is None:
        return None

    merged = pd.merge(df, sifts, left_on='residue',
                      right_on='uniprot position', how='left')

    pdb_file_path = os.path.join(self.pdb_directory, pdb_id + '.pdb')
    if not os.path.isfile(pdb_file_path):
        # PDB file not already downloaded.
        if not self.download_pdb_file:
            raise LookupError(
                "PDB file {} is not in the pdb_directory {}".format(
                    pdb_id, self.pdb_directory))
        download_pdb_file(pdb_id, self.pdb_directory)

    sasa = freesasa.calc(freesasa.Structure(pdb_file_path),
                         self.freesasa_parameters)
    chain_results = sasa.residueAreas()[pdb_chain]

    scores = np.full(len(merged), np.nan)
    for row, residue in enumerate(merged['pdb position']):
        if np.isnan(residue):
            continue
        try:
            scores[row] = getattr(chain_results[str(int(residue))], self.metric)
        except KeyError:
            # residue not present in the freesasa result: leave nan
            pass
    return scores
def handle(self, *args, **options):
    """Compute helix-axis angles, ASA and HSE for one structure and save them.

    Reads ``options['pdb_code']``, loads the matching structure, keeps only
    the preferred chain and the residues carrying a ``1x``..``7x`` generic
    number, and writes several outputs through ``self.save_pseudo``,
    ``self.write_cgo_arrow_pml`` and ``self.save_pdb``.  The per-residue
    values (angles, ASA, HSE) are stored in the atoms' B-factor field
    before each PDB file is saved.

    Side effects: changes the working directory to ``pymol_output``.
    """
    # grab PDB
    pdb_code = options.get('pdb_code', None).upper()
    reference = Structure.objects.get(pdb_code__index=pdb_code) #.prefetch_related('pdb_data')
    preferred_chain = reference.preferred_chain.split(',')[0]
    # read pdb structure (from RCSB) using Biopython
    structure = self.load_pdb_var(pdb_code,reference.pdb_data.pdb)
    # get preferred chain for PDB-code
    # grab residues with the generic numbering for this structure
    db_reslist = list(Residue.objects.exclude(generic_number__isnull=True).filter(protein_conformation__protein=reference.protein_conformation.protein).prefetch_related('generic_number'))
    #######################################################################
    ############################# filter pdb #############################
    os.chdir("pymol_output")
    # db_tmlist: sequence numbers per helix (index = helix number - 1)
    # db_set:    Biopython-style residue ids of all helix residues
    # db_set_p:  db_set plus the residues directly neighbouring a helix
    db_tmlist = [[] for i in range(7)]
    db_set = set()
    db_set_p = set()
    oldr = False
    # NOTE(review): ``lastin`` is first assigned inside the helix branch; if
    # the first residue takes the else branch it is read before assignment
    # -- confirm against the original indentation.
    for r in db_reslist:
        if r.generic_number.label[:2] in ["1x","2x","3x","4x","5x","6x","7x"]:
            db_tmlist[int(r.generic_number.label[0])-1].append(r.sequence_number)
            db_set.add((' ',r.sequence_number,' '))
            db_set_p.add((' ',r.sequence_number,' '))
            lastin = True
            if oldr:
                db_set_p.add((' ',oldr.sequence_number,' '))
                oldr = False
        else:
            oldr = r
            if lastin:
                db_set_p.add((' ',oldr.sequence_number,' '))
                lastin=False
    # Detach every child whose id is not listed for its level; slist holds
    # one collection of allowed ids per nesting level below ``entity``.
    def recurse(entity,slist):
        for subenty in entity.get_list():
            if not subenty.id in slist[0]:
                entity.detach_child(subenty.id)
            elif slist[1:]:
                recurse(subenty, slist[1:])
    recurse(structure,[[0], preferred_chain])
    # copy taken before the residue filter, used for the HSE calculation below
    hse_struct = deepcopy(structure)
    recurse(structure, [[0], preferred_chain, db_set])
    pchain = structure[0][preferred_chain]
    #######################################################################
    ############### Calculate the axes through the helices ################
    #######################################################################
    N = 3
    # CA and CB coordinates per helix; residues without CB get a row of None,
    # which the float conversion is presumably expected to turn into nan
    hres_list = [np.asarray([pchain[r]["CA"].get_coord() for r in sl], dtype=float) for sl in db_tmlist]
    h_cb_list = [np.asarray([pchain[r]["CB"].get_coord() if "CB" in pchain[r] else np.array([None,None,None]) for r in sl], dtype=float) for sl in db_tmlist]
    # fast and fancy way to take the average of N consecutive elements
    hres_three = np.asarray([sum([h[i:-(len(h) % N) or None:N] for i in range(N)])/N for h in hres_list])
    helices_mn = np.asarray([np.mean(h, axis=0) for h in hres_three ])
    self.save_pseudo(hres_three, pdb_code+"helper")
    #######################################################################
    ################################# PCA #################################
    #######################################################################
    # Fit ``pca`` on ``h`` and return two points at -20/+20 along the first
    # component; the order of the two points depends on the sign of the
    # first transformed value and on ``r``.
    def pca_line(pca,h, r=0):
        if ((not r) if pca.fit_transform(h)[0][0] < 0 else r):
            return pca.inverse_transform(np.asarray([[-20,0,0],[20,0,0]]))
        else:return pca.inverse_transform(np.asarray([[20,0,0],[-20,0,0]]))
    helix_pcas = [PCA() for i in range(7)]
    pos_list = np.asarray([pca_line(helix_pcas[i], h,i%2) for i,h in enumerate(hres_three)])
    self.write_cgo_arrow_pml(pdb_code, "pca",pos_list)
    pos_list = np.mean(pos_list,axis=0)
    self.write_cgo_arrow_pml(pdb_code, "pca_mean",[pos_list])
    pca = PCA()
    pos_list = pca_line(pca, np.vstack(hres_three))
    self.write_cgo_arrow_pml(pdb_code, "pca_all",[pos_list])
    pos_list = np.asarray([pca_line(PCA(), h[:len(h)//2:(-(i%2) or 1)]) for i,h in enumerate(hres_three)])
    pos_list = pos_list - (np.mean(pos_list,axis=1)-helices_mn).reshape(-1,1,3)
    self.write_cgo_arrow_pml(pdb_code, "pca_extra",pos_list)
    self.write_cgo_arrow_pml(pdb_code, "pca_extra_mean",[np.mean(pos_list,axis=0)])
    pca_extra = PCA()
    pos_list = pca_line(pca_extra, np.vstack(pos_list))
    self.write_cgo_arrow_pml(pdb_code, "pca_extra_pca",[pos_list])
    #######################################################################
    ################################ Angles ###############################
    #######################################################################
    # Row-wise angle in degrees between ``ba`` (= -b with its first column
    # zeroed) and ``bc`` (= c - b).
    def calc_angle(b,c):
        ba = -b
        bc = c + ba
        ba[:,0] = 0
        return np.degrees(np.arccos(inner1d(ba, bc) / (np.linalg.norm(ba,axis=1) * np.linalg.norm(bc,axis=1))))
    # Angle for helix ``i`` using only residues whose CB coordinate is finite.
    def ca_cb_calc(i,pca):
        fin = np.isfinite(h_cb_list[i][:,0])
        return calc_angle(pca.transform(hres_list[i][fin]),pca.transform(h_cb_list[i][fin]))
    # Angle for helix ``i`` between a locally averaged CA trace (three-point
    # window, end points repeated) and the CA positions themselves.
    def axes_calc(i,pca_list,pca):
        p = pca_list[i]
        h = hres_list[i]
        a = (np.roll(np.vstack((h,h[0])),1,axis=0)[:-1] + h + np.roll(np.vstack((h,h[-1])),-1,axis=0)[:-1])/3
        b = p.transform(h)
        b[:,1:] = p.transform(a)[:,1:]
        b = p.inverse_transform(b)
        return calc_angle(pca.transform(b),pca.transform(h))
    # Write one value per residue of the preferred chain into the B-factor
    # of all its atoms; pairing relies on zip order.
    def set_bfactor(structure,angles):
        for r,an in zip(structure[0][preferred_chain].get_list(),angles):
            for a in r:
                a.set_bfactor(an)
    # ``pca`` is the instance fitted on all helices above ("pca_all")
    centerpca = pca
    ########################### Axis to CA to CB ##########################
    tv = np.isfinite(np.concatenate(h_cb_list)[:,0])
    # residues without a finite CB keep the value -1
    angle = np.full_like(tv,-1,dtype=float)
    angle[tv] = np.concatenate([ca_cb_calc(i,centerpca) for i in range(TMNUM)])
    set_bfactor(structure,angle)
    self.save_pdb(structure, pdb_code+'angle_colored_ca_cb.pdb')
    ######################### Axis to Axis to CA ##########################
    angle2 = np.concatenate([axes_calc(i,helix_pcas,centerpca) for i in range(TMNUM)])
    set_bfactor(structure,angle2)
    self.save_pdb(structure, pdb_code+'angle_colored_axes.pdb')
    ########################### HSE and ASA ###############################
    # res, dic = freesasa.calcBioPDB(orig_structure)
    # freesasa reads back the PDB file that was just written
    pdbstruct = freesasa.Structure(pdb_code+'angle_colored_axes.pdb')
    res = freesasa.calc(pdbstruct)
    # print(res.nAtoms())
    # [print(res.atomArea(a)) for a in range(res.nAtoms())]
    # print()
    # print(sum([res.atomArea(a) for a in range(res.nAtoms())]))
    # print(len(list(orig_structure[0].get_atoms())))
    # print(res.nAtoms())
    # sum atom areas per residue; a new residue starts whenever the residue
    # number differs from the previous atom's
    asa_list = []
    oldnum = -1
    for i in range(res.nAtoms()):
        resnum = pdbstruct.residueNumber(i)
        if resnum == oldnum:
            asa_list[-1] += res.atomArea(i)
        else:
            asa_list.append(res.atomArea(i))
            oldnum = resnum
    set_bfactor(structure,asa_list)
    self.save_pdb(structure, pdb_code+'asa_colored.pdb')
    # Calculate HSEalpha
    model = hse_struct[0]
    exp_ca = pdb.HSExposure.HSExposureCA(model)
    print(len(exp_ca))
    # list comprehension used only for its side effect of setting B-factors
    [[a.set_bfactor(x[1][1]) for a in x[0]] for x in exp_ca]
    recurse(hse_struct, [[0], preferred_chain, db_set])
    r = [x[0] for x in exp_ca]
    #x = model["A"].get_list()
    x = pchain.get_list()
    # residues in pchain that are not among the HSE entries get -1
    # (the loop variable reuses, and thereby rebinds, the name ``r``)
    for r in (set(x) - set(r)):
        for a in r:
            a.set_bfactor(-1)
    exp_ca = [a["CA"].get_bfactor() for a in hse_struct[0][preferred_chain].get_list()]
    # print(set(x) - set(r))
    # print(len(set(x) - set(r)))
    # print(db_set_p - db_set)
    self.save_pdb(hse_struct, pdb_code+'hsea_colored.pdb')
def surface_list(file1):
    """Return the surface-exposed residue ids of the two global chains.

    freesasa is run on ``file1``.  For every residue of the module-level
    ``chain_A`` (selected as chain ``H``) and ``chain_B`` (selected as chain
    ``L``) the selected area is divided by the residue type's entry in the
    reference table below; residues whose ratio, rounded to three decimals,
    exceeds 0.05 are reported.  Residues that raise any exception along the
    way are skipped silently.

    Returns ``(ids_for_chain_A, ids_for_chain_B)``.
    """
    maximum_area = {
        'ALA': 120.56, 'CYS': 143.79, 'ASP': 157.04, 'GLU': 188.42,
        'PHE': 227.46, 'GLY': 89.41, 'HIS': 200.14, 'ILE': 96.42,
        'LYS': 213.74, 'LEU': 206.32, 'MET': 216.63, 'ASN': 149.85,
        'PRO': 155.07, 'GLN': 186.83, 'ARG': 229.51, 'SER': 128.27,
        'THR': 138.58, 'VAL': 169.82, 'TRP': 269.35, 'TYR': 241.54
    }
    global chain_A
    global chain_B

    structure = freesasa.Structure(file1)
    result = freesasa.calc(structure)

    def exposed_ids(chain, selector_prefix):
        exposed = []
        for residue in chain.get_residues():
            try:
                res_id = residue["CA"].get_full_id()[3][1]
                select_word = str(res_id) + ", " + selector_prefix + str(res_id)
                selections = freesasa.selectArea((select_word, ), structure,
                                                 result)
                for key in selections:
                    resname = chain[residue.get_full_id()[3][1]].get_resname()
                    ratio = float('%.3f' % (selections[key] / maximum_area[resname]))
                    if ratio > 0.05:
                        exposed.append(res_id)
            except Exception:
                # best effort: a residue that cannot be evaluated is ignored
                pass
        return exposed

    surface_list_a1 = exposed_ids(chain_A, "chain H and resi ")
    surface_list_b1 = exposed_ids(chain_B, "chain L and resi ")
    return surface_list_a1, surface_list_b1
def sa_conformers(file_1, func_1, file_2, func_2, units, radius):
    """Build an AB polymer, embed conformers and report their average SASA.

    Parameters
    ----------
    file_1, file_2:
        ``.mol`` files of the two building blocks.
    func_1, func_2:
        Functional groups passed to ``stk.StructUnit2``, e.g. ``['diol']``
        and ``['dibromine']``/``['difluorene']`` or ``['bromine']`` and
        ``['bromine']``/``['iodine']``.
    units:
        Number of repeat units passed to ``stk.Linear`` as ``n``.
    radius:
        Probe radius handed to freesasa.

    Side effects: writes the polymer and conformer files, appends one line to
    ``'Average surface area of conformers.txt'`` in the hard-coded output
    directory, writes a per-conformer CSV table into the conformer directory
    and prints progress.  Nothing is returned.
    """
    # turn off cache
    stk.OPTIONS['cache'] = False
    # number of conformers
    N = 10

    name_1 = file_1.replace('.mol', '')
    unit_1 = stk.StructUnit2(file_1, func_1)
    name_2 = file_2.replace('.mol', '')
    unit_2 = stk.StructUnit2(file_2, func_2)

    # make polymer
    NAME = name_1+'_'+name_2+'_AB_poly'
    print(f'Creating polymer: {NAME}')
    polymer = stk.Polymer([unit_1, unit_2],
                          stk.Linear('AB', [0, 0], n=units, ends='h'))
    # write unoptimized structure
    polymer.write(NAME+'.mol')
    mol_polymer = rdkit.MolFromMolFile(NAME + '.mol')
    print(f'Optimizing polymer {NAME} and saving {N} conformers')

    # clean molecule with ETKDG
    embedder = stk.UFF(use_cache=False)
    embedder.optimize(polymer, conformer=-1)
    # write optimized polymer to json
    polymer.dump(NAME+'_opt.json')
    polymer.write(NAME+'_opt.mol')

    # make N conformers of the polymer molecule
    etkdg = rdkit.ETKDGv2()
    etkdg.randomSeed = 1000
    etkdg.verbose = True
    etkdg.maxIterations = 200000
    cids = rdkit.EmbedMultipleConfs(
        mol=polymer.mol,
        numConfs=N,
        params=etkdg
    )
    print(f'Made {len(cids)} conformers...')
    print(f'Warning! I have not implemented an optimization of the ETKDG cleaned polymers!')

    # iterate over conformers and save structure
    file_dir = '/home/fanyuzhao/Monomers/OH+F/dimer/conformers/'
    new_dir = file_dir+NAME+'_'+str(units)+'_'+str(radius)+'/'
    for cid in cids:
        # build directories
        if not os.path.exists(new_dir):
            os.makedirs(new_dir)
        # write optimized polymer to mol
        polymer.write(new_dir+NAME+'_'+str(cid)+'_opt.mol', conformer=cid)
        # write optimized polymer to pdb
        polymer.write(new_dir+NAME+'_'+str(cid)+'_opt.pdb', conformer=cid)
    print(f'Done! {N} ETKDG conformers of polymer written to {NAME}_{N}_opt.mol/pdb')

    # pdb file from stk can not be read in freesasa
    # save the new pdb file in rdkit from mol files
    for item in os.listdir(new_dir):
        if item.endswith('.mol'):
            file_pdb = item.replace('.mol', '')
            a = rdkit.MolFromMolFile(os.path.join(new_dir, item))
            # hydrogens are removed when converting the file in rdkit
            b = rdkit.AddHs(a, addCoords = True)
            rdkit.MolToPDBFile(b, new_dir + file_pdb + '_new.pdb')

    # calculate solvent accessible surface area(probe radius = 1.4Å and 3.6Å)
    # hydrogens are removed in the default option
    # hetatm are ignored in the default option
    options_with_Hs = {
        'hetatm' : True,
        'hydrogen' : True,
        'join-models' : False,
        'skip-unknown' : False,
        'halt-at-unknown' : False
    }
    sa_list = []
    pdb_list = []
    # loop all new pdb files
    for pdb in os.listdir(new_dir):
        if pdb.endswith("_new.pdb"):
            # use freesasa to calculate SASA
            para = freesasa.Parameters()
            freesasa.Parameters.setProbeRadius(para, radius)
            free_struct = freesasa.Structure(os.path.join(new_dir, pdb),
                                             options = options_with_Hs)
            free_calc = freesasa.calc(free_struct, para)
            total = free_calc.totalArea()
            # keep 4 decimals
            decimal = round(total, 4)
            sa_list.append(decimal)
            name_pdb = pdb.replace('.pdb', '')
            pdb_list.append(name_pdb)

    # average SASA over the conformers, then normalize by the atom count of
    # the unoptimized polymer mol file
    sa_average = round(sum(sa_list) / len(sa_list), 4)
    atom_number = mol_polymer.GetNumAtoms()
    normalized_sa = round(sa_average / atom_number, 4)
    with open (file_dir + 'Average surface area of conformers.txt', 'a+') as Asa:
        Asa.write(f'The normalized surface area of {NAME}_{units} is ' + str(normalized_sa) + ' Å^2 with the probe size of ' + str(radius) + f'Å and chain length of {units}.\n')
    print ('The avarage surface area of the conformers is ' + str(sa_average) + ' Å^2 with the probe size of ' + str(radius) + 'Å.')

    # save data to a csv table
    # save pdb file and surface area to a directory
    dic = {p: s for p, s in zip(pdb_list, sa_list)}
    download_dict = new_dir + 'Solvent accessible surface area of ' + NAME +'.csv'
    # the context manager guarantees the table is flushed and closed
    with open(download_dict, 'w') as csv:
        columnTitleRow = "Polymer_name, SASA\n"
        csv.write(columnTitleRow)
        for Polymer_name, SASA in dic.items():
            row = Polymer_name + "," + str(SASA) + "\n"
            csv.write(row)
    print ('Nomalized solvent accessible surface area is '+ str(normalized_sa) + ' Å^2 with the probe size of ' + str(radius) + 'Å.')
def __init__(self, comb, pdb_acc_code, chain, **kwargs):
    """Parse a PDB entry and precompute contact, SASA and DSSP data.

    :param comb: instance of cls Comb with attributes pdbchain_dict,
        ifg_selection_info (this method also reads input_dir_pdb,
        input_dir_dssp, path_to_reduce and reduce from it)
    :param pdb_acc_code: str, 4 character pdb accession code
    :param chain: chain identifier, stored as ``self.pdb_chain``
    :param kwargs: optional ``path_to_pdb`` and ``path_to_dssp``

    The PDB file is looked up in this order: a file in ``comb.input_dir_pdb``
    whose name contains the accession code, then ``kwargs['path_to_pdb']``,
    and finally a download that is piped through the external Reduce program.
    """
    #search for acc code in input_dir_pdb from comb object.
    # NOTE(review): assert is stripped under ``python -O``; this check then
    # disappears.
    assert isinstance(pdb_acc_code, str), 'PDB accession code needs to be a string'
    # list of matching file names; replaced by the first name (a str) below
    # when non-empty, and stays an empty list otherwise
    pdb_file = [
        file.name for file in os.scandir(comb.input_dir_pdb)
        if pdb_acc_code in file.name
    ]
    try:
        if pdb_file:
            pdb_file = pdb_file[0]
            self.prody_pdb = pr.parsePDB(comb.input_dir_pdb + pdb_file,
                                         altloc='A',
                                         model=1)
        elif 'path_to_pdb' in kwargs:
            self.prody_pdb = pr.parsePDB(kwargs.get('path_to_pdb'),
                                         altloc='A',
                                         model=1)
        else:
            # NEED TO UPDATE: note if going to fetch pdb, it should be sent through Reduce first...
            # best-effort directory creation; any error (including "already
            # exists" on the first mkdir, which skips the second) is ignored
            try:
                os.mkdir(comb.input_dir_pdb + 'raw')
                os.mkdir(comb.input_dir_pdb + 'reduce')
            except:
                pass
            pr.fetchPDB(pdb_acc_code,
                        compressed=False,
                        folder=comb.input_dir_pdb + 'raw')
            # NOTE(review): the command line is built by string concatenation
            # and run through the shell -- unsafe if any component is
            # untrusted.
            os.system(comb.path_to_reduce + comb.reduce +
                      ' -FLIP -Quiet -DB ' + comb.path_to_reduce +
                      'reduce_wwPDB_het_dict.txt ' + comb.input_dir_pdb +
                      'raw/' + pdb_acc_code.lower() + '.pdb > ' +
                      comb.input_dir_pdb + 'reduce/' +
                      pdb_acc_code.lower() + 'H.pdb')
            self.prody_pdb = pr.parsePDB(comb.input_dir_pdb + 'reduce/' +
                                         pdb_acc_code.lower() + 'H.pdb',
                                         altloc='A',
                                         model=1)
    except NameError:
        raise NameError(
            'ParsePDB instance needs a pdb file path or a valid pdb accession code.'
        )
    self.pdb_acc_code = pdb_acc_code.lower()
    self.pdb_chain = chain
    # Full processing only when selecting 'icode _' keeps every atom and the
    # requested chain contains protein atoms.
    if len(self.prody_pdb) == len(self.prody_pdb.select('icode _')) \
            and self.prody_pdb.select('protein and chain ' + self.pdb_chain) is not None:
        self.contacts = pr.Contacts(self.prody_pdb)
        self.set_bonds()
        # freesasa reads the same file that was parsed above
        if pdb_file:
            self.fs_struct = freesasa.Structure(comb.input_dir_pdb +
                                                pdb_file)
        elif 'path_to_pdb' in kwargs:
            self.fs_struct = freesasa.Structure(kwargs.get('path_to_pdb'))
        else:
            path = comb.input_dir_pdb + 'reduce/'
            self.fs_struct = freesasa.Structure(path + next(
                file.name for file in os.scandir(path)
                if self.pdb_acc_code in file.name))
        self.fs_result = freesasa.calc(self.fs_struct)
        self.fs_result_cb_3A = self.freesasa_cb(probe_radius=3)
        self.fs_result_cb_4A = self.freesasa_cb(probe_radius=4)
        self.fs_result_cb_5A = self.freesasa_cb(probe_radius=5)
        self.prody_pdb_bb_cb_atom_ind = self.prody_pdb.select(
            'protein and (backbone or name CB) '
            'and not element H D').getIndices()
        # DSSP: reuse an existing output file, else an explicit path, else
        # run DSSP on the PDB file and parse what it produced
        dssp_file = [
            file.name for file in os.scandir(comb.input_dir_dssp)
            if pdb_acc_code in file.name
        ]
        if dssp_file:
            dssp_file = dssp_file[0]
            self.dssp = pr.parseDSSP(comb.input_dir_dssp + dssp_file,
                                     self.prody_pdb)
        elif 'path_to_dssp' in kwargs:
            self.dssp = pr.parseDSSP(kwargs.get('path_to_dssp'),
                                     self.prody_pdb)
        else:
            if pdb_file:
                pr.execDSSP(comb.input_dir_pdb + pdb_file,
                            outputdir=comb.input_dir_dssp)
            elif 'path_to_pdb' in kwargs:
                pr.execDSSP(kwargs.get('path_to_pdb'),
                            outputdir=comb.input_dir_dssp)
            else:
                path = comb.input_dir_pdb + 'reduce/' + next(
                    file.name
                    for file in os.scandir(comb.input_dir_pdb + 'reduce')
                    if pdb_acc_code in file.name)
                pr.execDSSP(path, outputdir=comb.input_dir_dssp)
            self.dssp = pr.parseDSSP(
                comb.input_dir_dssp +
                next(file.name for file in os.scandir(comb.input_dir_dssp)
                     if pdb_acc_code in file.name), self.prody_pdb)
        self.possible_ifgs = self.find_possible_ifgs(comb)
    else:
        # structure rejected: contacts, freesasa and dssp attributes are
        # not set on this instance
        self.possible_ifgs = None
    # valence and hydrogen bond data for vandermers and iFGs of ParsedPDB protein instance
    # iFG specific:
    self._ifg_pdb_info = []
    self._ifg_atom_density = []
    self._ifg_contact_water = []
    self._ifg_contact_ligand = []
    self._ifg_contact_metal = []
    # vdM specific:
    self._vdm_pdb_info = []
    self._vdm_sasa_info = []
    self._ifg_contact_vdm = []
    self._ifg_hbond_vdm = []
    self._ifg_hbond_water = []
    self._ifg_hbond_ligand = []
    self._ifg_ca_hbond_vdm = []
def openfile():
    """Ask for a PDB file and fill the module-level analysis globals.

    Opens a Tk file dialog, parses the chosen file with Biopython, prints
    sequence statistics, and stores sliding-window values in globals:
    ``sec`` (first secondary-structure fraction per 10-residue window),
    ``hydro``/``hydroph``/``b`` (hydropathy scale), ``flex``/``flexi``/``c``
    (flexibility), ``acc``/``access``/``logacc`` (freesasa selection areas).
    Nothing is returned.
    """
    global prob, probab, te
    global my_seq
    global anti
    global structure, structure_id, filename
    global antigenicity, hydro, flex, sec
    global m, a, c, b, length, j, k
    global hydroph, flexi, access
    anti = []
    sec = []
    probab = []
    from tkinter import filedialog
    root = Tk()
    root.filename = filedialog.askopenfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("pdb files", "*.pdb"), ("pdb files", "*.pdb")))
    filename = root.filename
    print(filename)
    # the structure id is fixed regardless of which file was chosen
    structure_id = "1e6j"
    structure = PDBParser().get_structure(structure_id, root.filename)
    ppb = PPBuilder()
    # my_seq is overwritten on every iteration, so only the last peptide's
    # sequence is used below
    for pp in ppb.build_peptides(structure):
        my_seq = pp.get_sequence()  # type: Seq
        print(my_seq)
    for model in structure:
        for chain in model:
            print(chain)
    sequence = list(my_seq)
    m = ''.join(sequence)
    print(m)
    length = len(m)  # type: int
    print("Sequence consist of", length, "Amino Acids")
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    analysed_seq = ProteinAnalysis(m)
    print("Molecular weight = ", analysed_seq.molecular_weight())
    print("Amino Acid Count = ", analysed_seq.count_amino_acids())
    print("Secondary structure fraction =",
          analysed_seq.secondary_structure_fraction())
    # per-residue scale passed to protein_scale with window 10
    kd = {
        'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
        'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
        'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
        'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2
    }
    # the same two profiles are computed three times under different names
    c = list(analysed_seq.flexibility())
    b = list(analysed_seq.protein_scale(kd, 10, 1.0))
    hydro = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flex = list(analysed_seq.flexibility())
    hydroph = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flexi = list(analysed_seq.flexibility())
    # slide a 10-residue window m[j+1:k+1] along the sequence
    i = 1
    j = -1  # type: int
    k = 9
    while i <= (length - 10):
        print("Sequence is = ", m[j + 1:k + 1])
        print("Flexibility value = ", c[j + 1])
        print("Hydrophilicity value = ", b[j + 1])
        ana_seq = ''.join(m[j + 1:k + 1])
        analyze_seq = ProteinAnalysis(ana_seq)
        # For Secondary structure Analysis
        print("Secondary structure fraction =",
              analyze_seq.secondary_structure_fraction())
        a = list(analyze_seq.secondary_structure_fraction())
        a = a[0]
        sec.append(a)
        i += 1
        j += 1
        k += 1
    f = length
    r = 1
    y = 10
    global acc, logacc
    acc = []
    # one freesasa selection per window "resi r-y"
    # NOTE(review): the structure is re-read from the hard-coded file
    # "1e6j.pdb" on every iteration instead of the file chosen in the
    # dialog -- confirm this is intended.
    for i in range(0, f):
        str1 = "accessibility, resi "
        str2 = str(r) + "-" + str(y)
        saving = str1 + str2
        print(saving)
        r = r + 1
        y = y + 1
        structure = freesasa.Structure("1e6j.pdb")
        resulta = freesasa.calc(structure)
        area_classes = freesasa.classifyResults(resulta, structure)
        print("Total : %.2f A2" % resulta.totalArea())
        for key in area_classes:
            print(key, ": %.2f A2" % area_classes[key])
        resulta = freesasa.calc(
            structure,
            freesasa.Parameters({
                'algorithm': freesasa.LeeRichards,
                'n-slices': 10
            }))
        selections = freesasa.selectArea(('alanine, resn ala', saving),
                                         structure, resulta)
        # every selection area is appended, i.e. one value per selection
        # name per window
        for key in selections:
            print(key, ": %.2f A2" % selections[key])
            a = selections[key]
            acc.append(a)
    # keep every second collected value
    l = acc[0::2]
    access = l
    print(acc)
    print(l)
    # base-10 logarithm of each kept value; math.log raises on values <= 0
    logacc = [math.log(y, 10) for y in l]
    print(logacc)
def _get_item_src(self, decoy):
    """Build the point-cloud array and GDT_TS label for one decoy.

    decoy: str, path to the decoy PDB file. A ``list.dat`` file is read from
    the same directory to look up the decoy's GDT_TS score.

    Returns a tuple ``(pc, gdt_ts)``: ``pc`` is an array of shape
    ``(self.npoints, self.num_channel())`` pre-filled with ``-inf`` whose
    first rows hold the parsed residues, and ``gdt_ts`` is the score found
    in ``list.dat`` (0.0 when no line matches).

    Raises:
        ValueError: if the file contains no ATOM record.
    """
    atom_to_num = {
        "C": 1,
        "N": 2,
        "O": 3,
        "S": 4
    }
    residues = []
    atom_positions = self.create_atom_positions()
    residue = self.build_residue()

    structure = fs.Structure(decoy)
    solvent_access = fs.calc(structure)

    with open(decoy, "r") as f:
        # Skip everything before the first ATOM record. The raw line is
        # tested so that end-of-file ('') can be told apart from a blank
        # line ('\n'); otherwise a file without ATOM records loops forever.
        raw = f.readline()
        while raw and not raw.startswith("ATOM"):
            raw = f.readline()
        if not raw:
            raise ValueError(
                "no ATOM record found in {}".format(decoy))
        line = raw.rstrip()
        cur_resi = int(line[22:26])
        # PDB file standard format
        # COLUMNS        DATA TYPE       FIELD
        # -------------------------------------------
        #  1 -  6        Record name     "ATOM  "
        #  7 - 11        Integer         Atom serial #
        # 13 - 16        Atom            Atom name
        # 17             Character       Alternate location
        # 18 - 20        Residue name    resName
        # 22             Character       chainID
        # 23 - 26        Integer         resSeq
        # 27             AChar           Code for insertion of residues
        # 31 - 38        Real(8.3)       x
        # 39 - 46        Real(8.3)       y
        # 47 - 54        Real(8.3)       z
        # 55 - 60        Real(6.2)       occupancy
        # 61 - 66        Real(6.2)       tempFactor
        # 77 - 78        LString(2)      element
        # 79 - 80        LString(2)      Charge on the atom
        while line:
            if line.startswith("TER"):
                break
            if not line.startswith("ATOM"):
                line = f.readline().rstrip()
                continue
            # ignore hydrogens (last character of the stripped record)
            atom_type = line[-1]
            if atom_type == "H":
                line = f.readline().rstrip()
                continue
            resi_num = int(line[22:26])
            if resi_num > cur_resi:
                # A higher residue number closes the residue being built.
                # NOTE(review): the residue still being built when the loop
                # ends is never appended -- confirm this is intended.
                residues.append(residue)
                if len(residues) == 400:
                    break
                residue = self.build_residue()
                cur_resi = resi_num
            residue = self._put_atom_src(
                line.rstrip(), residue, solvent_access, atom_positions,
                atom_to_num)
            line = f.readline().rstrip()

    # normalize residues: subtract the mean of columns 1-3 from the x/y/z
    # columns of every atom slot
    pc = np.ones((self.npoints, self.num_channel())) * float("-inf")
    residues = np.array(residues)
    logging.debug("decoy shape: {}".format(residues.shape))
    x_mean = np.mean(residues[:, 1])
    y_mean = np.mean(residues[:, 2])
    z_mean = np.mean(residues[:, 3])
    for i in range(self.num_channel() // self.ATTRIBUTES_EACH_ATOM):
        residues[:, self.ATTRIBUTES_EACH_ATOM*i+1] -= x_mean
        residues[:, self.ATTRIBUTES_EACH_ATOM*i+2] -= y_mean
        residues[:, self.ATTRIBUTES_EACH_ATOM*i+3] -= z_mean
    pc[0:residues.shape[0], :] = residues

    # Look up the score on the first list.dat line that starts with the
    # decoy's file name.
    target_path = os.path.dirname(decoy)
    decoy_name = os.path.basename(decoy)
    gdt_ts = 0.0
    with open(os.path.join(target_path, "list.dat"), "r") as lst:
        for info in lst:
            if info.startswith(decoy_name):
                gdt_ts = float(
                    info.split()[CASPDataset.list_dat["gdt_ts"]])
                break
    return pc, gdt_ts
def get_surface_resids(structure,
                       cutoff=15,
                       config_path=os.environ.get('FREESASA_CONFIG')):
    """
    Calls freesasa using its Python API and returns the surface residues.

    Per-atom areas are summed per residue, split into main chain (atoms
    named C, N, O) and side chain (every other atom), and converted to
    percentages of the reference values in ``rel_asa``.

    structure: Bio.PDB structure handed to ``structureFromBioPDB``.
    cutoff: minimum relative accessibility (in percent) of either the main
        chain or the side chain for a residue to count as surface.
    config_path: path passed to the freesasa ``Classifier``.

    Returns the list of residue numbers at or above the cutoff. Only
    residues with at least one main-chain atom are considered.

    Raises ImportError when the freesasa Python API is not installed.
    """
    try:
        from freesasa import Classifier, structureFromBioPDB, calc
    except ImportError as err:
        print(
            '[!] The binding affinity prediction tools require the \'freesasa\' Python API',
            file=sys.stderr)
        raise ImportError(err)
    import pkg_resources

    _rsa = rel_asa['total']
    _rsa_bb = rel_asa['bb']
    _rsa_sc = rel_asa['sc']

    classifier = Classifier(config_path)
    pkg_resources.cleanup_resources()

    with stdchannel_redirected(sys.stderr, os.devnull):
        struct = structureFromBioPDB(
            structure,
            classifier,
        )
        result = calc(struct)

    # iterate over all atoms to accumulate SASA per residue
    rsa_data, rel_main_chain, rel_side_chain = {}, {}, {}
    for idx in range(struct.nAtoms()):
        atname = struct.atomName(idx).strip()
        resname = struct.residueName(idx)
        resid = int(struct.residueNumber(idx))
        chain = struct.chainLabel(idx)
        res_uid = (chain, resname, resid)

        asa = result.atomArea(idx)
        rsa_data[res_uid] = rsa_data.get(res_uid, 0) + asa
        if atname in ('C', 'N', 'O'):
            rel_main_chain[res_uid] = rel_main_chain.get(res_uid, 0) + asa
        else:
            rel_side_chain[res_uid] = rel_side_chain.get(res_uid, 0) + asa

    # convert absolute areas to relative ones; new dicts are built instead
    # of updating each dict while iterating over it
    rsa_data = {
        res_uid: asa / _rsa[res_uid[1]]
        for res_uid, asa in rsa_data.items()
    }
    rel_main_chain = {
        res_uid: asa / _rsa_bb[res_uid[1]] * 100
        for res_uid, asa in rel_main_chain.items()
    }
    rel_side_chain = {
        res_uid: asa / _rsa_sc[res_uid[1]] * 100
        for res_uid, asa in rel_side_chain.items()
    }

    # We format to fit the pipeline. A residue without side-chain atoms gets
    # 0.0 rather than None, so the comparison below cannot raise TypeError.
    resid_access = {}
    for res_uid, access in rel_main_chain.items():
        resid_access[res_uid[2]] = {
            'side_chain_rel': rel_side_chain.get(res_uid, 0.0),
            'main_chain_rel': access
        }

    surface_resids = [
        r for r, v in resid_access.items()
        if v['side_chain_rel'] >= cutoff or v['main_chain_rel'] >= cutoff
    ]
    return surface_resids
def parse_pdb_coordinates(pdb_path: str,
                          start_position: int,
                          end_position: int,
                          position_correction: int,
                          chain: str,
                          sasa: bool = False) -> DataFrame:
    """
    Parse the coordinates of CA atoms.

    Positions ``start_position + position_correction`` up to (not including)
    ``end_position + position_correction`` of ``chain`` in model 0 are read.
    A position with no residue or no CA atom gets a row of NaN coordinates.

    Returns a DataFrame with columns ``x``, ``y``, ``z``, ``Position``, the
    squared per-axis offsets from the centroid (``x_cent``, ``y_cent``,
    ``z_cent``) and their sum (``Distance``). When ``sasa`` is True, the
    per-residue ``SASA`` from freesasa and ``log B-factor`` (log10 of the CA
    B-factor) are merged in on ``Position``.
    """
    # Get structure from PDB
    structure = PDBParser().get_structure('pdb', pdb_path)

    coordinates = []
    commands = []
    bfactors = []
    positions_worked = []  # positions present in pdb

    # Iterate over each CA atom and get coordinates
    for i in np.arange(start_position + position_correction,
                       end_position + position_correction):
        # Bio.PDB raises KeyError for a missing chain, residue or atom; only
        # that case is treated as "not found".
        try:
            ca_atom = structure[0][chain][int(i)]["CA"]
        except KeyError:
            print("residue {} not found".format(str(i)))
            coordinates.append([np.nan, np.nan, np.nan, i])
            continue

        coordinates.append(list(ca_atom.get_vector()) + [i])
        # freesasa selection command for this residue, named s<position>
        commands.append("s{}, chain {} and resi {}".format(
            str(i), chain, str(i)))
        bfactors.append(np.log10(ca_atom.get_bfactor()))
        positions_worked.append(i)

    # Convert to df
    df_coordinates = DataFrame(columns=['x', 'y', 'z', 'Position'],
                               data=coordinates)

    # Center data
    x, y, z = centroid(df_coordinates)
    df_coordinates['x_cent'] = (df_coordinates['x'] - x).abs()**2
    df_coordinates['y_cent'] = (df_coordinates['y'] - y).abs()**2
    df_coordinates['z_cent'] = (df_coordinates['z'] - z).abs()**2
    df_coordinates['Distance'] = df_coordinates['x_cent'] + df_coordinates[
        'y_cent'] + df_coordinates['z_cent']

    # Add sasa values
    if sasa:
        # Get structure for SASA
        structure_sasa = freesasa.Structure(pdb_path)
        result = freesasa.calc(structure_sasa)
        # Calculate sasa
        sasa_area = freesasa.selectArea(commands, structure_sasa, result)
        df_sasa: DataFrame = DataFrame(columns=['SASA'],
                                       data=list(sasa_area.values()))
        df_sasa['log B-factor'] = bfactors
        df_sasa['Position'] = positions_worked

        # Merge
        df_coordinates = df_coordinates.merge(df_sasa,
                                              how='outer',
                                              on='Position')
    return df_coordinates
def _get_docking_model(self, molecule, restraints):
    """Build a CPyDockModel for ``molecule``.

    Every atom is annotated in place with its AMBER type, charge, mass and
    van der Waals parameters. Per-atom solvation parameters are obtained
    from ``solvation.get_solvation`` and a reference SASA is computed with
    freesasa over the non-hydrogen atoms.

    molecule: object exposing ``atoms``, ``copy_coordinates()`` and,
        optionally, ``n_modes``.
    restraints: container of residue identifiers formatted as
        ``chain.resname.resnum``; may be empty or None.

    Returns the CPyDockModel. ``n_modes`` is passed only when
    ``molecule.n_modes.copy()`` succeeds.
    """
    atoms = molecule.atoms
    parsed_restraints = {}

    # Assign properties to atoms
    for atom_index, atom in enumerate(atoms):
        res_id = "%s.%s.%s" % (atom.chain_id, atom.residue_name,
                               str(atom.residue_number))
        if restraints and res_id in restraints:
            parsed_restraints.setdefault(res_id, []).append(atom_index)

        res_name = atom.residue_name
        atom_name = atom.name
        if res_name == "HIS":
            res_name = 'HID'
        if atom_name in amber.translate:
            atom_name = amber.translate[atom.name]
        atom_id = "%s-%s" % (res_name, atom_name)
        atom.amber_type = amber.amber_types[atom_id]
        atom.charge = amber.charges[atom_id]
        atom.mass = amber.masses[atom.amber_type]
        atom.vdw_energy = vdw.vdw_energy[atom.amber_type]
        atom.vdw_radius = vdw.vdw_radii[atom.amber_type]

    # Prepare common model information
    elec_charges = np.array([atom.charge for atom in atoms])
    vdw_energies = np.array([atom.vdw_energy for atom in atoms])
    vdw_radii = np.array([atom.vdw_radius for atom in atoms])
    coordinates = molecule.copy_coordinates()
    des_energy, des_radii = solvation.get_solvation(molecule)

    # Calculate desolvation reference energy
    log.info('Calculating reference SASA...')
    structure = Structure()
    des_radii_no_H = []
    for atom, radius in zip(atoms, des_radii):
        if not atom.is_hydrogen():
            structure.addAtom(atom.name, atom.residue_name,
                              atom.residue_number, atom.chain_id,
                              atom.x, atom.y, atom.z)
            des_radii_no_H.append(radius)
    structure.setRadii(list(des_radii_no_H))
    sasa_result = freesasa.calc(structure)

    # Hydrogens were left out of the freesasa structure, so its atom index
    # only advances on heavy atoms; hydrogens are marked with -1.0.
    sasa = []
    heavy_index = 0
    for atom in atoms:
        if atom.is_hydrogen():
            sasa.append(-1.0)
        else:
            sasa.append(sasa_result.atomArea(heavy_index))
            heavy_index += 1
    sasa = np.array(sasa)
    # Mask array: 0 for hydrogens, 1 for every other atom.
    hydrogens = np.array(
        [0 if atom.is_hydrogen() else 1 for atom in atoms])
    log.info('Done.')

    reference_points = ModelAdapter.load_reference_points(molecule)
    # Only the n_modes access is guarded, so an AttributeError raised inside
    # the CPyDockModel constructor is no longer masked by a silent retry.
    try:
        n_modes = molecule.n_modes.copy()
    except AttributeError:
        return CPyDockModel(atoms, coordinates, parsed_restraints,
                            elec_charges, vdw_energies, vdw_radii,
                            des_energy, des_radii, sasa, hydrogens,
                            reference_points=reference_points)
    return CPyDockModel(atoms, coordinates, parsed_restraints,
                        elec_charges, vdw_energies, vdw_radii,
                        des_energy, des_radii, sasa, hydrogens,
                        reference_points=reference_points,
                        n_modes=n_modes)
def CalCSASA(self, sasaStruct):
    """Return the result of ``freesasa.calc`` for ``sasaStruct``."""
    return freesasa.calc(sasaStruct)
char_at_base = []

parser = argparse.ArgumentParser()
parser.add_argument("--infile", type=str, default="data/test.zip")
parser.add_argument("--model", type=str, default="model.pkl")
args = parser.parse_args()

#protein_parser = PDBParser()
with temppathlib.TemporaryDirectory() as tmpdir:
    # unzip the file with all the test PDBs
    with zipfile.ZipFile(args.infile, "r") as zip_:
        zip_.extractall(tmpdir.path)

    # Collect polar, apolar and total SASA for every extracted PDB file.
    for test_pdb in tmpdir.path.glob("*.pdb"):
        struct = freesasa.Structure(str(test_pdb))
        result = freesasa.calc(struct)
        areas_classes = freesasa.classifyResults(result, struct)
        # Look the classes up by name: the order of the returned dict depends
        # on which class is met first in the structure, so taking values by
        # position can swap polar and apolar. A class that does not occur
        # contributes 0.0.
        polar_area.append(areas_classes.get('Polar', 0.0))
        apolar_area.append(areas_classes.get('Apolar', 0.0))
        total_area.append(result.totalArea())
    print('done')

with temppathlib.TemporaryDirectory() as tmpdir:
    # unzip the file with all the test PDBs
    with zipfile.ZipFile(args.infile, "r") as zip_:
        zip_.extractall(tmpdir.path)
def handle(self, *args, **options):
    """Compute per-residue angle and exposure measures for reference structures.

    For every selected ``Structure`` row the PDB data is loaded, reduced to
    the preferred chain and to the residues whose generic number label
    matches ``[1-7]x...``, and two angle sets are derived from PCA axes
    fitted to the CA coordinates. Depending on the flags ``SASA``, ``HSE``
    and ``print_pdb`` (read here but defined outside this method), per-residue
    SASA and half-sphere exposure are computed, coloured PDB files are saved
    and the measures are pickled under ``pymol_output/``.

    Structures that raise are reported, collected in ``failed`` and skipped.
    """

    def recurse(entity, slist):
        """
        Filter a pdb structure in a recursive way.

        entity: the pdb entity, a structure should be given on the top level
        slist: the list of filter criteria, one entry per level.

        Children whose id is not in the current level's criteria are
        detached from ``entity``; the others are filtered with the
        remaining levels.
        """
        for subenty in entity.get_list():
            if not subenty.id in slist[0]:
                entity.detach_child(subenty.id)
            elif slist[1:]:
                recurse(subenty, slist[1:])

    def cal_pseudo_CB(r):
        """ Calculate a pseudo CB for glycine (from the Bio pdb faq). """
        a = r['CA'].get_vector()
        n = r['N'].get_vector() - a
        c = r['C'].get_vector() - a
        # rotate the CA->N vector by -120 degrees around the CA->C vector
        rot = pdb.rotaxis(-np.pi * 120.0 / 180.0, c)
        b = n.left_multiply(rot) + a
        return b.get_array()

    def pca_line(pca, h, r=0):
        """
        Fit the pca to h and return the first pc transformed back to
        the original coordinate system.

        The two end points (+/-20 along the first component) are returned in
        an order that depends on the sign of the first transformed value and
        on the flag ``r``.
        """
        if ((not r) if pca.fit_transform(h)[0][0] < 0 else r):
            return pca.inverse_transform(
                np.asarray([[-20, 0, 0], [20, 0, 0]]))
        else:
            return pca.inverse_transform(
                np.asarray([[20, 0, 0], [-20, 0, 0]]))

    def calc_angle(b, c):
        """
        Calculate the angle between c, b and the orthogonal projection
        of b to the x axis. Returns one angle in degrees per row.
        """
        ba = -b
        bc = c + ba
        # drop the first-axis component of ba
        ba[:, 0] = 0
        return np.degrees(
            np.arccos(
                inner1d(ba, bc) / (np.linalg.norm(ba, axis=1) *
                                   np.linalg.norm(bc, axis=1))))

    def ca_cb_calc(ca, cb, pca):
        """ Calculate the angles between ca, cb and center axis """
        return calc_angle(pca.transform(ca), pca.transform(cb))

    def axes_calc(h, p, pca):
        """
        Calculate the orthogonal projection of the CA to the helix axis
        which is moved to the mean of three consecutive amino acids
        """
        # mean of each point with its previous and next neighbour; the first
        # and last rows are repeated at the ends
        a = (np.roll(np.vstack((h, h[0])), 1, axis=0)[:-1] + h +
             np.roll(np.vstack((h, h[-1])), -1, axis=0)[:-1]) / 3
        b = p.transform(h)
        # keep the first component of h, take the others from the means
        b[:, 1:] = p.transform(a)[:, 1:]
        b = p.inverse_transform(b)
        return calc_angle(pca.transform(b), pca.transform(h))

    def set_bfactor(chain, angles):
        """
        Simple helper to set the bfactor of all atoms of each residue
        to the corresponding value of a list.
        """
        for r, an in zip(chain.get_list(), angles):
            for a in r:
                a.set_bfactor(an)

    def qgen(x):
        """
        Helper function to slice the residues of one protein out of the
        list of the residues of all proteins.

        Scans ``qset`` from the end and truncates it in place (``del``), so
        the result depends on the order of the calls.
        """
        start = False
        for i in range(len(qset) - 1, 0, -1):
            if not start and qset[i].protein_conformation.protein == x:
                start = i
            if start and qset[i].protein_conformation.protein != x:
                if start != len(qset) - 1:
                    del qset[start + 1:]
                    return qset[i + 1:]
                return qset[i + 1:]
        del qset[start + 1:]
        return qset

    failed = []

    # get preferred chain for PDB-code
    references = Structure.objects.filter(
        protein_conformation__protein__family__slug__startswith="001"
    ).exclude(refined=True).prefetch_related(
        'pdb_code', 'pdb_data',
        'protein_conformation').order_by('protein_conformation__protein')
    references = list(references)

    pids = [ref.protein_conformation.protein.id for ref in references]

    # residues of all selected proteins, restricted to generic numbers 1x-7x
    qset = Residue.objects.filter(
        protein_conformation__protein__id__in=pids)
    qset = qset.filter(
        generic_number__label__regex=r'^[1-7]x[0-9]+').order_by(
            '-protein_conformation__protein', '-generic_number__label')
    qset = list(
        qset.prefetch_related('generic_number', 'protein_conformation'))

    # residue list per pdb code, sliced out of qset by qgen
    res_dict = {
        ref.pdb_code.index: qgen(ref.protein_conformation.protein)
        for ref in references
    }

    #######################################################################
    ######################### Start of main loop ##########################
    #######################################################################

    for reference in references:

        preferred_chain = reference.preferred_chain.split(',')[0]
        pdb_code = reference.pdb_code.index
        state_id = reference.protein_conformation.state.id

        try:

            print(pdb_code)
            structure = self.load_pdb_var(pdb_code, reference.pdb_data.pdb)
            pchain = structure[0][preferred_chain]

            #######################################################################
            ###################### prepare and evaluate query #####################

            db_reslist = res_dict[pdb_code]

            #######################################################################
            ######################### filter data from db #########################

            def reslist_gen(x):
                # pops residues off the end of db_reslist while the first
                # character of their generic number label equals x
                try:
                    while db_reslist[-1].generic_number.label[0] == x:
                        yield db_reslist.pop()
                except IndexError:
                    pass

            # when gdict is not needed the helper can be removed
            #db_tmlist = [[(' ',r.sequence_number,' ') for r in reslist_gen(x) if r.sequence_number in pchain and r.sequence_number < 1000] for x in ["1","2","3","4","5","6","7"]]
            db_helper = [[
                (r.generic_number.label, r.sequence_number)
                for r in reslist_gen(x)
                if r.sequence_number in pchain and r.sequence_number < 1000
            ] for x in ["1", "2", "3", "4", "5", "6", "7"]]
            # sequence number -> generic number label
            gdict = {r[1]: r[0] for hlist in db_helper for r in hlist}
            # residue ids in Bio.PDB form, one list per label prefix 1-7
            db_tmlist = [[(' ', r[1], ' ') for r in sl] for sl in db_helper]
            db_set = set(db_tmlist[0] + db_tmlist[1] + db_tmlist[2] +
                         db_tmlist[3] + db_tmlist[4] + db_tmlist[5] +
                         db_tmlist[6])

            #######################################################################
            ############################# filter pdb ##############################

            # keep model 0, the preferred chain and the residues in db_set
            recurse(structure, [[0], preferred_chain, db_set])

            #######################################################################
            ############### Calculate the axes through the helices ################
            #######################################################################
            N = 3

            # CA coordinates, one array per entry of db_tmlist
            hres_list = [
                np.asarray([pchain[r]["CA"].get_coord() for r in sl],
                           dtype=float) for sl in db_tmlist
            ]
            # CB coordinates; a pseudo CB is computed when the atom is missing
            h_cb_list = [
                np.asarray([
                    pchain[r]["CB"].get_coord()
                    if "CB" in pchain[r] else cal_pseudo_CB(pchain[r])
                    for r in sl
                ],
                           dtype=float) for sl in db_tmlist
            ]

            # fast and fancy way to take the average of N consecutive elements
            hres_three = np.asarray([
                sum([h[i:-(len(h) % N) or None:N] for i in range(N)]) / N
                for h in hres_list
            ])

            #######################################################################
            ################################# PCA #################################
            #######################################################################

            # one PCA per helix; pca_line is called here for its fitting
            # side effect, the returned lines are discarded
            helix_pcas = [PCA() for i in range(7)]
            [
                pca_line(helix_pcas[i], h, i % 2)
                for i, h in enumerate(hres_three)
            ]

            # extracellular part
            if extra_pca:
                helices_mn = np.asarray(
                    [np.mean(h, axis=0) for h in hres_three])
                pos_list = np.asarray([
                    pca_line(PCA(), h[:len(h) // 2:(-(i % 2) or 1)])
                    for i, h in enumerate(hres_three)
                ])
                pos_list = pos_list - (np.mean(pos_list, axis=1) -
                                       helices_mn).reshape(-1, 1, 3)

                pca = PCA()
                pca_line(pca, np.vstack(pos_list))
            else:
                pca = PCA()
                pca_line(pca, np.vstack(hres_three))

            #######################################################################
            ################################ Angles ###############################
            #######################################################################

            ########################### Axis to CA to CB ##########################

            angle = np.concatenate([
                ca_cb_calc(ca, cb, pca)
                for ca, cb in zip(hres_list, h_cb_list)
            ])

            # the B-factor column is reused to carry each measure in turn
            set_bfactor(pchain, angle)

            if print_pdb:
                self.save_pdb(structure,
                              pdb_code + 'angle_colored_ca_cb.pdb')

            ######################### Axis to Axis to CA ##########################

            angle2 = np.concatenate([
                axes_calc(h, p, pca)
                for h, p in zip(hres_list, helix_pcas)
            ])

            set_bfactor(pchain, angle2)
            if print_pdb:
                self.save_pdb(structure,
                              pdb_code + 'angle_colored_axes.pdb')

            ################################ SASA #################################

            if SASA:
                # NOTE(review): presumably the file written by save_pdb just
                # above, which only happens when print_pdb is set -- confirm
                # save_pdb's output directory.
                pdbstruct = freesasa.Structure("pymol_output/" + pdb_code +
                                               'angle_colored_axes.pdb')
                res = freesasa.calc(pdbstruct)

                # sum atom areas over runs of equal residue number
                asa_list = []
                oldnum = -1
                for i in range(res.nAtoms()):
                    resnum = pdbstruct.residueNumber(i)
                    if resnum == oldnum:
                        asa_list[-1] += res.atomArea(i)
                    else:
                        asa_list.append(res.atomArea(i))
                        oldnum = resnum

                set_bfactor(pchain, asa_list)
                if print_pdb:
                    self.save_pdb(structure, pdb_code + 'asa_colored.pdb')

            ################################# HSE #################################

            if HSE:
                hse = pdb.HSExposure.HSExposureCB(structure[0])
                # store the second value of each exposure tuple as B-factor
                [[a.set_bfactor(x[1][1]) for a in x[0]] for x in hse]

                if print_pdb:
                    self.save_pdb(structure, pdb_code + 'hsea_colored.pdb')

            ############################### pickle ################################

            if HSE and SASA:
                reslist = []
                grslist = []
                hse = []
                for r in pchain:
                    reslist.append(r.id[1])
                    grslist.append(gdict[r.id[1]])
                    # the value stored as B-factor in the HSE step above
                    hse.append(r["CA"].get_bfactor())

                with open('pymol_output/' + pdb_code + '_measures.pickle',
                          'wb') as handle:
                    pickle.dump(
                        (np.array(reslist), grslist, np.array(asa_list),
                         np.array(hse), angle, angle2, state_id), handle)

            #Angle.objects.bulk_create([Angle(residue=gdict[res.id[1]], angle=res["CA"].get_bfactor(), structure=reference) for res in pchain])

        except Exception as e:
            # best effort: report, remember the failing code and go on
            print("ERROR!!", pdb_code, e)
            failed.append(pdb_code)
            continue

    print(len(failed), "of", len(references), "failed:", failed)
def _get_docking_model(self, molecule, restraints):
    """Build a CPyDockModel for ``molecule``.

    Every atom is annotated in place with its AMBER type, charge, mass and
    van der Waals parameters. Per-atom solvation parameters are obtained
    from ``solvation.get_solvation`` and a reference SASA is computed with
    freesasa over the non-hydrogen atoms.

    molecule: object exposing ``atoms``, ``copy_coordinates()`` and,
        optionally, ``n_modes``.
    restraints: container of residue identifiers formatted as
        ``chain.resname.resnum``; may be empty or None.

    Returns the CPyDockModel. ``n_modes`` is passed only when
    ``molecule.n_modes.copy()`` succeeds.
    """
    atoms = molecule.atoms
    parsed_restraints = {}

    # Assign properties to atoms
    for atom_index, atom in enumerate(atoms):
        res_id = "%s.%s.%s" % (atom.chain_id, atom.residue_name,
                               str(atom.residue_number))
        if restraints and res_id in restraints:
            parsed_restraints.setdefault(res_id, []).append(atom_index)

        res_name = atom.residue_name
        atom_name = atom.name
        if res_name == "HIS":
            res_name = 'HID'
        if atom_name in amber.translate:
            atom_name = amber.translate[atom.name]
        atom_id = "%s-%s" % (res_name, atom_name)
        atom.amber_type = amber.amber_types[atom_id]
        atom.charge = amber.charges[atom_id]
        atom.mass = amber.masses[atom.amber_type]
        atom.vdw_energy = vdw.vdw_energy[atom.amber_type]
        atom.vdw_radius = vdw.vdw_radii[atom.amber_type]

    # Prepare common model information
    elec_charges = np.array([atom.charge for atom in atoms])
    vdw_energies = np.array([atom.vdw_energy for atom in atoms])
    vdw_radii = np.array([atom.vdw_radius for atom in atoms])
    coordinates = molecule.copy_coordinates()
    des_energy, des_radii = solvation.get_solvation(molecule)

    # Calculate desolvation reference energy
    log.info('Calculating reference SASA...')
    structure = Structure()
    des_radii_no_H = []
    for atom, radius in zip(atoms, des_radii):
        if not atom.is_hydrogen():
            structure.addAtom(atom.name, atom.residue_name,
                              atom.residue_number, atom.chain_id,
                              atom.x, atom.y, atom.z)
            des_radii_no_H.append(radius)
    structure.setRadii(list(des_radii_no_H))
    sasa_result = freesasa.calc(structure)

    # Hydrogens were left out of the freesasa structure, so its atom index
    # only advances on heavy atoms; hydrogens are marked with -1.0.
    sasa = []
    heavy_index = 0
    for atom in atoms:
        if atom.is_hydrogen():
            sasa.append(-1.0)
        else:
            sasa.append(sasa_result.atomArea(heavy_index))
            heavy_index += 1
    sasa = np.array(sasa)
    # Mask array: 0 for hydrogens, 1 for every other atom.
    hydrogens = np.array(
        [0 if atom.is_hydrogen() else 1 for atom in atoms])
    log.info('Done.')

    reference_points = ModelAdapter.load_reference_points(molecule)
    # Only the n_modes access is guarded, so an AttributeError raised inside
    # the CPyDockModel constructor is no longer masked by a silent retry.
    try:
        n_modes = molecule.n_modes.copy()
    except AttributeError:
        return CPyDockModel(atoms, coordinates, parsed_restraints,
                            elec_charges, vdw_energies, vdw_radii,
                            des_energy, des_radii, sasa, hydrogens,
                            reference_points=reference_points)
    return CPyDockModel(atoms, coordinates, parsed_restraints,
                        elec_charges, vdw_energies, vdw_radii,
                        des_energy, des_radii, sasa, hydrogens,
                        reference_points=reference_points,
                        n_modes=n_modes)