def write_superposed_pdbs(self, output_pdb_folder, alignments: dict = None):
    """
    Superposes PDBs according to alignment and writes transformed PDBs to files
    (View with Pymol)

    Parameters
    ----------
    alignments
    output_pdb_folder
    """
    if alignments is None:
        alignments = self.alignment
    output_pdb_folder = Path(output_pdb_folder)
    if not output_pdb_folder.exists():
        output_pdb_folder.mkdir()
    reference_name = self.structures[0].name
    reference_pdb = pd.parsePDB(
        str(self.output_folder / f"cleaned_pdb/{self.structures[0].name}.pdb")
    )
    core_indices = np.array(
        [
            i
            for i in range(len(alignments[reference_name]))
            if -1 not in [alignments[n][i] for n in alignments]
        ]
    )
    aln_ref = alignments[reference_name]
    ref_coords_core = (
        reference_pdb[helper.get_alpha_indices(reference_pdb)]
        .getCoords()
        .astype(np.float64)[np.array([aln_ref[c] for c in core_indices])]
    )
    ref_centroid = helper.nb_mean_axis_0(ref_coords_core)
    ref_coords_core -= ref_centroid
    transformation = pd.Transformation(np.eye(3), -ref_centroid)
    reference_pdb = pd.applyTransformation(transformation, reference_pdb)
    pd.writePDB(str(output_pdb_folder / f"{reference_name}.pdb"), reference_pdb)
    for i in range(1, len(self.structures)):
        name = self.structures[i].name
        pdb = pd.parsePDB(
            str(self.output_folder / f"cleaned_pdb/{self.structures[i].name}.pdb")
        )
        aln_name = alignments[name]
        common_coords_2 = (
            pdb[helper.get_alpha_indices(pdb)]
            .getCoords()
            .astype(np.float64)[np.array([aln_name[c] for c in core_indices])]
        )
        (
            rotation_matrix,
            translation_matrix,
        ) = superposition_functions.svd_superimpose(ref_coords_core, common_coords_2)
        transformation = pd.Transformation(rotation_matrix.T, translation_matrix)
        pdb = pd.applyTransformation(transformation, pdb)
        pd.writePDB(str(output_pdb_folder / f"{name}.pdb"), pdb)
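# The method above (and write_superposed_pdbs_reference below) delegates the
# fit to superposition_functions.svd_superimpose. For reference, a minimal
# sketch of such a routine (standard Kabsch algorithm); only the name and the
# (rotation, translation) return order are taken from the calls here,
# everything else is an assumption.
import numpy as np

def svd_superimpose(coords_ref, coords_mobile):
    """Least-squares fit of coords_mobile onto coords_ref.

    Returns (rotation, translation) such that
    coords_mobile @ rotation.T + translation approximates coords_ref.
    """
    centroid_ref = coords_ref.mean(axis=0)
    centroid_mobile = coords_mobile.mean(axis=0)
    # Covariance of the centered coordinate sets
    cov = (coords_mobile - centroid_mobile).T @ (coords_ref - centroid_ref)
    u, _, vt = np.linalg.svd(cov)
    d = np.sign(np.linalg.det(vt.T @ u.T))          # reflection check
    rotation = vt.T @ np.diag([1.0, 1.0, d]) @ u.T  # proper rotation, det = +1
    translation = centroid_ref - rotation @ centroid_mobile
    return rotation, translation

# Note: the callers transpose the returned rotation before building
# pd.Transformation; whether that transpose is required depends on ProDy's
# row- versus column-vector convention, so treat it as part of the contract
# defined by the real superposition_functions module.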
def write_superposed_pdbs_reference(self, output_pdb_folder, alignments):
    """
    Superposes PDBs onto the reference structure and writes transformed PDBs
    to files (View with Pymol)

    Parameters
    ----------
    alignments
    output_pdb_folder
    """
    reference_name = self.structures[self.reference_structure_index].name
    reference_pdb = pd.parsePDB(
        str(
            self.output_folder
            / f"cleaned_pdb/{self.structures[self.reference_structure_index].name}.pdb"
        )
    )
    aln_ref = alignments[reference_name]
    reference_coords = (
        reference_pdb[helper.get_alpha_indices(reference_pdb)]
        .getCoords()
        .astype(np.float64)
    )
    pd.writePDB(str(output_pdb_folder / f"{reference_name}.pdb"), reference_pdb)
    for i in range(len(self.structures)):
        # Skip the reference structure itself
        if i == self.reference_structure_index:
            continue
        name = self.structures[i].name
        pdb = pd.parsePDB(
            str(self.output_folder / f"cleaned_pdb/{self.structures[i].name}.pdb")
        )
        aln_name = alignments[name]
        common_coords_1, common_coords_2 = get_common_coordinates(
            reference_coords,
            pdb[helper.get_alpha_indices(pdb)].getCoords().astype(np.float64),
            aln_ref,
            aln_name,
        )
        (
            rotation_matrix,
            translation_matrix,
        ) = superposition_functions.svd_superimpose(common_coords_1, common_coords_2)
        transformation = pd.Transformation(rotation_matrix.T, translation_matrix)
        pdb = pd.applyTransformation(transformation, pdb)
        pd.writePDB(str(output_pdb_folder / f"{name}.pdb"), pdb)
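# get_common_coordinates is not shown in this section. A plausible sketch,
# assuming the alignments are integer index arrays with -1 marking gaps
# (consistent with the -1 test in write_superposed_pdbs above); the signature
# is taken from the call, the body is an assumption.
import numpy as np

def get_common_coordinates(coords_1, coords_2, aln_1, aln_2, gap=-1):
    """Return coordinate pairs at alignment columns where neither is a gap."""
    assert len(aln_1) == len(aln_2)
    pairs = np.array([(i, j) for i, j in zip(aln_1, aln_2)
                      if i != gap and j != gap])
    return coords_1[pairs[:, 0]], coords_2[pairs[:, 1]]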
def apply_transformation(self, pdb_file, ligand_resnum, target_fragment_atom_serials, transformation_matrix):
    """
    Apply transformation to the target ligand-protein complex.

    Also considering:
    * Only work with residues with CA within 12 A of the ligand
    * Write all transformed PDBs to a new working directory?

    :param pdb_file: path to the target protein-ligand complex PDB
    :param ligand_resnum: residue number of the target ligand
    :param target_fragment_atom_serials: serial numbers of the mapped fragment atoms
    :param transformation_matrix: transformation to apply
    :return: transformed binding-shell selection
    """
    # Only work with residues within 12 A of the target ligand
    target_pdb = prody.parsePDB(pdb_file)
    target_shell = target_pdb.select(
        '(protein and within 12 of (serial {0}) and not resnum {1}) or (serial {0})'
        .format(' '.join(target_fragment_atom_serials), ligand_resnum))
    transformed_pdb = prody.applyTransformation(transformation_matrix, target_shell)
    return transformed_pdb
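# To make the selection semantics concrete: for hypothetical serials
# ['100', '101'] and ligand residue 350, the formatted string selects every
# protein atom within 12 A of the mapped fragment atoms (excluding the ligand
# itself) plus the fragment atoms themselves.
serials = ['100', '101']  # hypothetical fragment atom serials
ligand_resnum = 350       # hypothetical ligand residue number
sel = ('(protein and within 12 of (serial {0}) and not resnum {1}) '
       'or (serial {0})').format(' '.join(serials), ligand_resnum)
print(sel)
# (protein and within 12 of (serial 100 101) and not resnum 350) or (serial 100 101)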
import numpy as np
from prody import (ANM, Transformation, applyTransformation, extendModel,
                   moveAtoms, parsePDB, saveAtoms, saveModel)

args = get_arguments()

protein = parsePDB(args.pdb_structure)
protein = protein.select('protein').copy()
logger.info('%s loaded' % args.pdb_structure)

if args.to_center:
    logger.info('Moving original structure to the center')
    moveAtoms(protein, to=np.zeros(3), ag=True)

if args.random_rotation:
    logger.info('Rotating the structure randomly')
    random_rotation_matrix = get_affine(get_random_rotation_matrix())
    random_rotation = Transformation(random_rotation_matrix)
    applyTransformation(random_rotation, protein)

# Build an anisotropic network model on the C-alpha atoms
ca_atoms = protein.select('protein and name CA')
protein_anm = ANM('%s ca' % args.structure_name)
protein_anm.buildHessian(ca_atoms)
protein_anm.calcModes(n_modes=args.normal_modes)
logger.info('Normal modes calculated')

# Extend the CA-only modes to all atoms of the protein
protein_anm_ext, protein_all = extendModel(protein_anm, ca_atoms, protein,
                                           norm=True)
logger.info('Normal modes extended')

if args.save_models:
    saveAtoms(protein, args.structure_name)
    saveModel(protein_anm, args.structure_name)
    saveModel(protein_anm_ext, args.structure_name)
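# get_random_rotation_matrix and get_affine are helpers defined elsewhere in
# this repo. A minimal sketch of what they plausibly do, assuming a QR-based
# uniform random rotation and a 4x4 affine input to ProDy's Transformation;
# only the two names and the call pattern above are taken from the source.
import numpy as np

def get_random_rotation_matrix():
    """Random 3x3 rotation: QR-decompose a Gaussian matrix, then fix signs."""
    a = np.random.normal(size=(3, 3))
    q, r = np.linalg.qr(a)
    q = q * np.sign(np.diag(r))   # make the factorization unique
    if np.linalg.det(q) < 0:      # ensure a proper rotation (det = +1)
        q[:, 0] *= -1
    return q

def get_affine(rotation):
    """Embed a 3x3 rotation in the 4x4 matrix Transformation accepts."""
    affine = np.eye(4)
    affine[:3, :3] = rotation
    return affine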
def score_interaction_and_dump(parsed, ifgresn, vdmresn, ifg_contact_atoms,
                               vdm_contact_atoms, method, targetresi, cutoff,
                               pdbix, pdbname):
    cutoff = float(cutoff)
    ifgtype, vdmtype, ifginfo, vdminfo = get_ifg_vdm(
        parsed, ifgresn, vdmresn, ifg_contact_atoms, vdm_contact_atoms, method)

    if ifgtype[1] != ['N', 'CA', 'C'] and ifgtype[1] != ['CA', 'C', 'O']:
        ifgresn = constants.AAname_rev[ifgtype[0]]
        vdmresn = constants.AAname_rev[vdmtype[0]]
        ifgatoms = ifgtype[1]
        vdmatoms = vdmtype[1]

        # filter for only vdmresn vdms of ifgresn with ifgatoms
        # and vdmatoms directly involved in interactions
        num_all_vdms, lookupdf = filter_contact(ifgresn, vdmresn, ifgatoms, vdmatoms)

        # Build the query coordinates from the iFG and vdM contact atoms
        query = []
        for atom in ifgatoms:
            query.append(
                parsed.select('chain {} and resnum {} and name {}'.format(
                    ifginfo[0], ifginfo[1], atom)).getCoords()[0])
        for atom in vdmatoms:
            query.append(
                parsed.select('chain {} and resnum {} and name {}'.format(
                    vdminfo[0], vdminfo[1], atom)).getCoords()[0])
        query = np.array(query)

        with open(
                '/home/gpu/Sophia/combs/st_wd/Lookups/refinedvdms/coords_of_{}.pkl'
                .format(ifgtype[0]), 'rb') as f:
            lookupcoords = pkl.load(f)
        # lookupcoords = lookupcoords[:50]  # delete

        ifglists = flip(ifgatoms, ifgresn)
        vdmlists = flip(vdmatoms, vdmresn)
        rmsds = []
        num_atoms = len(query)
        coords_ls = [item for item in lookupcoords if item[0] in lookupdf.index]
        lookupatoms_to_clus = []
        counter = 0  # to keep count of how many pdbs are being output
        for item in coords_ls:
            if len(item) == 3:
                compare_rmsds = []
                ifg_vdm_ind = []
                for ifg_ind, ifgls in enumerate(ifglists):
                    for vdm_ind, vdmls in enumerate(vdmlists):
                        lookupatoms = get_order_of_atoms(
                            item, ifgresn, vdmresn, ifgls, vdmls)
                        moved, transf = pr.superpose(lookupatoms, query)
                        temp_rmsd = pr.calcRMSD(moved, query)
                        compare_rmsds.append(temp_rmsd)
                        ifg_vdm_ind.append([moved, temp_rmsd])
                # item[0] is df index
                rmsds.append([item[0], min(compare_rmsds)])
                # get index of which one had min rmsd
                for which_ind, each in enumerate(ifg_vdm_ind):
                    if each[1] == min(compare_rmsds):
                        lookupatoms_to_clus.append(each[0])
                        ########################################################
                        # output pdb if low rmsd
                        ########################################################
                        if each[1] < cutoff and counter < 30 and which_ind == 0:
                            # this is to ensure rmsd is below cutoff when not flipped
                            # bc don't want to take care of that in prody to output pdb
                            row = lookupdf.loc[item[0]]
                            try:
                                db_dir = '/home/gpu/Sophia/STcombs/20171118/database/reduce/'
                                par = pr.parsePDB(db_dir + row['pdb'] + 'H.pdb')
                            except Exception:
                                db_dir = '/home/gpu/Sophia/combs/st_wd/20180207_db_molprobity_biolassem/'
                                par = pr.parsePDB(db_dir + row['pdb'] + 'H.pdb')

                            ifgchid, ifgresnum = row['chid_ifg'], row['resnum_ifg']
                            vdmchid, vdmresnum = row['chid_vdm'], row['resnum_vdm']
                            printout = copy.deepcopy(par)
                            printout = printout.select(
                                '(chain {} and resnum {}) or (chain {} and resnum {})'
                                .format(ifgchid, ifgresnum, vdmchid, vdmresnum))
                            printout.select('chain {} and resnum {}'.format(
                                ifgchid, ifgresnum)).setChids('Y')
                            printout.select('chain {} and resnum {}'.format(
                                vdmchid, vdmresnum)).setChids('X')
                            printout.select('all').setResnums(10)
                            printout_interactamer = []
                            integrin_interactamer = []
                            try:
                                # skip the ones that have segment ids; will prob
                                # need to update this for the newly combed stuff
                                for atom in ifgatoms:
                                    integrin_interactamer.append(
                                        parsed.select(
                                            'chain {} and resnum {} and name {}'
                                            .format(ifginfo[0], ifginfo[1], atom)))
                                    printout_interactamer.append(
                                        printout.select(
                                            'chain Y and resnum 10 and name {}'
                                            .format(atom)))
                                for atom in vdmatoms:
                                    integrin_interactamer.append(
                                        parsed.select(
                                            'chain {} and resnum {} and name {}'
                                            .format(vdminfo[0], vdminfo[1], atom)))
                                    printout_interactamer.append(
                                        printout.select(
                                            'chain X and resnum 10 and name {}'
                                            .format(atom)))
                                integrin_interactamer_prody = []
                                integrin_interactamer = sum(
                                    integrin_interactamer[1:], integrin_interactamer[0])
                                printout_interactamer = sum(
                                    printout_interactamer[1:], printout_interactamer[0])
                                try:
                                    assert len(integrin_interactamer) == len(printout_interactamer)
                                    interact_res = printout.select(
                                        '(chain X and resnum 10) or (chain Y and resnum 10)')
                                    interactamer_transf = pr.applyTransformation(
                                        transf, printout_interactamer)
                                    outdir = './output_data/pdbfiles/'
                                    threecode = constants.AAname[ifgresn]
                                    pr.writePDB(
                                        outdir + '{}_{}_{}_{}{}_{}{}_{}_{}'.format(
                                            pdbix, pdbname, targetresi,
                                            ifginfo[1], ifgresn, vdminfo[1],
                                            vdmresn, cutoff, row.name),
                                        interactamer_transf)
                                    counter += 1
                                except Exception:
                                    pass
                            except Exception:
                                traceback.print_exc()
            else:
                rmsds.append([int(item[0]), 100000])

        # count how many NNs the query intrxn has
        num_nn, norm_metrics = get_NN(lookupatoms_to_clus, num_atoms, rmsds,
                                      query, cutoff, num_all_vdms)
        print('num NN')
        print(num_nn)
        exp_list = norm_metrics[-1]
        print('======= FOR NEAREST NEIGHBORS ==========')
        print('avg with single')
        print(exp_list[0])
        print('avg without single')
        print(exp_list[1])
        print('median with single')
        print(exp_list[2])
        print('median without single')
        print(exp_list[3])

        # do greedy clustering
        D = make_pairwise_rmsd_mat(np.array(lookupatoms_to_clus).astype('float32'))
        D = make_square(D)
        adj_mat = make_adj_mat(D, 0.5)
        mems, centroids = greedy(adj_mat)
        print('======= FOR GREEDY CLUS ==========')
        print('avg with singletons')
        print(np.mean([len(x) for x in mems]))
        print('avg without singletons')
        print(np.mean([len(x) for x in mems if len(x) > 1]))
        print('median with singletons')
        print(np.median([len(x) for x in mems]))
        print('median without singletons')
        print(np.median([len(x) for x in mems if len(x) > 1]))
        return (ifginfo[0], ifginfo[1], ifgresn, vdminfo[0], vdminfo[1],
                vdmresn, ifgatoms, vdmatoms, num_nn, norm_metrics)
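# make_pairwise_rmsd_mat, make_square, make_adj_mat, and greedy are defined
# elsewhere. A minimal sketch of the last two under stated assumptions
# (threshold a square RMSD matrix into a boolean adjacency matrix, then
# repeatedly claim the densest remaining neighborhood as a cluster); only the
# names, call order, and return shapes are taken from the code above.
import numpy as np

def make_adj_mat(dist_mat, cutoff):
    """True where pairwise RMSD <= cutoff (diagonal included)."""
    return dist_mat <= cutoff

def greedy(adj_mat):
    """Greedy clustering: pick the node with the most unassigned neighbors
    as a centroid, claim its neighborhood, repeat until all are assigned."""
    adj = np.asarray(adj_mat, dtype=bool)
    unassigned = np.ones(len(adj), dtype=bool)
    mems, centroids = [], []
    while unassigned.any():
        # Count unassigned neighbors; zero out already-assigned candidates
        counts = (adj & unassigned).sum(axis=1) * unassigned
        center = int(np.argmax(counts))
        cluster = np.where(adj[center] & unassigned)[0]
        mems.append(cluster)
        centroids.append(center)
        unassigned[cluster] = False
    return mems, centroids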
def alignment_monstrosity(self, rmsd_cutoff=0.5, use_local_pdb_database=False,
                          verify_substructure=True):
    """
    Consequences of not thinking ahead...

    For each fragment, align all fragment-containing ligands to the fragment
    and generate PDBs with aligned coordinate systems

    :param rmsd_cutoff: fragment alignment RMSD cutoff; anything higher gets rejected
    :param use_local_pdb_database: if True, use structures from the local PDB
        database instead of downloading them via FTP
    :param verify_substructure: passed through to Align_PDB
    """
    # Load the record of previously rejected PDBs
    rejected_dict = self.load_previously_rejected_pdbs()

    # Create directories...
    if not use_local_pdb_database:
        os.makedirs(self.pdb_bank_dir, exist_ok=True)
    os.makedirs(self.processed_PDBs_path, exist_ok=True)

    # If use_local_pdb_database=False, use PDB FTP to download all structures
    # Otherwise, all relevant structures should be found in the local PDB database
    if not use_local_pdb_database:
        prody.pathPDBFolder(folder=self.pdb_bank_dir)
        for current_fragment in self.pdb_ligand_json:
            # Only download PDBs that aren't already in the PDB bank directory
            existing_PDBs = [pdb[:4].lower() for pdb in os.listdir(self.pdb_bank_dir)]
            PDBs_to_download = list(
                set(self.pdb_ligand_json[current_fragment]['PDBs']) - set(existing_PDBs))
            if len(PDBs_to_download) > 0:
                print(f'Downloading PDBs for {current_fragment}...\n')
                prody.fetchPDBviaFTP(*PDBs_to_download)
            else:
                print(f'All relevant PDBs for {current_fragment} found in {self.pdb_bank_dir}!\n')

    # Fragment_1, Fragment_2, ...
    for current_fragment in self.pdb_ligand_json:
        # Create directory for processed PDBs
        processed_dir = os.path.join(self.processed_PDBs_path, current_fragment)
        processed_dir_exists = os.path.exists(processed_dir)
        os.makedirs(processed_dir, exist_ok=True)

        # Get list of already processed PDBs for current_fragment
        already_processed_pdbs = [file[:4].lower() for file in os.listdir(processed_dir)]

        # Save ideal_ligand_containers for each fragment so things are only downloaded once
        ideal_ligand_dict = dict()
        ideal_ligand_dict['Ligands'] = dict()
        ideal_ligand_dict['Failed'] = list()

        # Align_PDB class holds all information for the current fragment
        align = Align_PDB(self.user_defined_dir,
                          current_fragment,
                          self.sanitized_smiles_dict[current_fragment],
                          verify_substructure=verify_substructure)

        # Get PDB IDs that are viable for extracting protein-fragment contacts
        reject_pdbs = rejected_dict.get(current_fragment, list())
        if not processed_dir_exists:
            reject_pdbs = list()
        reject_pdbs.append('3k87')  # DEBUGGING

        viable_pdbs = list(
            set(self.pdb_ligand_json[current_fragment]['PDBs'])
            - set(reject_pdbs)
            - set(already_processed_pdbs))

        # For each PDB containing a fragment-containing compound
        for pdbid in viable_pdbs:
            # Return path of PDB file to use for processing
            found_pdb, pdb_path = self.return_PDB_to_use_for_alignments(
                pdbid, use_local_pdb_database=use_local_pdb_database)
            if not found_pdb:
                print(f'Cannot find {pdbid}!')
                continue

            # Proceed with processing if the current PDB passes all filters
            print("\n\nProcessing {}...".format(pdbid))

            # --- Check which ligands contain relevant fragments --- #
            relevant_ligands = self.return_substructure_containing_ligands(
                pdb_path, self.pdb_ligand_json, current_fragment)

            # Set things up! Get ligands from Ligand Expo if we haven't already tried and failed
            for ligand in relevant_ligands:
                if not ideal_ligand_dict['Ligands'].get(ligand) and ligand not in ideal_ligand_dict['Failed']:
                    ideal_ligand_container = Ideal_Ligand_PDB_Container(ligand)
                    if ideal_ligand_container.success:
                        ideal_ligand_dict['Ligands'][ligand] = ideal_ligand_container
                    else:
                        ideal_ligand_dict['Failed'].append(ligand)

            # Create a temp list for ligands that will be pulled from the current PDB
            ligand_container_dict_for_current_pdb = {
                lig: ideal_ligand_dict['Ligands'][lig]
                for lig in ideal_ligand_dict['Ligands'] if lig in relevant_ligands
            }
            relevant_ligands_prody_dict = align.extract_ligand_records(
                pdb_path, ligand_container_dict_for_current_pdb)

            # Reject if no ligands with all atoms represented can be found for the given PDB
            if len(relevant_ligands_prody_dict) < 1:
                rejected_dict.setdefault(current_fragment, []).append(pdbid)
                print('REJECTED - no target ligands were fully represented in the PDB')
                continue

            # --- Perform alignment of PDB fragment substructure (mobile) onto defined fragment (target) --- #
            # ...if PDB has not been processed, rejected, or excluded by the user
            else:
                # Iterate over ligands found to contain fragments as substructures
                for ligand_resname, ligand_chain, ligand_resnum in relevant_ligands_prody_dict:
                    # Mapping of fragment atoms to target ligand atoms
                    target_ligand_ideal_smiles = ligand_container_dict_for_current_pdb[ligand_resname].smiles

                    # todo: catch ligands with missing SMILES strings earlier...
                    if target_ligand_ideal_smiles is None:
                        continue

                    target_ligand_pdb_string = io.StringIO()
                    target_ligand_prody = relevant_ligands_prody_dict[
                        (ligand_resname, ligand_chain, ligand_resnum)].select('not hydrogen')
                    prody.writePDBStream(target_ligand_pdb_string, target_ligand_prody)

                    mapping_successful, fragment_target_map = align.fragment_target_mapping(
                        target_ligand_ideal_smiles, target_ligand_pdb_string)
                    if not mapping_successful:
                        rejected_dict.setdefault(current_fragment, []).append(pdbid)
                        print('REJECTED - failed atom mapping between target and reference fragment')
                        continue

                    print(f'\n{len(fragment_target_map)} possible mapping(s) of fragment onto {pdbid}:{ligand_resname} found...\n')

                    # Iterate over possible mappings of fragment onto current ligand
                    rmsd_success = False
                    for count, mapping in enumerate(fragment_target_map):
                        # todo: refactor to use RDKit's atom.GetMonomerInfo() for atom selections...
                        # Determine translation vector and rotation matrix
                        target_coords_and_serials, frag_atom_coords, transformation_matrix = \
                            align.determine_rotation_and_translation(mapping, target_ligand_prody)
                        trgt_atom_coords, target_fragment_atom_serials = target_coords_and_serials

                        # Apply transformation to the protein-ligand complex if the RMSD is below the cutoff
                        # Use information from the PubChem fragment SMILES in determining correct mappings
                        # Actually, map fragment onto source ligand and use valence information to determine correct mappings
                        rmsd = prody.calcRMSD(
                            frag_atom_coords,
                            prody.applyTransformation(transformation_matrix, trgt_atom_coords))
                        print('RMSD of target onto reference fragment:\t{}'.format(rmsd))

                        if rmsd < rmsd_cutoff:
                            transformed_pdb = align.apply_transformation(
                                pdb_path, ligand_resnum,
                                target_fragment_atom_serials, transformation_matrix)

                            # Continue if transformed_pdb - ligand is None
                            if transformed_pdb.select(f'not (resname {ligand_resname})') is None:
                                continue

                            transformed_pdb_name = f'{pdbid}_{ligand_resname}_{ligand_chain}_{ligand_resnum}-{count}.pdb'
                            prody.writePDB(
                                os.path.join(processed_dir, transformed_pdb_name),
                                transformed_pdb)
                            rmsd_success = True
                        else:
                            print('REJECTED - high RMSD upon alignment to reference fragment')

                    if rmsd_success is False:
                        rejected_dict.setdefault(current_fragment, []).append(pdbid)

    # Remember rejected PDBs
    with open(self.rejected_dict_pickle, 'wb') as reject_pickle:
        pickle.dump(rejected_dict, reject_pickle)
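# The rejection bookkeeping above reduces to a pickled dict mapping fragment ->
# list of rejected PDB IDs. A minimal round-trip sketch; the standalone loader
# and the 'rejected.pkl' path are hypothetical stand-ins for
# self.load_previously_rejected_pdbs() and self.rejected_dict_pickle.
import pickle

def load_previously_rejected_pdbs(rejected_dict_pickle):
    """Return the persisted rejection dict, or an empty one on first run."""
    try:
        with open(rejected_dict_pickle, 'rb') as fh:
            return pickle.load(fh)
    except FileNotFoundError:
        return {}

rejected_dict = load_previously_rejected_pdbs('rejected.pkl')  # hypothetical path
rejected_dict.setdefault('Fragment_1', []).append('3k87')
with open('rejected.pkl', 'wb') as fh:
    pickle.dump(rejected_dict, fh)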