def prody_align(opt):
    """Align models in a PDB file or a PDB file onto others.

    If ``opt.pdb`` holds a single entry, the coordinate sets of that
    structure are aligned using the atoms matched by ``opt.select`` and the
    structure is written to ``<prefix>.pdb`` (``<title>_aligned.pdb`` when
    ``opt.prefix`` is an empty string).  Otherwise the first entry is the
    reference and every other entry is aligned onto it with
    ``prody.matchAlign`` and written to ``<title>_aligned.pdb``.

    :arg opt: options object; the attributes read are ``pdb`` (sequence of
        PDB filenames or identifiers), ``select``, ``prefix``, ``model``,
        ``seqid``, ``overlap`` and ``subparser``

    ``opt.pdb`` is not modified by this function."""

    import prody
    LOGGER = prody.LOGGER

    args = opt.pdb
    if len(args) == 1:
        pdb = args[0]
        LOGGER.info('Aligning multiple models in: ' + pdb)
        selstr, prefix, model = opt.select, opt.prefix, opt.model
        pdb = prody.parsePDB(pdb)
        pdbselect = pdb.select(selstr)
        if pdbselect is None:
            opt.subparser.error('Selection {0:s} do not match any atoms.'
                                .format(repr(selstr)))
        LOGGER.info('{0:d} atoms will be used for alignment.'
                    .format(len(pdbselect)))
        # ``model`` is given 1-based; ``model - 1`` is the coordinate set index
        pdbselect.setACSIndex(model - 1)
        prody.printRMSD(pdbselect, msg='Before alignment ')
        prody.alignCoordsets(pdbselect)
        prody.printRMSD(pdbselect, msg='After alignment ')
        if prefix == '':
            prefix = pdb.getTitle() + '_aligned'
        outfn = prefix + '.pdb'
        LOGGER.info('Writing file: ' + outfn)
        prody.writePDB(outfn, pdb)
    else:
        # Split without ``pop`` so that the caller's ``opt.pdb`` list is left
        # intact.
        reffn, targets = args[0], args[1:]
        seqid = opt.seqid
        overlap = opt.overlap
        LOGGER.info('Aligning structures onto: ' + reffn)
        ref = prody.parsePDB(reffn)
        for arg in targets:
            if arg == reffn:
                continue
            # skip files that look like output of a previous run
            if '_aligned.pdb' in arg:
                continue
            pdb = prody.parsePDB(arg)
            result = prody.matchAlign(pdb, ref, seqid=seqid, overlap=overlap,
                                      tarsel=opt.select, allcsets=True,
                                      cslabel='Model', csincr=1)
            if result:
                outfn = pdb.getTitle() + '_aligned.pdb'
                LOGGER.info('Writing file: ' + outfn)
                prody.writePDB(outfn, pdb)
            else:
                LOGGER.warning('Failed to align ' + arg)
def prody_align(opt):
    """Align models in a PDB file or a PDB file onto others.

    If ``opt.pdb`` holds a single entry, all coordinate sets of that
    structure are aligned based on the atoms matched by ``opt.select``, the
    maximum and mean RMSD are logged, and the structure is written to
    ``<prefix>.pdb`` (``<title>_aligned.pdb`` when ``opt.prefix`` is an empty
    string).  The interpreter exits with status ``-1`` when the selection
    matches no atoms.  Otherwise the first entry is the reference and every
    other entry is aligned onto it with ``prody.matchAlign`` and written to
    ``<title>_aligned.pdb``.

    :arg opt: options object; the attributes read are ``pdb`` (sequence of
        PDB filenames or identifiers), ``select``, ``prefix`` and ``model``

    ``opt.pdb`` is not modified by this function."""

    # ``sys`` is imported here so the block does not depend on a file-level
    # import being present.
    import sys
    import prody
    LOGGER = prody.LOGGER

    args = opt.pdb
    if len(args) == 1:
        pdb = args[0]
        LOGGER.info('Aligning multiple models in: ' + pdb)
        selstr, prefix, model = opt.select, opt.prefix, opt.model
        pdb = prody.parsePDB(pdb)
        pdbselect = pdb.select(selstr)
        if pdbselect is None:
            LOGGER.warning('Selection "{0:s}" do not match any atoms.'
                           .format(selstr))
            sys.exit(-1)
        LOGGER.info('{0:d} atoms will be used for alignment.'
                    .format(len(pdbselect)))
        # ``model`` is given 1-based; ``model - 1`` is the coordinate set index
        pdb.setACSIndex(model - 1)
        prody.alignCoordsets(pdb, selstr=selstr)
        rmsd = prody.calcRMSD(pdb)
        LOGGER.info('Max RMSD: {0:0.2f} Mean RMSD: {1:0.2f}'
                    .format(rmsd.max(), rmsd.mean()))
        if prefix == '':
            prefix = pdb.getTitle() + '_aligned'
        outfn = prefix + '.pdb'
        LOGGER.info('Writing file: ' + outfn)
        prody.writePDB(outfn, pdb)
    else:
        # Split without ``pop`` so that the caller's ``opt.pdb`` list is left
        # intact.
        reffn, targets = args[0], args[1:]
        LOGGER.info('Aligning structures onto: ' + reffn)
        ref = prody.parsePDB(reffn)
        for arg in targets:
            if arg == reffn:
                continue
            # skip files that look like output of a previous run
            if '_aligned.pdb' in arg:
                continue
            pdb = prody.parsePDB(arg)
            if prody.matchAlign(pdb, ref):
                outfn = pdb.getTitle() + '_aligned.pdb'
                LOGGER.info('Writing file: ' + outfn)
                prody.writePDB(outfn, pdb)
            else:
                LOGGER.warning('Failed to align ' + arg)
def prody_align(*pdbs, **kwargs):
    """Align models in a PDB file or multiple structures in separate PDB files.

    By default, protein chains will be matched based on selected atoms and
    alignment will be performed based on matching residues.  If non-protein
    atoms are selected and selected atoms match in multiple structures,
    they will be used for alignment.

    When a single PDB is given, its coordinate sets are aligned and the
    structure is written to ``<prefix or title><suffix>.pdb``.  Otherwise the
    first PDB is the reference; every other PDB is aligned onto it and
    written to ``<title><suffix>.pdb``.

    :arg pdbs: PDB identifier(s) or filename(s)

    :arg select: atom selection string, default is :term:`calpha`,
        see :ref:`selections`

    :arg model: for NMR files, reference model index, default is ``1``

    :arg seqid: percent sequence identity, default is ``90``

    :arg overlap: percent sequence overlap, default is ``90``

    :arg prefix: prefix for output file, default is PDB filename

    :arg suffix: output filename suffix, default is :file:`_aligned`

    :arg subparser: optional object whose ``error`` method is called when, in
        single-PDB mode, the selection matches no atoms

    :raises ValueError: if the selection matches no atoms in the single PDB
        (and no *subparser* was given) or in the reference structure"""

    # aliased so that the builtin ``all`` is not shadowed
    from numpy import all as np_all
    from prody import LOGGER, writePDB, parsePDB
    from prody import alignCoordsets, printRMSD, matchAlign, superpose

    def _option(name, default):
        """Return ``kwargs[name]``, or *default* when missing or ``None``."""
        value = kwargs.get(name)
        return default if value is None else value

    selstr = kwargs.get('select', 'calpha')
    suffix = kwargs.get('suffix', '_aligned')
    if len(pdbs) == 1:
        pdb = pdbs[0]
        LOGGER.info('Aligning multiple models in: ' + pdb)
        prefix = kwargs.get('prefix')
        # Honour the documented default; without it ``model - 1`` below fails
        # with a TypeError when the caller does not pass *model*.
        model = _option('model', 1)
        pdb = parsePDB(pdb)
        pdbselect = pdb.select(selstr)
        if pdbselect is None:
            subparser = kwargs.get('subparser')
            if subparser:
                subparser.error('Selection {0} do not match any atoms.'
                                .format(repr(selstr)))
            else:
                raise ValueError('select does not match any atoms')
        LOGGER.info('{0} atoms will be used for alignment.'
                    .format(len(pdbselect)))
        pdbselect.setACSIndex(model - 1)
        printRMSD(pdbselect, msg='Before alignment ')
        alignCoordsets(pdbselect)
        printRMSD(pdbselect, msg='After alignment ')
        outfn = (prefix or pdb.getTitle()) + suffix + '.pdb'
        LOGGER.info('Writing file: ' + outfn)
        writePDB(outfn, pdb)
    else:
        pdbs = list(pdbs)
        reffn = pdbs.pop(0)
        # documented defaults, applied when the caller omits the keyword
        seqid = _option('seqid', 90)
        overlap = _option('overlap', 90)
        LOGGER.info('Aligning structures onto: ' + reffn)
        ref = parsePDB(reffn)
        ref_sel = ref.select(selstr)
        if not ref_sel:
            raise ValueError('selection {0} did not match any atoms'
                             .format(repr(selstr)))
        LOGGER.info('Selection {0} matched {1} atoms.'
                    .format(repr(selstr), len(ref_sel)))
        # chain matching is attempted only when ``numAtoms('ca')`` reports at
        # least two atoms for the reference selection
        match = ref_sel.numAtoms('ca') >= 2

        for arg in pdbs:
            if arg == reffn:
                continue
            LOGGER.info('Evaluating structure: ' + arg)
            pdb = parsePDB(arg)
            if match:
                result = matchAlign(pdb, ref, seqid=seqid, overlap=overlap,
                                    tarsel=selstr, allcsets=True,
                                    cslabel='Model', csincr=1)
                if result:
                    outfn = pdb.getTitle() + suffix + '.pdb'
                    LOGGER.info('Writing file: ' + outfn)
                    writePDB(outfn, pdb)
                    continue

            # Fallback: superpose directly on the selected atoms when both
            # selections have the same length and atom names.
            pdb_sel = pdb.select(selstr)
            if pdb_sel is None:
                # nothing selected in this structure, so it cannot be aligned
                LOGGER.warn('Failed to align structure ' + arg + '.')
                continue
            LOGGER.info('Selection {0} matched {1} atoms.'
                        .format(repr(selstr), len(pdb_sel)))
            if (len(pdb_sel) == len(ref_sel) and
                    np_all(pdb_sel.getNames() == ref_sel.getNames())):
                printRMSD(ref_sel, pdb_sel, msg='Before alignment ')
                superpose(pdb_sel, ref_sel)
                printRMSD(ref_sel, pdb_sel, msg='After alignment ')
                outfn = pdb.getTitle() + suffix + '.pdb'
                LOGGER.info('Writing file: ' + outfn)
                writePDB(outfn, pdb)
            else:
                LOGGER.warn('Failed to align structure ' + arg + '.')
def prune_pdb_models(pdb_models):
    '''
    This function takes a list of structural models corresponding to a single
    pdb ID (just isolated models). It prunes them to find representative
    models and eliminates redundant ones.

    Model files that do not exist are reported and passed to delete_model
    (defined elsewhere in this file). The parent directory of every
    representative model is then copied into a freshly created
    "representative_pdb_models" directory located two levels above the first
    representative model file; an existing directory of that name is removed
    first.

    Arguments:
    pdb_models -- full list of pdb models (iso)

    Returns:
    pruned_models -- list of pruned representative pdb models (paths inside
                     the representative_pdb_models directory)
    '''
    pruned_models = []

    # determine which files actually exist, delete parent dirs of those that don't
    iso_pdb_models = []
    for model in pdb_models:
        if not os.path.exists(model):
            # single-argument print: same output on Python 2 and Python 3
            print('%s does not exist! Deleting parent directory.'
                  % os.path.basename(model))
            delete_model(model)
        else:
            iso_pdb_models.append(model)

    # percent sequence overlap passed to matchAlign; the sequence identity
    # threshold is left at matchAlign's default
    rep_overlap_cutoff = 50
    # models whose RMSD is at or below this value are represented by a
    # single model (presumably in Angstroms)
    rep_rmsd_cutoff = 5

    # find representative iso models
    print('Finding representative PDB ISO models...')
    rep_iso_models = []
    for iso_model in iso_pdb_models:
        if not rep_iso_models:
            # the first model is a representative by definition
            rep_iso_models.append(iso_model)
            continue

        model = prody.parsePDB(iso_model)  # get structure
        redundant = False
        for rep_iso_model in rep_iso_models:
            rep = prody.parsePDB(rep_iso_model)  # get structure
            # calc RMSD between model and rep
            alignment = prody.matchAlign(model, rep,
                                         overlap=rep_overlap_cutoff)
            if alignment is None:
                continue
            rmsd = prody.calcRMSD(alignment[1], alignment[2])
            if rmsd <= rep_rmsd_cutoff:
                # we already have a representative for this segment
                redundant = True
                # take the larger structure as the representative
                if model.numResidues() > rep.numResidues():
                    # mutating the list is safe: the loop ends right below
                    rep_iso_models.remove(rep_iso_model)
                    rep_iso_models.append(iso_model)
                break

        # if the iso model does not match any of our representative models,
        # then add it to the representative models list
        if not redundant:
            rep_iso_models.append(iso_model)

    print('Found %d representative ISO models: %s'
          % (len(rep_iso_models),
             [os.path.basename(m) for m in rep_iso_models]))

    # move representative models to their own directory
    if rep_iso_models:
        pdb_dir = os.path.abspath(
            os.path.join(rep_iso_models[0], os.pardir, os.pardir))
        rep_model_dir = pdb_dir + '/representative_pdb_models/'
        if os.path.exists(rep_model_dir):
            shutil.rmtree(rep_model_dir)
        os.mkdir(rep_model_dir)
        for rep_iso_model in rep_iso_models:
            rep_iso_model_pardir = os.path.abspath(
                os.path.join(rep_iso_model, os.pardir))
            # rep_model_dir already ends with a separator
            new_dir = rep_model_dir + os.path.basename(rep_iso_model_pardir)
            shutil.copytree(rep_iso_model_pardir, new_dir)
            # define new pathname to keep track of the models once we move them
            pruned_models.append(
                new_dir + '/' + os.path.basename(rep_iso_model))

    # return all representative pdb models
    return pruned_models
# debug: uncomment to restrict the run to the first two pairs
#pairs = pairs[:2]

def select_chains(atoms, chains):
    """Select the protein atoms of *atoms* that belong to one of *chains*.

    *chains* is joined into a single string and every alphabetic character
    in it is used, upper-cased, as a chain identifier.  The result of
    ``atoms.select`` is returned as is."""
    letters = [ch for ch in ''.join(chains) if ch.isalpha()]
    clauses = ['chain ' + ch.upper() for ch in letters]
    return atoms.select('protein and (' + ' or '.join(clauses) + ')')


for p in pairs:
    bound_pdb, bound_chn, peptide_chn, unbound_pdb, unbound_chn = p

    # receptor plus peptide chains of the bound structure, receptor chains
    # of the unbound structure
    bound = select_chains(parsePDB(bound_pdb), bound_chn + peptide_chn)
    unbound = select_chains(parsePDB(unbound_pdb), unbound_chn)

    align_results = matchAlign(bound, unbound)
    if align_results:
        # first item of the matchAlign result replaces the bound selection
        bound = align_results[0]
        bound_r = select_chains(bound, bound_chn)
        unbound_r = select_chains(unbound, unbound_chn)
        peptide = select_chains(bound, peptide_chn)

        writePDB('bound/%s.%s.pdb' % (bound_pdb, bound_chn), bound_r)
        writePDB('%s.receptor.pdb' % unbound_pdb, unbound_r)
        writePDB('%s.peptide.pdb' % unbound_pdb, peptide)
    # pairs that cannot be aligned are skipped silently
def _mappedCaData(modelPDB, mapmodel, maptarget, label):
    """Return data *label* of the model C-alphas at the matched residues,
    indexed by the residue indices of the matched target atoms."""
    return pd.Series(modelPDB.ca.getData(label)[mapmodel.getResindices()],
                     index=maptarget.getResindices())


def _mappedSecstr(mapmodel, maptarget):
    """Return the secondary structure of the matched model atoms, indexed by
    the residue indices of the matched target atoms."""
    ss = pd.Series(mapmodel.getSecstrs(), index=maptarget.getResindices())
    ss[ss == ''] = '-'  # empty strings cause trouble in csv load/save
    return ss


def generateTrainingSet(inputdict, distance, output=None, combineOutput=True):
    """Build per-residue feature tables for model structures of each target.

    For every target, NetSurfP and PSIPRED are run once on the target
    structure and their data copied onto each model with copyDataFromTarget;
    STRIDE and DSSP are run on each model.  Each model is superimposed onto
    its target with ``prody.matchAlign`` and one DataFrame is produced per
    model, indexed by the residue indices of the matched target atoms.
    Models that cannot be parsed or cannot be matched to the target are
    reported and skipped.

    :arg inputdict: mapping of target PDB filename to an iterable of model
        PDB filenames

    :arg distance: threshold for the ``ClassLabel`` column, which is ``1``
        where the absolute value of ``prody.calcDistance`` between matched
        target and model atoms is below *distance* and ``0`` otherwise

    :arg output: if given, the combined table is written to this CSV file
        (only possible when *combineOutput* is true)

    :arg combineOutput: if true a single concatenated DataFrame is returned,
        otherwise a list with one DataFrame per model

    :raises subprocess.CalledProcessError: if one of the ``dssp``,
        ``stride``, ``netsurfp`` or ``runpsipred`` check commands fails

    :raises AssertionError: if *distance* is falsy (checked once per target)
    """
    # Check that the external tools can be invoked before doing any work.
    # The context manager closes the null device even when a check fails.
    toolChecks = ('dssp --version', 'stride -h', 'netsurfp -h', 'runpsipred')
    with open(os.devnull, 'w') as devnull:
        for command in toolChecks:
            subprocess.check_call(command, shell=True, stdout=devnull,
                                  stderr=devnull)

    # (output column, data label) pairs, in output column order
    netsurfpColumns = (
        ('NetSurfP_exp', 'netsurfp_exposure'),
        ('NetSurfP_asa', 'netsurfp_asa'),
        ('NetSurfP_rsa', 'netsurfp_rsa'),
        ('NetSurfP_alpha', 'netsurfp_alphascore'),
        ('NetSurfP_beta', 'netsurfp_betascore'),
        ('NetSurfP_coil', 'netsurfp_coilscore'),
    )
    psipredColumns = (
        ('PSIPRED_ss', 'psipred_ss'),
        ('PSIPRED_coilscore', 'psipred_coilscore'),
        ('PSIPRED_helixscore', 'psipred_helixscore'),
        ('PSIPRED_strandscore', 'psipred_strandscore'),
    )

    finalData = pd.DataFrame() if combineOutput else []

    for target, models in inputdict.items():
        targetPDB = prody.parsePDB(target)
        assert distance, "Distance is not valid"

        tempdir = tempfile.mkdtemp()
        try:
            # we don't want to run NetSurfP and PSIPRED over and over again
            # for all model structures. we compute them for target structure
            # and just reuse on model structures
            netsurfp.parseNetSurfP(
                netsurfp.execNetSurfP(target, outputdir=tempdir), targetPDB)
            psipred.parsePSIPRED(
                psipred.execPSIPRED(target, outputdir=tempdir), targetPDB)

            for modelFilename in models:
                datadict = {}
                modelPDB = prody.parsePDB(modelFilename)
                if not modelPDB:
                    print('Model file %s cannot be parsed, skipping...' %
                          modelFilename)
                    continue

                # if model has no chainID, let's assign one. That makes
                # STRIDE parser happy.  The size check keeps the test
                # well-defined for models with several chain identifiers,
                # where comparing the whole array raised a ValueError.
                chids = np.unique(modelPDB.getChids())
                if chids.size == 1 and chids[0] == ' ':
                    modelPDB.all.setChids('A')
                    modelFilename = os.path.join(
                        tempdir, os.path.basename(modelFilename))
                    modelFilename = prody.writePDB(modelFilename, modelPDB,
                                                   autoext=False)

                # superimpose model onto target structure
                match = prody.matchAlign(modelPDB, targetPDB, tarsel='calpha',
                                         seqid=50, overlap=20)
                if not match:
                    # no matching chains: nothing can be mapped to the target
                    print('Model file %s cannot be aligned onto target %s, '
                          'skipping...' % (modelFilename, target))
                    continue
                mapmodel = match[1]
                maptarget = match[2]

                # and copy NetSurfP and PSIPRED data from target to model
                copyDataFromTarget(targetPDB, modelPDB)

                # run STRIDE; its secondary structure must be read before
                # DSSP is parsed into the same structure below
                prody.parseSTRIDE(
                    prody.execSTRIDE(modelFilename, outputdir=tempdir),
                    modelPDB)
                datadict['STRIDEarea'] = _mappedCaData(
                    modelPDB, mapmodel, maptarget, 'stride_area')
                datadict['STRIDEss'] = _mappedSecstr(mapmodel, maptarget)

                # run DSSP
                prody.parseDSSP(
                    prody.execDSSP(modelFilename, outputdir=tempdir),
                    modelPDB)
                datadict['DSSPacc'] = _mappedCaData(
                    modelPDB, mapmodel, maptarget, 'dssp_acc')
                datadict['DSSPss'] = _mappedSecstr(mapmodel, maptarget)

                # save NetSurfP and PSIPRED data
                for column, label in netsurfpColumns + psipredColumns:
                    datadict[column] = _mappedCaData(
                        modelPDB, mapmodel, maptarget, label)

                # Compute class labels based on the distance argument
                datadict['ClassLabel'] = pd.Series(
                    (np.abs(prody.calcDistance(maptarget.copy(),
                                               mapmodel.copy()))
                     < distance).astype(int),
                    index=maptarget.getResindices())

                if combineOutput:
                    finalData = pd.concat([finalData,
                                           pd.DataFrame(datadict)])
                else:
                    finalData.append(pd.DataFrame(datadict))
        finally:
            # remove temporary directory, also when a tool or parser fails
            shutil.rmtree(tempdir, ignore_errors=True)

    if output:
        if combineOutput:
            finalData.to_csv(output, index=False,
                             quoting=csv.QUOTE_NONNUMERIC)
        else:
            print('Warning! Output must be combined to be saved into a CSV '
                  'file')

    return finalData
def find_rep_gene_iso_models(hi_res_iso_models, lo_res_iso_models,
                             rep_rmsd_cutoff):
    '''
    Function to pick representative iso gene models from a pool of
    representative pdb iso models. Uses a greedy algorithm to cover as much
    of the gene sequence as possible using first high resolution models and
    then filling any gaps with low resolution models.

    A model becomes a representative only if its sequence coverage (as
    returned by get_seq_range, defined elsewhere in this file) has at least
    min_length entries and one of the following holds: no representative has
    been chosen yet, it adds at least num_new_residue_cutoff residues not
    covered so far, or no existing representative aligns to it with an RMSD
    at or below rep_rmsd_cutoff.

    Arguments:
    hi_res_iso_models -- high resolution model files, considered first
    lo_res_iso_models -- low resolution model files, considered afterwards
    rep_rmsd_cutoff -- RMSD at or below which two models are considered to
                       have the same conformation

    Returns:
    rep_gene_iso_models -- list of representative model files, in the order
                           in which they were selected
    '''
    rep_gene_iso_models = []
    min_length = 20
    num_new_residue_cutoff = 10
    rep_overlap_cutoff = 10  # percent overlap passed to matchAlign

    # tag each model with its sequence coverage
    hi_res_iso_models = [[m, get_seq_range(m)] for m in hi_res_iso_models]
    lo_res_iso_models = [[m, get_seq_range(m)] for m in lo_res_iso_models]

    # sort each pool by length of sequence coverage, longest first; the sort
    # is stable, so models with equal coverage keep their input order
    def coverage_length(tagged_model):
        return len(tagged_model[1])

    sorted_iso_models = (
        sorted(hi_res_iso_models, key=coverage_length, reverse=True) +
        sorted(lo_res_iso_models, key=coverage_length, reverse=True))

    # use greedy algorithm to try to cover full gene sequence;
    # start with large hi res models, end with small lo res models
    gene_coverage = set()
    for model_file, model_coverage in sorted_iso_models:
        # discard structures that have too few residues
        if len(model_coverage) < min_length:
            continue

        num_new_residues = (len(model_coverage) -
                            len(set(model_coverage) & gene_coverage))

        # the first acceptable model, or any model that contributes enough
        # new residues, is a representative
        if (not rep_gene_iso_models or
                num_new_residues >= num_new_residue_cutoff):
            rep_gene_iso_models.append(model_file)
            gene_coverage.update(model_coverage)
            continue

        # otherwise check if it has a unique conformation
        model_struct = prody.parsePDB(model_file)
        redundant = False
        for rep_gene_iso_model in rep_gene_iso_models:
            rep_struct = prody.parsePDB(rep_gene_iso_model)  # get structure
            # calc RMSD between model and rep
            alignment = prody.matchAlign(model_struct, rep_struct,
                                         overlap=rep_overlap_cutoff)
            if alignment is None:
                continue
            rmsd = prody.calcRMSD(alignment[1], alignment[2])
            if rmsd <= rep_rmsd_cutoff:
                # we already have a representative for this segment
                redundant = True
                break

        # if the model does not match any of our representative models,
        # then it is unique - add it to the representative models list
        if not redundant:
            rep_gene_iso_models.append(model_file)
            gene_coverage.update(model_coverage)

    return rep_gene_iso_models
def prody_align(*pdbs, **kwargs):
    """Align models in a PDB file or multiple structures in separate PDB files.

    By default, protein chains will be matched based on selected atoms and
    alignment will be performed based on matching residues.  If non-protein
    atoms are selected and selected atoms match in multiple structures,
    they will be used for alignment.

    When a single PDB is given, its coordinate sets are aligned and the
    structure is written to ``<prefix or title><suffix>.pdb``.  Otherwise the
    first PDB is the reference; every other PDB is aligned onto it and
    written to ``<title><suffix>.pdb``.

    :arg pdbs: PDB identifier(s) or filename(s)

    :arg select: atom selection string, default is :term:`calpha`,
        see :ref:`selections`

    :arg model: for NMR files, reference model index, default is ``1``

    :arg seqid: percent sequence identity, default is ``90``

    :arg overlap: percent sequence overlap, default is ``90``

    :arg prefix: prefix for output file, default is PDB filename

    :arg suffix: output filename suffix, default is :file:`_aligned`

    :arg subparser: optional object whose ``error`` method is called when, in
        single-PDB mode, the selection matches no atoms

    :raises ValueError: if the selection matches no atoms in the single PDB
        (and no *subparser* was given) or in the reference structure"""

    # aliased so that the builtin ``all`` is not shadowed
    from numpy import all as np_all
    from prody import LOGGER, writePDB, parsePDB
    from prody import alignCoordsets, printRMSD, matchAlign, superpose

    def _option(name, default):
        """Return ``kwargs[name]``, or *default* when missing or ``None``."""
        value = kwargs.get(name)
        return default if value is None else value

    selstr = kwargs.get('select', 'calpha')
    suffix = kwargs.get('suffix', '_aligned')
    if len(pdbs) == 1:
        pdb = pdbs[0]
        LOGGER.info('Aligning multiple models in: ' + pdb)
        prefix = kwargs.get('prefix')
        # Honour the documented default; without it ``model - 1`` below fails
        # with a TypeError when the caller does not pass *model*.
        model = _option('model', 1)
        pdb = parsePDB(pdb)
        pdbselect = pdb.select(selstr)
        if pdbselect is None:
            subparser = kwargs.get('subparser')
            if subparser:
                subparser.error('Selection {0} do not match any atoms.'
                                .format(repr(selstr)))
            else:
                raise ValueError('select does not match any atoms')
        LOGGER.info('{0} atoms will be used for alignment.'
                    .format(len(pdbselect)))
        pdbselect.setACSIndex(model - 1)
        printRMSD(pdbselect, msg='Before alignment ')
        alignCoordsets(pdbselect)
        printRMSD(pdbselect, msg='After alignment ')
        outfn = (prefix or pdb.getTitle()) + suffix + '.pdb'
        LOGGER.info('Writing file: ' + outfn)
        writePDB(outfn, pdb)
    else:
        pdbs = list(pdbs)
        reffn = pdbs.pop(0)
        # documented defaults, applied when the caller omits the keyword
        seqid = _option('seqid', 90)
        overlap = _option('overlap', 90)
        LOGGER.info('Aligning structures onto: ' + reffn)
        ref = parsePDB(reffn)
        ref_sel = ref.select(selstr)
        if not ref_sel:
            raise ValueError('selection {0} did not match any atoms'
                             .format(repr(selstr)))
        LOGGER.info('Selection {0} matched {1} atoms.'
                    .format(repr(selstr), len(ref_sel)))
        # chain matching is attempted only when ``numAtoms('ca')`` reports at
        # least two atoms for the reference selection
        match = ref_sel.numAtoms('ca') >= 2

        for arg in pdbs:
            if arg == reffn:
                continue
            LOGGER.info('Evaluating structure: ' + arg)
            pdb = parsePDB(arg)
            if match:
                result = matchAlign(pdb, ref, seqid=seqid, overlap=overlap,
                                    tarsel=selstr, allcsets=True,
                                    cslabel='Model', csincr=1)
                if result:
                    outfn = pdb.getTitle() + suffix + '.pdb'
                    LOGGER.info('Writing file: ' + outfn)
                    writePDB(outfn, pdb)
                    continue

            # Fallback: superpose directly on the selected atoms when both
            # selections have the same length and atom names.
            pdb_sel = pdb.select(selstr)
            if pdb_sel is None:
                # nothing selected in this structure, so it cannot be aligned
                LOGGER.warn('Failed to align structure ' + arg + '.')
                continue
            LOGGER.info('Selection {0} matched {1} atoms.'
                        .format(repr(selstr), len(pdb_sel)))
            if (len(pdb_sel) == len(ref_sel) and
                    np_all(pdb_sel.getNames() == ref_sel.getNames())):
                printRMSD(ref_sel, pdb_sel, msg='Before alignment ')
                superpose(pdb_sel, ref_sel)
                printRMSD(ref_sel, pdb_sel, msg='After alignment ')
                outfn = pdb.getTitle() + suffix + '.pdb'
                LOGGER.info('Writing file: ' + outfn)
                writePDB(outfn, pdb)
            else:
                LOGGER.warn('Failed to align structure ' + arg + '.')