def _superpose(self, **kwargs):
    """Superpose conformations and update coordinates."""

    indices = self._indices
    weights = self._weights
    mobs = self._confs
    if indices is None:
        idx = False
        tar = self._coords
        movs = None
    else:
        idx = True
        if self._weights is not None:
            weights = weights[indices]
        tar = self._coords[indices]
        movs = self._confs

    linalg = importLA()
    svd = linalg.svd
    det = linalg.det

    if weights is None:
        tar_com = tar.mean(0)
        tar_org = (tar - tar_com)
        mob_org = zeros(tar_org.shape, dtype=mobs.dtype)
        tar_org = tar_org.T
    else:
        weights_sum = weights.sum()
        weights_dot = dot(weights.T, weights)
        tar_com = (tar * weights).sum(axis=0) / weights_sum
        tar_org = (tar - tar_com)
        mob_org = zeros(tar_org.shape, dtype=mobs.dtype)

    LOGGER.progress('Superposing ', len(mobs), '_prody_ensemble')
    for i, mob in enumerate(mobs):
        if idx:
            mob = mob[indices]
        if weights is None:
            mob_com = mob.mean(0)
            matrix = dot(tar_org, subtract(mob, mob_com, mob_org))
        else:
            mob_com = (mob * weights).sum(axis=0) / weights_sum
            subtract(mob, mob_com, mob_org)
            matrix = dot((tar_org * weights).T,
                         (mob_org * weights)) / weights_dot

        # Kabsch algorithm: SVD of the correlation matrix gives the optimal
        # rotation; sign(det) guards against improper rotations (reflections)
        U, s, Vh = svd(matrix)
        Id = array([[1, 0, 0], [0, 1, 0], [0, 0, sign(det(matrix))]])
        rotation = dot(Vh.T, dot(Id, U.T))

        if movs is None:
            mobs[i] = dot(mob_org, rotation)
            add(mobs[i], tar_com, mobs[i])
        else:
            add(dot(movs[i], rotation),
                (tar_com - dot(mob_com, rotation)), movs[i])
        LOGGER.update(i, '_prody_ensemble')
    LOGGER.clear()

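# A minimal usage sketch of the superposition this private method performs,
# via the public ProDy API (assumption: prody is installed and '2k39' is any
# multi-model PDB entry). Ensemble.superpose() dispatches to _superpose():
#
#     from prody import parsePDB, Ensemble
#
#     ubi = parsePDB('2k39', subset='calpha')   # NMR ensemble, CA atoms only
#     ens = Ensemble('ubi NMR models')
#     ens.setCoords(ubi.getCoords())            # reference coordinates
#     ens.addCoordset(ubi.getCoordsets())       # mobile conformations
#     ens.superpose()                           # Kabsch SVD fit, as above
#     print(ens.getRMSDs())
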
def searchDali(pdbId, chainId, daliURL=None, subset='fullPDB', **kwargs):
    """Search Dali server with input of PDB ID and chain ID.
    Dali server: http://ekhidna2.biocenter.helsinki.fi/dali/

    :arg pdbId: PDB code of the protein to be searched
    :arg chainId: chain identifier

    :arg subset: fullPDB, PDB25, PDB50, PDB90
    :type subset: str
    """

    LOGGER.timeit('_dali')
    timeout = kwargs.pop('timeout', 120)
    if daliURL is None:
        daliURL = "http://ekhidna2.biocenter.helsinki.fi/cgi-bin/sans/dump.cgi"
    pdbId = pdbId.lower()
    pdb_chain = pdbId + chainId
    parameters = {'cd1': pdb_chain, 'method': 'search',
                  'title': 'Title_' + pdb_chain, 'address': ''}
    enc_params = urllib.urlencode(parameters).encode('utf-8')
    request = urllib2.Request(daliURL, enc_params)
    try_error = 3
    while try_error >= 0:
        try:
            url = urllib2.urlopen(request).url
            break
        except:
            try_error -= 1
            if try_error >= 0:
                LOGGER.sleep(2, '. Connection error happened. '
                                'Trying to reconnect...')
                continue
            else:
                url = urllib2.urlopen(request).url
                break
    if url.split('.')[-1].lower() in ['html', 'php']:
        url = url.replace(url.split('/')[-1], '')
    LOGGER.debug('Submitted Dali search for PDB and chain "{0} and {1}".'
                 .format(pdbId, chainId))
    LOGGER.info(url)
    LOGGER.clear()
    obj = DaliRecord(url, pdbId, chainId, subset=subset, timeout=timeout,
                     **kwargs)
    #if obj.isSuccess:
    return obj

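# Usage sketch for this (urllib2-era) interface, assuming the module-level
# names urllib, urllib2, LOGGER and DaliRecord are available as in ProDy:
#
#     dali_rec = searchDali('1p38', 'A', subset='PDB25')
#     # the returned DaliRecord fetches and parses the hit list when ready
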
def fetchPDBClusters(sqid=None):
    """Retrieve PDB sequence clusters.  PDB sequence clusters are results of
    the weekly clustering of protein chains in the PDB generated by blastclust.
    They are available at FTP site: ftp://resources.rcsb.org/sequence/clusters/

    This function will download about 10 Mb of data and save it after
    compressing in your home directory in :file:`.prody/pdbclusters`.
    Compressed files will be less than 4 Mb in size.  Cluster data can
    be loaded using :func:`loadPDBClusters` function and be accessed
    using :func:`listPDBCluster`."""

    if sqid is not None:
        if sqid not in PDB_CLUSTERS:
            raise ValueError('sqid must be one of ' + PDB_CLUSTERS_SQID_STR)
        keys = [sqid]
    else:
        keys = list(PDB_CLUSTERS)

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)
    LOGGER.progress('Downloading sequence clusters', len(PDB_CLUSTERS),
                    '_prody_fetchPDBClusters')
    count = 0
    for i, x in enumerate(keys):
        filename = 'bc-{0}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = openURL(url)
        except IOError:
            LOGGER.warning('Clusters at {0}% sequence identity level could '
                           'not be downloaded.'.format(x))
            continue
        else:
            out = openFile(filename + '.gz', 'w', folder=PDB_CLUSTERS_PATH)
            out.write(inp.read())
            inp.close()
            out.close()
            count += 1
        LOGGER.update(i, '_prody_fetchPDBClusters')
    LOGGER.clear()
    if len(PDB_CLUSTERS) == count:
        LOGGER.info('All PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warn('PDB clusters could not be downloaded.')

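# Usage sketch (public ProDy functions referenced in the docstring above):
#
#     from prody import fetchPDBClusters, loadPDBClusters
#
#     fetchPDBClusters(90)            # fetch only the 90% identity clusters
#     clusters = loadPDBClusters(90)  # then load them for lookups
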
def fetch(self, url=None, localFile=False, **kwargs):
    if localFile:
        emsurfer_file = open(url, 'r')
        data = emsurfer_file.read()
        emsurfer_file.close()
    else:
        import requests

        if url is None:
            url = self._url

        html = requests.get(url).content
        if PY3K:
            html = html.decode()
        LOGGER.clear()
        LOGGER.report('Emsurfer results were fetched in %.1fs.', '_emsurfer')
        data = html

    data = data.strip().split('\n')
    data_list = []
    for line in data[3:-2]:
        data_list.append(tuple(line.split('\t')))

    # columns: Rank  EMDB_ID  EUC_D  RESOLUTION
    emsurferInfo = np.array(data_list, dtype=[('Rank', '<i4'),
                                              ('EMDB_ID', '<U70'),
                                              ('EUC_D', '<f4'),
                                              ('RESOLUTION', '<f4')])
    emdListAll = []
    self._emsurferInfo = emsurferInfo
    emsurfer_temp_dict = dict()
    for temp in self._emsurferInfo:
        temp_dict = dict()
        temp_dict['Rank'] = temp[0]
        temp_dict['EMDB_ID'] = emdbId = temp[1]
        temp_dict['EUC_D'] = temp[2]
        temp_dict['RESOLUTION'] = temp[3]
        emsurfer_temp_dict[emdbId] = temp_dict
        emdListAll.append(emdbId)
    self._emdListAll = tuple(emdListAll)
    self._emdList = self._emdListAll
    self._alignEMD = emsurfer_temp_dict
    LOGGER.info('Obtained ' + str(len(emdListAll)) +
                ' EMD matches from Emsurfer for ' + self._emdId + '.')
    return True

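# Usage sketch, assuming this method belongs to an Emsurfer record class that
# is constructed with a results URL and an EMDB identifier (the record object
# `rec` and its attributes here are illustrative, not confirmed API):
#
#     rec.fetch()                 # parse the tab-separated hit table
#     print(rec._emdList[:5])     # top-ranked EMDB matches
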
def calcMSF(coordsets):
    """Calculate mean square fluctuation(s) (MSF)."""

    try:
        ncsets = coordsets.numFrames()
    except AttributeError:
        try:
            coordsets = coordsets.getCoordsets()
        except AttributeError:
            pass
        try:
            ndim, shape = coordsets.ndim, coordsets.shape
        except:
            raise TypeError('coordsets must be a Numpy array or a ProDy '
                            'object with `getCoordsets` method')
        if ndim != 3 or shape[0] == 1:
            raise ValueError('coordsets must contain multiple sets')
        msf = var(coordsets, 0).sum(1)
    else:
        nfi = coordsets.nextIndex()
        natoms = coordsets.numSelected()
        total = zeros((natoms, 3))
        sqsum = zeros((natoms, 3))

        LOGGER.progress('Evaluating {0} frames from {1}:'
                        .format(ncsets, str(coordsets)),
                        ncsets, '_prody_calcMSF')
        ncsets = 0
        coordsets.reset()
        for frame in coordsets:
            frame.superpose()
            coords = frame._getCoords()
            total += coords
            sqsum += coords ** 2
            ncsets += 1
            LOGGER.update(ncsets, '_prody_calcMSF')
        msf = (sqsum/ncsets - (total/ncsets)**2).sum(1)
        LOGGER.clear()
        coordsets.goto(nfi)
    return msf

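# Usage sketch (calcMSF is a public ProDy function; an Ensemble, a trajectory,
# or a coordinate array with multiple sets all work):
#
#     from prody import parsePDB, calcMSF
#
#     coordsets = parsePDB('2k39', subset='calpha').getCoordsets()
#     msf = calcMSF(coordsets)    # one value per atom
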
def blastPDBUniProtKB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching of ProteinDataBank database *sequence* using NCBI blastp.

    :arg sequence: single-letter code amino acid sequence of the protein
        without any gap characters, all white spaces will be removed
    :type sequence: str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-4``)
    search parameters can be adjusted by the user. *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results. Sleep time is multiplied by 1.5 when results are not ready.
    *timeout* (default is 120 s) determines when to give up waiting for the
    results. *num_sequences* (default is ``1``)
    """

    num_sequences = int(kwargs.pop('num_sequences', 1))
    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')
    else:
        if num_sequences == 1:
            try:
                sequence = ''.join(sequence.split())
                _ = sequence.isalpha()
            except AttributeError:
                raise TypeError('sequence must be a string')
            else:
                if not _:
                    raise ValueError('not a valid protein sequence')

    headers = {'User-agent': 'ProDy'}
    query = [('DATABASE', 'swissprot'), ('ENTREZ_QUERY', '(none)'),
             ('PROGRAM', 'blastp'),]

    expect = float(kwargs.pop('expect', 1e-4))
    if expect <= 0:
        raise ValueError('expect must be a positive number')
    query.append(('EXPECT', expect))
    hitlist_size = int(kwargs.pop('hitlist_size', 250))
    if hitlist_size <= 0:
        raise ValueError('hitlist_size must be a positive integer')
    psiblast = 'true'
    step_number = 3
    query.append(('RUN_PSIBLAST', psiblast))
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))
    query.append(('STEP_NUMBER', step_number))

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    if kwargs:
        LOGGER.warn('Keyword argument(s) {0} are not used.'
                    .format(', '.join([repr(key) for key in kwargs])))

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

    data = urlencode(query)
    LOGGER.timeit('_prody_blast')
    LOGGER.info('Blast searching NCBI swissprot database for "{0}..."'
                .format(sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b'name="RID" type="hidden" value="')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find(b'>', index)
        rid = html[index + len('name="RID" type="hidden" value="'):last-1].strip()

    index = html.find(b'name="RTOE" type="hidden" value="')
    if index == -1:
        rtoe = None  # This is not used
    else:
        last = html.find(b'>', index)
        rtoe = html[index + len('name="RTOE" type="hidden" value="'):last-1].strip()

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), 'to reconnect to NCBI for search results.')
        LOGGER.write('Connecting to NCBI for search results...')
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b'Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b'\n', index)
        status = results[index+len('Status='):last].strip()
        if status.upper() == b'READY':
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_blast') > timeout:
            LOGGER.warn('Blast search time out.')
            return None

    LOGGER.clear()
    LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')

    try:
        ext_xml = filename.lower().endswith('.xml')
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += '.xml'
        out = open(filename, 'wb')
        out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

    return SwissProtBlastRecord(results, sequence)

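# Usage sketch (assumption: SwissProtBlastRecord exposes hit accessors similar
# to PDBBlastRecord; 'runexample' triggers the built-in demo sequence):
#
#     record = blastPDBUniProtKB('runexample', timeout=240)
#     if record is not None:
#         print(type(record))     # SwissProtBlastRecord on success
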
def searchDali(pdb, chain=None, subset='fullPDB', daliURL=None, **kwargs):
    """Search Dali server with input of PDB ID (or local PDB file) and
    chain ID.  Dali server: http://ekhidna2.biocenter.helsinki.fi/dali/

    :arg pdb: PDB code or local PDB file for the protein to be searched

    :arg chain: chain identifier (only one chain can be assigned for PDB)
    :type chain: str

    :arg subset: fullPDB, PDB25, PDB50, PDB90
    :type subset: str
    """

    import requests

    LOGGER.timeit('_dali')
    timeout = kwargs.pop('timeout', 120)

    if daliURL is None:
        daliURL = "http://ekhidna2.biocenter.helsinki.fi/cgi-bin/sans/dump.cgi"

    if isinstance(pdb, Atomic):
        atoms = pdb
        chain_set = set(atoms.getChids())
        if chain and chain not in chain_set:
            raise ValueError('input structure (%s) does not have chain %s'
                             % (atoms.getTitle(), chain))
        if len(chain_set) > 1:
            if not chain:
                raise TypeError('the structure (%s) contains more than one '
                                'chain, therefore a chain identifier needs '
                                'to be specified' % pdb.getTitle())
            atoms = atoms.select('chain ' + chain)
        else:
            chain = chain_set.pop()
        stream = createStringIO()
        writePDBStream(stream, atoms)
        data = stream.getvalue()
        stream.close()
        files = {"file1": data}
        pdbId = atoms.getTitle()
        pdb_chain = ''
        dali_title = 'Title_' + pdbId + chain
    elif isinstance(pdb, str):
        if os.path.isfile(pdb):
            atoms = parsePDB(pdb)
            chain_set = set(atoms.getChids())
            filename = os.path.basename(pdb)
            filename, ext = os.path.splitext(filename)
            if ext.lower() == '.gz':
                filename2, ext2 = os.path.splitext(filename)
                if ext2.lower() == '.pdb':
                    filename = filename2
            pdbId = filename
            if chain and chain not in chain_set:
                raise ValueError('input PDB file does not have chain ' + chain)
            if len(chain_set) > 1:
                if not chain:
                    raise TypeError('PDB file (%s) contains more than one '
                                    'chain, therefore a chain identifier '
                                    'needs to be specified' % pdb)
                atoms = atoms.select('chain ' + chain)
                stream = createStringIO()
                writePDBStream(stream, atoms)
                data = stream.getvalue()
                stream.close()
            else:
                data = open(pdb, "rb")
                chain = chain_set.pop()
            files = {"file1": data}
            # case: multiple chains. apply fetch ? multiple times?
            pdb_chain = ''
            dali_title = 'Title_' + pdbId + chain
        else:
            pdbId, ch = _getPDBid(pdb)
            if not chain:
                chain = ch
            if not chain:
                raise TypeError('a chain identifier is needed for the search')
            pdb_chain = pdbId + chain
            dali_title = 'Title_' + pdb_chain
            files = ''

    parameters = {'cd1': pdb_chain, 'method': 'search',
                  'title': dali_title, 'address': ''}
    request = requests.post(daliURL, parameters, files=files)
    try_error = 3
    while try_error >= 0:
        try:
            url = request.url
            break
        except:
            try_error -= 1
            if try_error >= 0:
                LOGGER.sleep(2, '. Connection error happened. '
                                'Trying to reconnect...')
                continue
            else:
                url = request.url
                break

    if url.split('.')[-1].lower() in ['html', 'php']:
        url = url.replace(url.split('/')[-1], '')

    LOGGER.debug('Submitted Dali search for PDB "{0}{1}".'.format(pdbId, chain))
    LOGGER.info(url)
    LOGGER.clear()

    return DaliRecord(url, pdbId, chain, subset=subset, timeout=timeout,
                      **kwargs)

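# Usage sketch for the three accepted inputs (public ProDy API; the local
# file name is a placeholder):
#
#     from prody import parsePDB
#
#     rec1 = searchDali('1p38', 'A')               # PDB code + chain
#     rec2 = searchDali('my_model.pdb', 'A')       # local PDB file path
#     rec3 = searchDali(parsePDB('1p38'), 'A')     # Atomic instance
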
def fetch(self, url=None, localFile=False, **kwargs):
    """Get Dali record from url or file.

    :arg url: url of Dali results page or local dali results file.
        If **None**, then the url already associated with the DaliRecord
        object is used.
    :type url: str

    :arg localFile: whether provided url is a path for a local dali
        results file
    :type localFile: bool

    :arg timeout: amount of time until the query times out in seconds,
        default value is 120
    :type timeout: int

    :arg localfolder: folder in which to find the local file,
        default is the current folder
    :type localfolder: str
    """
    if localFile:
        dali_file = open(url, 'r')
        data = dali_file.read()
        dali_file.close()
    else:
        import requests

        if url is None:
            url = self._url

        sleep = 2
        timeout = kwargs.pop('timeout', 120)
        LOGGER.timeit('_dali')
        log_message = ''
        try_error = 3

        while True:
            LOGGER.write('Connecting to Dali for search results...')
            LOGGER.clear()
            try:
                html = requests.get(url).content
            except:
                try_error -= 1
                if try_error >= 0:
                    LOGGER.sleep(2, '. Connection error happened. '
                                    'Trying to reconnect...')
                    continue
                else:
                    html = requests.get(url).content
            if PY3K:
                html = html.decode()
            if html.find('Status: Queued') > -1:
                log_message = '(Dali search is queued)...'
            elif html.find('Status: Running') > -1:
                log_message = '(Dali search is running)...'
            elif html.find('Your job') == -1 and html.find('.txt') > -1:
                break
            elif html.find('ERROR:') > -1:
                LOGGER.warn(': Dali search reported an ERROR!')
                return False
            sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)
            if LOGGER.timing('_dali') > timeout:
                LOGGER.warn(': Dali search has timed out. \nThe results can '
                            'be obtained later using the fetch() method.')
                return False
            LOGGER.sleep(int(sleep), 'to reconnect to Dali ' + log_message)
            LOGGER.clear()

        LOGGER.clear()
        LOGGER.report('Dali results were fetched in %.1fs.', '_dali')

        lines = html.strip().split('\n')
        file_name = re.search(r'=.+-90\.txt', html).group()[1:]
        file_name = file_name[:-7]
        data = requests.get(url + file_name + self._subset + '.txt').content
        if PY3K:
            data = data.decode()

        localfolder = kwargs.pop('localfolder', '.')
        if file_name.lower().startswith('s001'):
            temp_name = self._pdbId + self._chain
        else:
            temp_name = file_name
        temp_name += self._subset + '_dali.txt'
        if localfolder != '.' and not os.path.exists(localfolder):
            os.mkdir(localfolder)
        with open(localfolder + os.sep + temp_name, "w") as file_temp:
            file_temp.write(html + '\n' + url + file_name + self._subset +
                            '.txt' + '\n' + data)

    data_list = data.strip().split('# ')
    # No: Chain Z rmsd lali nres %id PDB Description -> data_list[3]
    # Structural equivalences                        -> data_list[4]
    # Translation-rotation matrices                  -> data_list[5]
    map_temp_dict = dict()
    lines = data_list[4].strip().split('\n')
    self._lines_4 = lines
    mapping_temp = np.genfromtxt(
        lines[1:],
        delimiter=(4, 1, 14, 6, 2, 4, 4, 5, 2, 4, 4, 3, 5, 4, 3, 5, 6, 3,
                   5, 4, 3, 5, 28),
        usecols=[0, 3, 5, 7, 9, 12, 15, 15, 18, 21], dtype='|i4')
    # usecols -> [index, residue_a, residue_b, residue_i_a, residue_i_b,
    #             resid_a, resid_b, resid_i_a, resid_i_b]
    for map_i in mapping_temp:
        if not map_i[0] in map_temp_dict:
            map_temp_dict[map_i[0]] = [[map_i[1], map_i[2], map_i[3], map_i[4]]]
        else:
            map_temp_dict[map_i[0]].append([map_i[1], map_i[2],
                                            map_i[3], map_i[4]])
    self._max_index = max(mapping_temp[:, 2])
    self._mapping = map_temp_dict
    self._data = data_list[3]
    lines = data_list[3].strip().split('\n')
    # U6 and U70 dtypes make np.genfromtxt return unicode strings in daliInfo
    daliInfo = np.genfromtxt(lines[1:], delimiter=(4, 3, 6, 5, 5, 5, 6, 5, 57),
                             usecols=[0, 2, 3, 4, 5, 6, 7, 8],
                             dtype=[('id', '<i4'), ('pdb_chain', '|U6'),
                                    ('Z', '<f4'), ('rmsd', '<f4'),
                                    ('len_align', '<i4'), ('nres', '<i4'),
                                    ('identity', '<i4'), ('title', '|U70')])
    if daliInfo.ndim == 0:
        daliInfo = np.array([daliInfo])
    pdbListAll = []
    self._daliInfo = daliInfo
    dali_temp_dict = dict()
    for temp in self._daliInfo:
        temp_dict = dict()
        pdb_chain = str(temp[1].strip()[0:6])
        temp_dict['pdbId'] = pdbid = pdb_chain[0:4].lower()
        temp_dict['chainId'] = chid = pdb_chain[5:6]
        temp_dict['pdb_chain'] = pdb_chain = pdbid + chid
        temp_dict['Z'] = temp[2]
        temp_dict['rmsd'] = temp[3]
        temp_dict['len_align'] = temp[4]
        temp_dict['nres'] = temp[5]
        temp_dict['identity'] = temp[6]
        temp_dict['mapping'] = (np.array(map_temp_dict[temp[0]]) - 1).tolist()
        temp_dict['map_ref'] = [x for map_i in temp_dict['mapping']
                                for x in range(map_i[0], map_i[1] + 1)]
        temp_dict['map_sel'] = [x for map_i in temp_dict['mapping']
                                for x in range(map_i[2], map_i[3] + 1)]
        dali_temp_dict[pdb_chain] = temp_dict
        pdbListAll.append(pdb_chain)
    self._pdbListAll = tuple(pdbListAll)
    self._pdbList = self._pdbListAll
    self._alignPDB = dali_temp_dict
    LOGGER.info('Obtained ' + str(len(pdbListAll)) +
                ' PDB chains from Dali for ' + self._pdbId + self._chain + '.')
    return True

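# Usage sketch: if the search timed out, fetch() can be retried on the same
# record (isSuccess is referenced elsewhere in this source; treat the exact
# polling pattern as an assumption):
#
#     dali_rec = searchDali('1p38', 'A')
#     if not dali_rec.isSuccess:
#         dali_rec.fetch(timeout=300)   # returns True once results are parsed
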
def searchDali(pdb, chainId, isLocal=False, subset='fullPDB', daliURL=None,
               **kwargs):
    """Search Dali server with input of PDB ID (or local PDB file) and
    chain ID.  Dali server: http://ekhidna2.biocenter.helsinki.fi/dali/

    :arg pdb: PDB code or local PDB file for the searched protein
    :arg chainId: chain identifier (only one chain can be assigned for PDB)
    :arg isLocal: submit a local PDB file instead of a PDB code when **True**
    :arg subset: fullPDB, PDB25, PDB50, PDB90
    :type subset: str
    """

    import requests

    LOGGER.timeit('_dali')
    timeout = kwargs.pop('timeout', 120)
    if daliURL is None:
        daliURL = "http://ekhidna2.biocenter.helsinki.fi/cgi-bin/sans/dump.cgi"
    if len(chainId) != 1:
        raise ValueError('input PDB chain identifier ' + chainId +
                         ' is invalid')
    if isLocal:
        if not os.path.isfile(pdb):
            raise ValueError('input PDB file ' + pdb + ' does not exist')
        atom = parsePDB(pdb)
        chain_set = set(atom.getChids())
        pdbId = '.'.join(pdb.split(os.sep)[-1].split('.')[0:-1])
        if chainId not in chain_set:
            raise ValueError('input PDB file does not have chain ' + chainId)
        elif len(chain_set) > 1:
            atom = atom.select('chain ' + chainId)
            local_temp_pdb = 's001' + chainId + '.pdb'
            writePDB(local_temp_pdb, atom)
        else:
            local_temp_pdb = pdb
        files = {"file1": open(local_temp_pdb, "rb")}
        # case: multiple chains. apply getRecord ? multiple times?
        pdb_chain = ''
        dali_title = 'Title_' + pdbId + chainId
    else:
        pdbId = pdb.lower()
        if len(pdbId) != 4:
            raise ValueError('input PDB code ' + pdb + ' is invalid')
        files = ''
        pdb_chain = pdbId + chainId
        dali_title = 'Title_' + pdb_chain
    parameters = {'cd1': pdb_chain, 'method': 'search',
                  'title': dali_title, 'address': ''}
    request = requests.post(daliURL, parameters, files=files)
    try_error = 3
    while try_error >= 0:
        try:
            url = request.url
            break
        except:
            try_error -= 1
            if try_error >= 0:
                LOGGER.sleep(2, '. Connection error happened. '
                                'Trying to reconnect...')
                continue
            else:
                url = request.url
                break
    if url.split('.')[-1].lower() in ['html', 'php']:
        url = url.replace(url.split('/')[-1], '')
    LOGGER.debug('Submitted Dali search for PDB and chain "{0} and {1}".'
                 .format(pdbId, chainId))
    LOGGER.info(url)
    LOGGER.clear()
    obj = DaliRecord(url, pdbId, chainId, subset=subset, timeout=timeout,
                     **kwargs)
    #if obj.isSuccess:
    return obj

def runStep(self, **kwargs):
    """Run a single step of adaptive ANM.  Modes will be calculated for
    *structA* and the subset with a cumulative overlap above a threshold
    defined by *Fmin* is used for transitioning towards *structB*.

    By default this function uses values from initialisation but they can
    be overridden if desired.  For example, in bi-directional adaptive ANM,
    we switch *structA* and *structB*, *alignSelA* and *alignSelB*, and
    *reduceSelA* and *reduceSelB*.
    """
    structA = kwargs.pop('structA', self.structA)
    structB = kwargs.pop('structB', self.structB)

    alignSel = kwargs.pop('alignSel', self.alignSel)
    alignSelA = kwargs.pop('alignSelA', self.alignSelA)
    alignSelB = kwargs.pop('alignSelB', self.alignSelB)

    reduceSel = kwargs.pop('reduceSel', self.reduceSel)
    reduceSelA = kwargs.pop('reduceSelA', self.reduceSelA)
    reduceSelB = kwargs.pop('reduceSelB', self.reduceSelB)

    if reduceSelA is None:
        reduceSelA = reduceSel
    if reduceSelB is None:
        reduceSelB = reduceSel

    if alignSel is None:
        if alignSelA is None:
            alignSelA = reduceSelA
        if alignSelB is None:
            alignSelB = reduceSelB
    else:
        if alignSelA is None:
            alignSelA = alignSel
        if alignSelB is None:
            alignSelB = alignSel

    Fmin = kwargs.get('Fmin', self.Fmin)
    f = kwargs.get('f', self.f)

    outputDCD = kwargs.get('outputDCD', self.outputDCD)
    outputPDB = kwargs.get('outputPDB', self.outputPDB)
    filename = kwargs.get('filename', self.filename)

    LOGGER.info('\nStarting cycle {0} with initial structure {1}'
                .format(self.numSteps + 1, structA))

    if alignSelA is None:
        structA_sel = structA
    else:
        structA_sel = structA.select(alignSelA)

    if alignSelB is None:
        structB_sel = structB
    else:
        structB_sel = structB.select(alignSelB)

    mapping_func = kwargs.pop('mapping_func', self.mapping_func)
    seqid = kwargs.pop('seqid', self.seqid)
    coverage = kwargs.pop('overlap', self.coverage)
    coverage = kwargs.pop('coverage', coverage)
    pwalign = kwargs.pop('pwalign', self.pwalign)
    pwalign = kwargs.pop('mapping', pwalign)

    try:
        _, T = superpose(structA_sel, structB_sel)
        structA = applyTransformation(T, structA)
    except:
        structB_amap = sum(np.array(mapping_func(structB_sel, structA_sel,
                                                 overlap=coverage,
                                                 seqid=seqid,
                                                 pwalign=pwalign))[:, 0])
        _, T = superpose(structA_sel, structB_amap)
        structA = applyTransformation(T, structA)

    maxModes = kwargs.get('maxModes', self.maxModes)
    if not isinstance(maxModes, (int, float)):
        raise TypeError('maxModes should be an integer or float')
    if maxModes < 1:
        maxModes = int(maxModes * 3 * self.structA.numAtoms() - 6)
    if maxModes > 3 * self.structA.numAtoms() - 6:
        maxModes = 3 * self.structA.numAtoms() - 6

    if self.n_modes > maxModes:
        self.n_modes = maxModes

    trim = kwargs.pop('trim', self.trim)
    anmA, _ = calcENM(structA, n_modes=self.n_modes)
    if trim == 'slice':
        trim_anmA, _ = sliceModel(anmA, structA, reduceSelA)
    elif trim == 'reduce':
        trim_anmA, _ = reduceModel(anmA, structA, reduceSelA)
        trim_anmA.calcModes(n_modes=self.n_modes)
    else:
        trim_anmA = anmA

    coordsA = structA.getCoords()
    coordsA_sel = structA_sel.getCoords()
    coordsB_sel = structB_sel.getCoords()

    defvec = coordsB_sel - coordsA_sel
    d = defvec.flatten()
    self.dList.append(d)

    if Fmin is None:
        if self.numSteps == 0 or self.resetFmin:
            Fmin = 0.  # select the first mode only
        else:
            Fmin = 1 - np.sqrt(np.linalg.norm(self.dList[self.numSteps])
                               / np.linalg.norm(self.dList[0]))

    if Fmin > self.Fmin_max:
        Fmin = self.Fmin_max

    LOGGER.info('Fmin is {:4.3f}, corresponding to a cumulative overlap '
                'of {:4.3f}'.format(Fmin, np.sqrt(Fmin)))

    trim_d = sliceAtomicData(d, structA_sel, reduceSelA)
    overlaps = np.dot(trim_d, trim_anmA.getEigvecs())

    overlap_sorting_indices = list(reversed(list(np.argsort(abs(overlaps)))))
    overlaps = overlaps[overlap_sorting_indices]

    if trim == 'reduce':
        sliced_anmA, _ = sliceModel(anmA, structA, reduceSelA)
        modesetA = ModeSet(trim_anmA, overlap_sorting_indices)
        _, overlap_sorting_indices = matchModes(modesetA, sliced_anmA,
                                                index=True)

    modesetA = ModeSet(anmA, overlap_sorting_indices)

    normalised_overlaps = overlaps / np.linalg.norm(d)
    c_sq = np.cumsum(np.power(normalised_overlaps, 2), axis=0)

    modesCrossingFmin = np.where(c_sq <= Fmin)[0]
    numModes = len(modesCrossingFmin)
    if numModes == 0:
        numModes = 1
        modesCrossingFmin = [0]

    self.numModesList.append(numModes)

    if numModes == 1:
        LOGGER.info('Using 1 mode with overlap {0} (Mode {1})'
                    .format('{:4.3f}'.format(np.sqrt(c_sq[0])),
                            modesetA.getIndices()[0] + 1))
    elif numModes < 11:
        LOGGER.info('Using {0} modes with cumulative overlap {1} '
                    '(Modes {2} and {3})'
                    .format(numModes,
                            '{:4.3f}'.format(np.sqrt(c_sq[numModes - 1])),
                            ', '.join([str(entry) for entry in
                                       modesetA.getIndices()[:numModes - 1]
                                       + 1]),
                            str(modesetA.getIndices()[numModes - 1] + 1)))
    else:
        LOGGER.info('Using {0} modes with cumulative overlap {1} '
                    '(Modes {2}, ... and {3}) with max mode number {4} '
                    'and min mode number {5}'
                    .format(numModes,
                            '{:4.3f}'.format(np.sqrt(c_sq[numModes - 1])),
                            ', '.join([str(entry) for entry in
                                       modesetA.getIndices()[:10] + 1]),
                            str(modesetA.getIndices()[numModes - 1] + 1),
                            np.max(modesetA.getIndices()[:numModes] + 1),
                            np.min(modesetA.getIndices()[:numModes] + 1)))

    if np.max(modesetA.getIndices()[:numModes]) > self.n_modes - 5:
        self.n_modes *= 10
    if self.n_modes > 3 * self.structA.numAtoms() - 6:
        self.n_modes = 3 * self.structA.numAtoms() - 6

    v = np.sum(np.multiply(overlaps[:numModes],
                           modesetA.getEigvecs()[:, :numModes]),
               axis=1).reshape(coordsA.shape)

    trim_v = sliceAtomicData(v.reshape(-1), structA,
                             reduceSelA).reshape(-1, 3)

    s_min = sum(np.multiply(trim_v.flatten(), trim_d)) \
        / sum(np.power(trim_v.flatten(), 2))

    new_coordsA = coordsA + f * s_min * v

    if structA == self.structA:
        self.anmA = anmA
        self.anmListA.append(modesetA)
        self.structA.setCoords(new_coordsA)
        self.ensembleA.addCoordset(new_coordsA)
        self.whichModesA.append(modesetA[modesCrossingFmin])
    elif structA == self.structB:
        self.anmB = anmA
        self.anmListB.append(modesetA)
        self.structB.setCoords(new_coordsA)
        self.ensembleB.addCoordset(new_coordsA)
        self.whichModesB.append(modesetA[modesCrossingFmin])

    new_coordsA_reduceSel = structA.select(reduceSelA).getCoords()
    coordsB_reduceSel = structB.select(reduceSelB).getCoords()
    rmsd = calcRMSD(new_coordsA_reduceSel, coordsB_reduceSel)

    LOGGER.info('Current RMSD is {:4.3f}\n'.format(rmsd))

    self.numSteps += 1
    self.rmsds.append(rmsd)

    if outputPDB:
        writePDB(filename + '_A', self.ensembleA)
        LOGGER.clear()
        writePDB(filename + '_B', self.ensembleB)
        LOGGER.clear()

    if outputDCD:
        writeDCD(filename + '_A', self.ensembleA)
        LOGGER.clear()
        writeDCD(filename + '_B', self.ensembleB)
        LOGGER.clear()

    return

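# Driver sketch, assuming `aanm` is an instance of the adaptive ANM class this
# method belongs to (the class name, constructor, and the convergence
# threshold below are assumptions, not shown in this excerpt):
#
#     while aanm.numSteps < 20:
#         aanm.runStep(outputPDB=True, filename='transition')
#         if aanm.rmsds[-1] < 1.0:    # stop once the structures are close
#             break
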
def calcPerturbResponse(model, atoms=None, repeats=100):
    """Return a matrix of profiles from scanning of the response of the
    structure to random perturbations at specific atom (or node) positions.
    The function implements the perturbation response scanning (PRS) method
    described in [CA09]_.  Rows of the matrix are the average magnitude of
    the responses obtained by perturbing the atom/node position at that row
    index, i.e. ``prs_profile[i,j]`` will give the response of residue/node
    *j* to perturbations in residue/node *i*.  PRS is performed using the
    covariance matrix from *model*, e.g. an :class:`.ANM` instance.  Each
    residue/node is perturbed *repeats* times with a random unit force
    vector.  When *atoms* instance is given, PRS profile for residues will
    be added as an attribute which then can be retrieved as
    ``atoms.getData('prs_profile')``.  *model* and *atoms* must have the
    same number of atoms.  *atoms* must be an :class:`.AtomGroup` instance.

    .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning Reveals
       Ligand Entry-Exit Mechanisms of Ferric Binding Protein. *PLoS Comput
       Biol* **2009** 5(10):e1000544.

    The PRS matrix can be saved as follows::

      prs_matrix = calcPerturbResponse(p38_anm)
      writeArray('prs_matrix.txt', prs_matrix, format='%8.6f', delimiter='\t')
    """

    if not isinstance(model, NMA):
        raise TypeError('model must be an NMA instance')
    elif not model.is3d():
        raise TypeError('model must be a 3-dimensional NMA instance')
    elif len(model) == 0:
        raise ValueError('model must have normal modes calculated')
    if atoms is not None:
        if not isinstance(atoms, AtomGroup):
            raise TypeError('atoms must be an AtomGroup instance')
        elif atoms.numAtoms() != model.numAtoms():
            raise ValueError('model and atoms must have the same number atoms')

    assert isinstance(repeats, int), 'repeats must be an integer'
    cov = calcCovariance(model)
    if cov is None:
        raise ValueError('model did not return a covariance matrix')

    n_atoms = model.numAtoms()
    response_matrix = np.zeros((n_atoms, n_atoms))
    LOGGER.progress('Calculating perturbation response', n_atoms, '_prody_prs')
    i3 = -3
    i3p3 = 0
    for i in range(n_atoms):
        i3 += 3
        i3p3 += 3
        forces = np.random.rand(repeats * 3).reshape((repeats, 3))
        forces /= ((forces**2).sum(1)**0.5).reshape((repeats, 1))
        for force in forces:
            response_matrix[i] += (np.dot(cov[:, i3:i3p3], force)**2
                                   ).reshape((n_atoms, 3)).sum(1)
        LOGGER.update(i, '_prody_prs')

    response_matrix /= repeats
    LOGGER.clear()
    LOGGER.report('Perturbation response scanning completed in %.1fs.',
                  '_prody_prs')
    if atoms is not None:
        atoms.setData('prs_profile', response_matrix)
    return response_matrix

    # NOTE: the block below is unreachable as written (it follows the return
    # above); later revisions fold saving and normalization into the function.
    # save the original PRS matrix
    np.savetxt('orig_PRS_matrix', response_matrix, delimiter='\t', fmt='%8.6f')
    # calculate the normalized PRS matrix, using self displacement (the
    # diagonal of the original matrix) as a normalization factor
    self_dp = np.diag(response_matrix)
    self_dp = self_dp.reshape(n_atoms, 1)
    norm_PRS_mat = response_matrix / np.repeat(self_dp, n_atoms, axis=1)
    # suppress the diagonal (self displacement) to facilitate
    # visualizing the response profile
    norm_PRS_mat = norm_PRS_mat - np.diag(np.diag(norm_PRS_mat))
    np.savetxt('norm_PRS_matrix', norm_PRS_mat, delimiter='\t', fmt='%8.6f')
    return response_matrix

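# Usage sketch for this version (public ProDy API for building the model;
# '1p38' is any PDB entry):
#
#     from prody import parsePDB, ANM
#
#     p38 = parsePDB('1p38', subset='calpha')
#     anm = ANM('p38')
#     anm.buildHessian(p38)
#     anm.calcModes()
#     prs = calcPerturbResponse(anm, atoms=p38, repeats=100)
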
def getRecord(self, url, localFile=False):
    if localFile:
        dali_file = open(url, 'r')
        data = dali_file.read()
        dali_file.close()
    else:
        sleep = 2
        timeout = 120
        LOGGER.timeit('_dali')
        log_message = ''
        try_error = 3
        while True:
            LOGGER.sleep(int(sleep), 'to reconnect to Dali ' + log_message)
            LOGGER.clear()
            LOGGER.write('Connecting to Dali for search results...')
            LOGGER.clear()
            try:
                html = urllib2.urlopen(url).read()
            except:
                try_error -= 1
                if try_error >= 0:
                    LOGGER.sleep(2, '. Connection error happened. '
                                    'Trying to reconnect...')
                    continue
                else:
                    html = urllib2.urlopen(url).read()
            if html.find('Status: Queued') > -1:
                log_message = '(Dali search is queued)...'
            elif html.find('Status: Running') > -1:
                log_message = '(Dali search is running)...'
            elif html.find('Your job') == -1 and html.find('.txt') > -1:
                break
            elif html.find('ERROR:') > -1:
                LOGGER.warn(': Dali search reported an ERROR!')
                return None
            sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)
            if LOGGER.timing('_dali') > timeout:
                LOGGER.warn(': Dali search has timed out. \nThe results can '
                            'be obtained using the getRecord() method later.')
                return None
        LOGGER.clear()
        LOGGER.report('Dali results completed in %.1fs.', '_dali')

        lines = html.strip().split('\n')
        file_name = re.search(r'=.+-90\.txt', html).group()[1:]
        file_name = file_name[:-7]
        data = urllib2.urlopen(url + file_name + self._subset + '.txt').read()
        temp_name = file_name + self._subset + '_dali.txt'
        with open(temp_name, "w") as file_temp:
            file_temp.write(html + '\n' + url + file_name + '\n' + data)

    data_list = data.strip().split('# ')
    # No: Chain Z rmsd lali nres %id PDB Description -> data_list[3]
    # Structural equivalences                        -> data_list[4]
    # Translation-rotation matrices                  -> data_list[5]
    map_temp_dict = dict()
    lines = data_list[4].strip().split('\n')
    self._lines_4 = lines
    mapping_temp = np.genfromtxt(
        lines[1:],
        delimiter=(4, 1, 14, 6, 2, 4, 4, 5, 2, 4, 4, 3, 5, 4, 3, 5, 6, 3,
                   5, 4, 3, 5, 28),
        usecols=[0, 3, 5, 7, 9, 12, 15, 15, 18, 21], dtype='|i4')
    # usecols -> [index, residue_a, residue_b, residue_i_a, residue_i_b,
    #             resid_a, resid_b, resid_i_a, resid_i_b]
    for map_i in mapping_temp:
        if not map_i[0] in map_temp_dict:
            map_temp_dict[map_i[0]] = [[map_i[1], map_i[2], map_i[3], map_i[4]]]
        else:
            map_temp_dict[map_i[0]].append([map_i[1], map_i[2],
                                            map_i[3], map_i[4]])
    self._max_index = max(mapping_temp[:, 2])
    self._mapping = map_temp_dict
    self._data = data_list[3]
    lines = data_list[3].strip().split('\n')
    daliInfo = np.genfromtxt(lines[1:], delimiter=(4, 3, 6, 5, 5, 5, 6, 5, 57),
                             usecols=[0, 2, 3, 4, 5, 6, 7, 8],
                             dtype=[('id', '<i4'), ('pdb_chain', '|S6'),
                                    ('Z', '<f4'), ('rmsd', '<f4'),
                                    ('len_align', '<i4'), ('res_num', '<i4'),
                                    ('identity', '<i4'), ('title', '|S70')])
    if daliInfo.ndim == 0:
        daliInfo = np.array([daliInfo])
    pdbListAll = []
    self._daliInfo = daliInfo
    dali_temp_dict = dict()
    for temp in self._daliInfo:
        temp_dict = dict()
        pdb_chain = temp[1].strip()[0:6]
        temp_dict['pdbId'] = pdb_chain[0:4]
        temp_dict['chainId'] = pdb_chain[5:6]
        temp_dict['pdb_chain'] = pdb_chain
        temp_dict['Z'] = temp[2]
        temp_dict['rmsd'] = temp[3]
        temp_dict['len_align'] = temp[4]
        temp_dict['res_num'] = temp[5]
        temp_dict['identity'] = temp[6]
        temp_dict['mapping'] = (np.array(map_temp_dict[temp[0]]) - 1).tolist()
        temp_dict['map_ref'] = [x for map_i in temp_dict['mapping']
                                for x in range(map_i[0], map_i[1] + 1)]
        temp_dict['map_sel'] = [x for map_i in temp_dict['mapping']
                                for x in range(map_i[2], map_i[3] + 1)]
        dali_temp_dict[temp_dict['pdb_chain']] = temp_dict
        pdbListAll.append(pdb_chain)
    self._pdbListAll = tuple(pdbListAll)
    self._pdbList = self._pdbListAll
    self._alignPDB = dali_temp_dict
    LOGGER.info(str(len(pdbListAll)) + ' Dali results were retrieved.')
    return True

def buildCovariance(self, coordsets, **kwargs):
    """Build a covariance matrix for *coordsets* using mean coordinates
    as the reference.  *coordsets* argument may be one of the following:

    * :class:`.Atomic`
    * :class:`.Ensemble`
    * :class:`.TrajBase`
    * :class:`numpy.ndarray` with shape ``(n_csets, n_atoms, 3)``

    For ensemble and trajectory objects, ``update_coords=True`` argument
    can be used to set the mean coordinates as the coordinates of the
    object.

    When *coordsets* is a trajectory object, such as :class:`.DCDFile`,
    covariance will be built by superposing frames onto the reference
    coordinate set (see :meth:`.Frame.superpose`).  If frames are already
    aligned, use ``aligned=True`` argument to skip this step.

    .. note::
       If *coordsets* is a :class:`.PDBEnsemble` instance, coordinates are
       treated specially.  Let's say **C**\_ij is the element of the
       covariance matrix that corresponds to atoms *i* and *j*.  This
       super element is divided by number of coordinate sets (PDB models
       or structures) in which both of these atoms are observed
       together."""

    if not isinstance(coordsets, (Ensemble, Atomic, TrajBase, np.ndarray)):
        raise TypeError('coordsets must be an Ensemble, Atomic, Numpy '
                        'array instance')
    LOGGER.timeit('_prody_pca')
    mean = None
    weights = None
    ensemble = None
    if isinstance(coordsets, np.ndarray):
        if (coordsets.ndim != 3 or coordsets.shape[2] != 3 or
                coordsets.dtype not in (np.float32, float)):
            raise ValueError('coordsets is not a valid coordinate array')
    elif isinstance(coordsets, Atomic):
        coordsets = coordsets._getCoordsets()
    elif isinstance(coordsets, Ensemble):
        ensemble = coordsets
        if isinstance(coordsets, PDBEnsemble):
            weights = coordsets.getWeights() > 0
        coordsets = coordsets._getCoordsets()

    update_coords = bool(kwargs.get('update_coords', False))

    if isinstance(coordsets, TrajBase):
        nfi = coordsets.nextIndex()
        coordsets.reset()
        n_atoms = coordsets.numSelected()
        dof = n_atoms * 3
        cov = np.zeros((dof, dof))
        n_confs = 0
        n_frames = len(coordsets)
        LOGGER.info('Covariance will be calculated using {0} frames.'
                    .format(n_frames))
        coordsum = np.zeros(dof)
        LOGGER.progress('Building covariance', n_frames, '_prody_pca')
        align = not kwargs.get('aligned', False)
        for frame in coordsets:
            if align:
                frame.superpose()
            coords = frame._getCoords().flatten()
            coordsum += coords
            cov += np.outer(coords, coords)
            n_confs += 1
            LOGGER.update(n_confs, '_prody_pca')
        LOGGER.clear()
        cov /= n_confs
        coordsum /= n_confs
        mean = coordsum
        cov -= np.outer(coordsum, coordsum)
        coordsets.goto(nfi)
        self._cov = cov
        if update_coords:
            coordsets.setCoords(mean.reshape((n_atoms, 3)))
    else:
        n_confs = coordsets.shape[0]
        if n_confs < 3:
            raise ValueError('coordsets must have more than 3 coordinate '
                             'sets')
        n_atoms = coordsets.shape[1]
        if n_atoms < 3:
            raise ValueError('coordsets must have more than 3 atoms')
        dof = n_atoms * 3
        LOGGER.info('Covariance is calculated using {0} coordinate sets.'
                    .format(len(coordsets)))
        s = (n_confs, dof)
        if weights is None:
            if coordsets.dtype == float:
                self._cov = np.cov(coordsets.reshape((n_confs, dof)).T,
                                   bias=1)
            else:
                cov = np.zeros((dof, dof))
                coordsets = coordsets.reshape((n_confs, dof))
                mean = coordsets.mean(0)
                LOGGER.progress('Building covariance', n_confs, '_prody_pca')
                for i, coords in enumerate(coordsets.reshape(s)):
                    deviations = coords - mean
                    cov += np.outer(deviations, deviations)
                    LOGGER.update(i, '_prody_pca')
                LOGGER.clear()
                cov /= n_confs
                self._cov = cov
        else:
            # PDB ensemble case
            mean = np.zeros((n_atoms, 3))
            for i, coords in enumerate(coordsets):
                mean += coords * weights[i]
            mean /= weights.sum(0)
            d_xyz = ((coordsets - mean) * weights).reshape(s)
            divide_by = weights.astype(float).repeat(3, axis=2).reshape(s)
            self._cov = np.dot(d_xyz.T, d_xyz) / np.dot(divide_by.T,
                                                        divide_by)
        if update_coords and ensemble is not None:
            if mean is None:
                mean = coordsets.mean(0)
            ensemble.setCoords(mean)

    self._trace = self._cov.trace()
    self._dof = dof
    self._n_atoms = n_atoms
    LOGGER.report('Covariance matrix calculated in %.2fs.', '_prody_pca')

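# Usage sketch (public ProDy PCA workflow; the ensemble should be superposed
# before building the covariance):
#
#     from prody import parsePDB, Ensemble, PCA
#
#     ubi = parsePDB('2k39', subset='calpha')
#     ens = Ensemble('ubi')
#     ens.setCoords(ubi.getCoords())
#     ens.addCoordset(ubi.getCoordsets())
#     ens.superpose()
#     pca = PCA('ubi')
#     pca.buildCovariance(ens, update_coords=True)
#     pca.calcModes()
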
def calcPerturbResponse(model, atoms=None, repeats=100, **kwargs):
    """Returns a matrix of profiles from scanning of the response of the
    structure to random perturbations at specific atom (or node) positions.
    The function implements the perturbation response scanning (PRS) method
    described in [CA09]_.  Rows of the matrix are the average magnitude of
    the responses obtained by perturbing the atom/node position at that row
    index, i.e. ``prs_profile[i,j]`` will give the response of residue/node
    *j* to perturbations in residue/node *i*.  PRS is performed using the
    covariance matrix from *model*, e.g. :class:`.ANM` instance.  Each
    residue/node is perturbed *repeats* times with a random unit force
    vector.  When *atoms* instance is given, PRS profile for residues will
    be added as an attribute which then can be retrieved as
    ``atoms.getData('prs_profile')``.  *model* and *atoms* must have the
    same number of atoms.  *atoms* must be an :class:`.AtomGroup` instance.

    .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning Reveals
       Ligand Entry-Exit Mechanisms of Ferric Binding Protein. *PLoS Comput
       Biol* **2009** 5(10):e1000544.

    The PRS matrix can be calculated and saved as follows::

      prs_matrix = calcPerturbResponse(p38_anm, saveMatrix=True)

    The PRS matrix can also be saved later as follows::

      writeArray('prs_matrix.txt', prs_matrix, format='%8.6f', delimiter='\t')

    You can also control which operation is used for getting a single matrix
    from the repeated force application and whether to normalise the matrix
    at the end.  If you do choose to normalise the matrix, you can still save
    the original matrix before normalisation as well.

    :arg operation: which operation to perform to get a single response
        matrix: the mean, variance, max or min of the set of repeats.
        Another operation is to select elements from the matrix showing
        the biggest difference from the square sum of the covariance matrix.
        The default is the mean.  To obtain all response matrices, set
        operation=None without quotes.  You can also ask for 'all'
        operations or provide a list containing any set of them.
    :type operation: str or list

    :arg noForce: whether to use the covariance matrix directly rather
        than applying forces.  This appears to be equivalent when scanning
        for response magnitudes and will be much quicker.  Default is True.
    :type noForce: bool

    :arg normMatrix: whether to normalise the single response matrix by
        dividing each row by its diagonal.  Default is False; we recommend
        True.
    :type normMatrix: bool

    :arg saveMatrix: whether to save the last matrix generated to a text
        file.  Default is False.
    :type saveMatrix: bool

    :arg saveOrig: whether to save the original matrix despite
        normalisation.  This is the same as saveMatrix when not normalizing.
        Default is False.
    :type saveOrig: bool

    :arg baseSaveName: the central part of the file name for saved
        matrices, which you can set.  This is surrounded by underscores.
        The beginning says orig or norm and the end says which operation
        was used.  Default is 'response_matrix'.
    :type baseSaveName: str

    :arg acceptDirection: select reference direction for forces to be
        accepted.  Can be 'in' (towards the center of atoms), 'out' (away
        from the center), or 'all'.  Default is 'all'; using other
        directions requires atoms.
    :type acceptDirection: str
    """

    noForce = kwargs.get('noForce', True)
    if not noForce:
        operation = kwargs.get('operation', 'mea')
        if operation is not None:
            if type(operation) is str:
                if operation == 'all' or operation == 'all operations':
                    operationList = ['var', 'mea', 'max', 'min', 'dif']
                else:
                    operationList = []
                    operationList.append(operation.lower()[:3])
            elif type(operation) is list:
                operationList = operation
                for i in range(len(operationList)):
                    operationList[i] = operationList[i].lower()[:3]

            operationList = np.array(operationList)
            found_valid_operation = False

            if 'var' in operationList:
                found_valid_operation = True
            if 'max' in operationList:
                found_valid_operation = True
            if 'mea' in operationList:
                found_valid_operation = True
            if 'min' in operationList:
                found_valid_operation = True
            if 'dif' in operationList:
                found_valid_operation = True

            if not found_valid_operation:
                raise ValueError('Operation should be mean, variance, max, '
                                 'min or difference (from covariance matrix) '
                                 'in quotes or a list containing a set of '
                                 'these or None.')
        else:
            operationList = []  # no combination requested; repeats returned

    if not isinstance(model, NMA):
        raise TypeError('model must be an NMA instance')
    elif not model.is3d() and not noForce:
        raise TypeError('model must be a 3-dimensional NMA instance '
                        'for using PRS with force')
    elif len(model) == 0:
        raise ValueError('model must have normal modes calculated')

    if atoms is not None:
        if isinstance(atoms, Selection):
            atoms = atoms.copy()
        if not isinstance(atoms, AtomGroup):
            raise TypeError('atoms must be an AtomGroup instance')
        elif atoms.numAtoms() != model.numAtoms():
            raise ValueError('model and atoms must have the same number atoms')

    n_atoms = model.numAtoms()
    LOGGER.timeit('_prody_prs_all')
    LOGGER.info('Calculating covariance matrix')
    LOGGER.timeit('_prody_cov')

    assert isinstance(repeats, int), 'repeats must be an integer'

    cov = calcCovariance(model)
    if cov is None:
        raise ValueError('model did not return a covariance matrix')

    LOGGER.clear()
    LOGGER.report('Covariance matrix calculated in %.1fs.', '_prody_cov')

    LOGGER.progress('Calculating perturbation response', n_atoms,
                    '_prody_prs_mat')
    matrix_dict = {}

    if noForce or 'dif' in operationList:
        if not model.is3d():
            n_by_n_cov_squared = cov**2
        else:
            cov_squared = cov**2
            n_by_3n_cov_squared = np.zeros((n_atoms, 3 * n_atoms))
            n_by_n_cov_squared = np.zeros((n_atoms, n_atoms))
            # sum the squared covariances over the three Cartesian components
            # of each perturbed residue ...
            i3 = -3
            i3p3 = 0
            for i in range(n_atoms):
                i3 += 3
                i3p3 += 3
                n_by_3n_cov_squared[i, :] = (cov_squared[i3:i3p3, :]).sum(0)
            # ... and then of each responding residue
            j3 = -3
            j3p3 = 0
            for j in range(n_atoms):
                j3 += 3
                j3p3 += 3
                n_by_n_cov_squared[:, j] = (n_by_3n_cov_squared[:,
                                                                j3:j3p3]).sum(1)

    if noForce:
        matrix_dict['noForce'] = n_by_n_cov_squared
        LOGGER.clear()
        LOGGER.report('Perturbation response matrix calculated in %.1fs.',
                      '_prody_prs_mat')
    else:
        acceptDirection = kwargs.get('acceptDirection', 'all')
        if acceptDirection != 'all':
            if atoms is None:
                acceptDirection = 'all'
                LOGGER.info('A specific direction for accepting forces was '
                            'provided without an atoms object. This '
                            'direction will be ignored and all forces will '
                            'be accepted.')
            else:
                coords = atoms.getCoords()
                atoms_center = array([np.mean(coords[:, 0]),
                                      np.mean(coords[:, 1]),
                                      np.mean(coords[:, 2])])

        mag = kwargs.get('mag', 1)
        response_matrix = np.zeros((repeats, n_atoms, n_atoms))
        i3 = -3
        i3p3 = 0
        for i in range(n_atoms):
            i3 += 3
            i3p3 += 3
            forces = np.random.randn(repeats * 3).reshape((repeats, 3))
            # normalise to unit vectors, then scale to magnitude mag
            forces /= ((forces**2).sum(1)**0.5).reshape((repeats, 1))
            forces *= mag
            for n in range(repeats):
                force = forces[n]
                if acceptDirection == 'in' or acceptDirection == 'out':
                    res_coords = atoms.getCoords()[i]
                    vec_to_center = atoms_center - res_coords
                    vec_to_center /= (((atoms_center - res_coords)**2
                                       ).sum()**0.5)
                    force_overlap = np.dot(force, vec_to_center)
                    if acceptDirection == 'in' and force_overlap < 0:
                        force *= -1
                    if acceptDirection == 'out' and force_overlap > 0:
                        force *= -1
                response_matrix[n, i, :] = (np.dot(cov[:, i3:i3p3], force)**2
                                            ).reshape((n_atoms, 3)).sum(1)
            LOGGER.update(i, '_prody_prs_mat')
        LOGGER.clear()
        LOGGER.report('Perturbation response scanning matrix calculated '
                      'in %.1fs.', '_prody_prs_mat')

        LOGGER.progress('Performing matrix combination operations', n_atoms,
                        '_prody_prs_ops')

        if 'var' in operationList:
            matrix_dict['var'] = np.var(response_matrix, axis=0)
        if 'max' in operationList:
            matrix_dict['max'] = np.amax(response_matrix, axis=0)
        if 'mea' in operationList:
            matrix_dict['mea'] = np.mean(response_matrix, axis=0)
        if 'min' in operationList:
            matrix_dict['min'] = np.amin(response_matrix, axis=0)
        if 'dif' in operationList:
            matrix_dict['dif'] = np.max(abs(response_matrix -
                                            n_by_n_cov_squared), axis=0)

        LOGGER.report('Perturbation response matrix operations completed '
                      'in %.1fs.', '_prody_prs_ops')

        if operation is None:
            LOGGER.info('Operation is None so all {0} repeats are output. '
                        'This is not compatible with saving, normalizing '
                        'or mapping to atoms at present.'.format(repeats))
            return response_matrix

    if atoms is not None:
        atoms.setData('prs_profile', matrix_dict[list(matrix_dict.keys())[0]])
        if len(list(matrix_dict.keys())) > 1:
            LOGGER.info('Only one matrix can be added as data to atoms so '
                        'the first one was chosen. The operation that '
                        'generated it was {0} (1st 3 letters).'
                        .format(list(matrix_dict.keys())[0]))

    saveOrig = kwargs.get('saveOrig', False)
    saveMatrix = kwargs.get('saveMatrix', False)
    normMatrix = kwargs.get('normMatrix', False)
    suppressDiag = kwargs.get('suppressDiag', False)
    baseSaveName = kwargs.get('baseSaveName', 'response_matrix')

    if saveOrig or (saveMatrix and not normMatrix):
        # save the original PRS matrix for each operation
        for m in list(matrix_dict.keys()):
            np.savetxt('orig_{0}_{1}.txt'.format(baseSaveName, m),
                       matrix_dict[m], delimiter='\t', fmt='%8.6f')

    if normMatrix:
        norm_PRS_mat = {}
        # calculate the normalized PRS matrix for each operation, using
        # self displacement (the diagonal of the original matrix) as a
        # normalization factor
        for m in list(matrix_dict.keys()):
            self_dp = np.diag(matrix_dict[m])
            self_dp = self_dp.reshape(n_atoms, 1)
            norm_PRS_mat[m] = matrix_dict[m] / np.repeat(self_dp, n_atoms,
                                                         axis=1)
            if suppressDiag:
                # suppress the diagonal (self displacement) to facilitate
                # visualizing the response profile
                norm_PRS_mat[m] = norm_PRS_mat[m] - np.diag(
                    np.diag(norm_PRS_mat[m]))
            if saveMatrix:
                np.savetxt('norm_{0}_{1}.txt'.format(baseSaveName, m),
                           norm_PRS_mat[m], delimiter='\t', fmt='%8.6f')

    LOGGER.report('Perturbation response scanning completed in %.1fs.',
                  '_prody_prs_all')

    matrix_list = []
    for m in list(matrix_dict.keys()):
        if normMatrix:
            matrix_list.append(norm_PRS_mat[m])
        else:
            matrix_list.append(matrix_dict[m])
    matrix_array = array(matrix_list)

    returnFormat = kwargs.get('returnFormat', 'array')
    returnFormat = returnFormat.lower()

    if len(matrix_array) == 1:
        LOGGER.info('Output has been returned as a single matrix (an array).')
        return matrix_array.reshape(n_atoms, n_atoms)

    if returnFormat == 'both':
        LOGGER.info('You have requested return in both formats. '
                    'Array comes first.')
        return matrix_array, matrix_dict
    elif 'dict' in returnFormat:
        LOGGER.info('Output has been returned as a dictionary of matrices.')
        return matrix_dict
    else:
        LOGGER.info('Output has been returned as an array of matrices, '
                    'which you can split into individual matrices.')
        return matrix_array

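# Usage sketch for this version's keyword interface (anm and p38 as in the
# earlier example; all keywords below are documented in the docstring above):
#
#     prs = calcPerturbResponse(anm, atoms=p38, repeats=100,
#                               noForce=False, operation='mean',
#                               normMatrix=True, saveMatrix=True)
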
def blastPDB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching *sequence* against the PDB using NCBI blastp.

    :arg sequence: an object with an associated sequence string
        or a sequence string itself
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is multiplied by 1.5 when results are not ready.
    *timeout* (default is 120 s) determines when to give up waiting for the
    results.
    """

    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')
    elif isinstance(sequence, Atomic):
        sequence = sequence.calpha.getSequence()
    elif isinstance(sequence, Sequence):
        sequence = str(sequence)
    elif isinstance(sequence, str):
        if len(sequence) in [4, 5, 6]:
            ag = parsePDB(sequence)
            sequence = ag.calpha.getSequence()
        sequence = ''.join(sequence.split())
    else:
        raise TypeError('sequence must be Atomic, Sequence, or str not {0}'
                        .format(type(sequence)))

    headers = {'User-agent': 'ProDy'}
    query = [('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'),
             ('PROGRAM', 'blastp'),]

    expect = float(kwargs.pop('expect', 1e-10))
    if expect <= 0:
        raise ValueError('expect must be a positive number')
    query.append(('EXPECT', expect))
    hitlist_size = int(kwargs.pop('hitlist_size', 250))
    if hitlist_size <= 0:
        raise ValueError('hitlist_size must be a positive integer')
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

    data = urlencode(query)
    LOGGER.timeit('_prody_blast')
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'
                .format(sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b'RID =')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find(b'\n', index)
        rid = html[index + len('RID ='):last].strip()

    index = html.find(b'RTOE =')
    if index == -1:
        rtoe = None  # This is not used
    else:
        last = html.find(b'\n', index)
        rtoe = int(html[index + len('RTOE ='):last].strip())

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), 'to reconnect to NCBI for search results.')
        LOGGER.write('Connecting to NCBI for search results...')
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b'Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b'\n', index)
        status = results[index+len('Status='):last].strip()
        if status.upper() == b'READY':
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_blast') > timeout:
            LOGGER.warn('Blast search time out.')
            return None

    LOGGER.clear()
    LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')

    try:
        ext_xml = filename.lower().endswith('.xml')
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += '.xml'
        out = open(filename, 'wb')
        out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

    return PDBBlastRecord(results, sequence)

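# Usage sketch (public ProDy API; PDBBlastRecord provides getHits/getBest):
#
#     record = blastPDB('runexample', timeout=240)
#     if record is not None:
#         best = record.getBest()                   # top hit as a dict
#         hits = record.getHits(percent_identity=90)
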
def calcPerturbResponse(model, **kwargs): """Returns a matrix of profiles from scanning the response of the structure to random perturbations at specific atom (or node) positions. The function implements the perturbation response scanning (PRS) method described in [CA09]_. Rows of the matrix are the average magnitude of the responses obtained by perturbing the atom/node position at that row index, i.e. ``prs_profile[i,j]`` will give the response of residue/node *j* to perturbations in residue/node *i*. PRS is performed using the covariance matrix from *model*, e.g. :class:`.ANM` instance. When an *atoms* instance is given, the PRS matrix will be added as data, which can be retrieved with ``atoms.getData('prs_matrix')``. *model* and *atoms* must have the same number of atoms. *atoms* must be an :class:`.AtomGroup` instance. .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning Reveals Ligand Entry-Exit Mechanisms of Ferric Binding Protein. *PLoS Comput Biol* **2009** 5(10):e1000544. The PRS matrix can be calculated and saved as follows:: prs_matrix = calcPerturbResponse(p38_anm, saveMatrix=True) The PRS matrix can also be save later as follows:: writeArray('prs_matrix.txt', prs_matrix, format='%8.6f', delimiter='\t') :arg saveMatrix: whether to save the last matrix generated to a text file. Default is False :type saveMatrix: bool :arg saveName: The file name for saved matrices Default is 'response_matrix.txt'. :type saveName: str """ if not isinstance(model, (NMA, ModeSet, Mode)): raise TypeError('model must be an NMA, ModeSet, or Mode instance') if isinstance(model, NMA) and len(model) == 0: raise ValueError('model must have normal modes calculated') atoms = kwargs.get('atoms', None) if atoms is not None: if isinstance(atoms, Selection): atoms = atoms.copy() if not isinstance(atoms, AtomGroup): raise TypeError('atoms must be an AtomGroup instance') elif atoms.numAtoms() != model.numAtoms(): raise ValueError('model and atoms must have the same number atoms') n_atoms = model.numAtoms() LOGGER.timeit('_prody_prs_all') LOGGER.info('Calculating covariance matrix') LOGGER.timeit('_prody_cov') cov = calcCovariance(model) if cov is None: raise ValueError('model did not return a covariance matrix') LOGGER.clear() LOGGER.report('Covariance matrix calculated in %.1fs.', '_prody_cov') LOGGER.progress('Calculating perturbation response', n_atoms, '_prody_prs_mat') if not model.is3d(): prs_matrix = cov**2 else: cov_squared = cov**2 n_by_3n_cov_squared = np.zeros((n_atoms, 3 * n_atoms)) prs_matrix = np.zeros((n_atoms, n_atoms)) i3 = -3 i3p3 = 0 for i in range(n_atoms): i3 += 3 i3p3 += 3 n_by_3n_cov_squared[i, :] = (cov_squared[i3:i3p3, :]).sum(0) j3 = -3 j3p3 = 0 for j in range(n_atoms): j3 += 3 j3p3 += 3 prs_matrix[:, j] = (n_by_3n_cov_squared[:, j3:j3p3]).sum(1) LOGGER.clear() LOGGER.report('Perturbation response matrix calculated in %.1fs.', '_prody_prs_mat') saveMatrix = kwargs.get('saveMatrix', False) suppressDiag = kwargs.get('suppressDiag', False) saveName = kwargs.get('saveName', 'response_matrix.txt') norm_prs_matrix = np.zeros((n_atoms, n_atoms)) self_dp = np.diag(prs_matrix) self_dp = self_dp.reshape(n_atoms, 1) norm_prs_matrix = prs_matrix / np.repeat(self_dp, n_atoms, axis=1) if suppressDiag == True: # suppress the diagonal (self displacement) to facilitate # visualizing the response profile norm_prs_matrix = norm_prs_matrix - np.diag(np.diag(norm_prs_matrix)) if saveMatrix == True: np.savetxt(saveName, norm_prs_matrix, delimiter='\t', fmt='%8.6f') LOGGER.report('Perturbation 
response scanning completed in %.1fs.', '_prody_prs_all') if atoms is not None: atoms.setData('prs_matrix', norm_prs_matrix) return atoms, norm_prs_matrix else: return norm_prs_matrix
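A minimal usage sketch for the variant above; the PDB ID and model names are illustrative, and `calcPerturbResponse` is the function just defined::

    from prody import parsePDB, ANM

    # build an example ANM model (illustrative names and PDB ID)
    calphas = parsePDB('1p38').select('calpha')
    anm = ANM('p38 ANM')
    anm.buildHessian(calphas)
    anm.calcModes()

    # run PRS; when atoms are given, the matrix is also stored as atom data
    atoms, prs_matrix = calcPerturbResponse(anm, atoms=calphas,
                                            saveMatrix=True,
                                            saveName='prs_matrix.txt')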
def buildCovariance(self, coordsets, **kwargs): """Build a covariance matrix for *coordsets* using mean coordinates as the reference. *coordsets* argument may be one of the following: * :class:`.Atomic` * :class:`.Ensemble` * :class:`.TrajBase` * :class:`numpy.ndarray` with shape ``(n_csets, n_atoms, 3)`` For ensemble and trajectory objects, ``update_coords=True`` argument can be used to set the mean coordinates as the coordinates of the object. When *coordsets* is a trajectory object, such as :class:`.DCDFile`, covariance will be built by superposing frames onto the reference coordinate set (see :meth:`.Frame.superpose`). If frames are already aligned, use ``aligned=True`` argument to skip this step. .. note:: If *coordsets* is a :class:`.PDBEnsemble` instance, coordinates are treated specially. Let's say **C**\_ij is the element of the covariance matrix that corresponds to atoms *i* and *j*. This super element is divided by number of coordinate sets (PDB models or structures) in which both of these atoms are observed together.""" if not isinstance(coordsets, (Ensemble, Atomic, TrajBase, np.ndarray)): raise TypeError('coordsets must be an Ensemble, Atomic, Numpy ' 'array instance') LOGGER.timeit('_prody_pca') mean = None weights = None ensemble = None if isinstance(coordsets, np.ndarray): if (coordsets.ndim != 3 or coordsets.shape[2] != 3 or coordsets.dtype not in (np.float32, float)): raise ValueError('coordsets is not a valid coordinate array') elif isinstance(coordsets, Atomic): coordsets = coordsets._getCoordsets() elif isinstance(coordsets, Ensemble): ensemble = coordsets if isinstance(coordsets, PDBEnsemble): weights = coordsets.getWeights() > 0 coordsets = coordsets._getCoordsets() update_coords = bool(kwargs.get('update_coords', False)) if isinstance(coordsets, TrajBase): nfi = coordsets.nextIndex() coordsets.reset() n_atoms = coordsets.numSelected() dof = n_atoms * 3 cov = np.zeros((dof, dof)) #mean = coordsets._getCoords().flatten() n_confs = 0 n_frames = len(coordsets) LOGGER.info( 'Covariance will be calculated using {0} frames.'.format( n_frames)) coordsum = np.zeros(dof) LOGGER.progress('Building covariance', n_frames, '_prody_pca') align = not kwargs.get('aligned', False) for frame in coordsets: if align: frame.superpose() coords = frame._getCoords().flatten() coordsum += coords cov += np.outer(coords, coords) n_confs += 1 LOGGER.update(n_confs, '_prody_pca') LOGGER.clear() cov /= n_confs coordsum /= n_confs mean = coordsum cov -= np.outer(coordsum, coordsum) coordsets.goto(nfi) self._cov = cov if update_coords: coordsets.setCoords(mean.reshape((n_atoms, 3))) else: n_confs = coordsets.shape[0] if n_confs < 3: raise ValueError('coordsets must have more than 3 coordinate ' 'sets') n_atoms = coordsets.shape[1] if n_atoms < 3: raise ValueError('coordsets must have more than 3 atoms') dof = n_atoms * 3 LOGGER.info( 'Covariance is calculated using {0} coordinate sets.'.format( len(coordsets))) s = (n_confs, dof) if weights is None: if coordsets.dtype == float: self._cov = np.cov(coordsets.reshape((n_confs, dof)).T, bias=1) else: cov = np.zeros((dof, dof)) coordsets = coordsets.reshape((n_confs, dof)) mean = coordsets.mean(0) LOGGER.progress('Building covariance', n_confs, '_prody_pca') for i, coords in enumerate(coordsets.reshape(s)): deviations = coords - mean cov += np.outer(deviations, deviations) LOGGER.update(n_confs, '_prody_pca') LOGGER.clear() cov /= n_confs self._cov = cov else: # PDB ensemble case mean = np.zeros((n_atoms, 3)) for i, coords in enumerate(coordsets): mean 
+= coords * weights[i] mean /= weights.sum(0) d_xyz = ((coordsets - mean) * weights).reshape(s) divide_by = weights.astype(float).repeat(3, axis=2).reshape(s) self._cov = np.dot(d_xyz.T, d_xyz) / np.dot( divide_by.T, divide_by) if update_coords and ensemble is not None: if mean is None: mean = coordsets.mean(0) ensemble.setCoords(mean.reshape((n_atoms, 3))) self._trace = self._cov.trace() self._dof = dof self._n_atoms = n_atoms LOGGER.report('Covariance matrix calculated in %.2fs.', '_prody_pca')
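A short sketch of how :meth:`buildCovariance` is typically driven through ProDy's :class:`.PCA` front end; the PDB ID is illustrative, and ``update_coords=True`` resets the ensemble coordinates to the mean as documented above::

    from prody import parsePDB, Ensemble, PCA

    ubi = parsePDB('2k39', subset='calpha')   # an NMR ensemble (example ID)
    ensemble = Ensemble(ubi)
    ensemble.superpose()

    pca = PCA('2k39 PCA')
    pca.buildCovariance(ensemble, update_coords=True)
    pca.calcModes()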
def blastPDB(sequence, filename=None, **kwargs): """Returns a :class:`PDBBlastRecord` instance that contains results from blast searching *sequence* against the PDB using NCBI blastp. :arg sequence: an object with an associated sequence string or a sequence string itself :type sequence: :class:`Atomic`, :class:`Sequence`, or str :arg filename: a *filename* to save the results in XML format :type filename: str *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``) search parameters can be adjusted by the user. *sleep* keyword argument (default is ``2`` seconds) determines how long to wait to reconnect for results. Sleep time is multiplied by 1.5 when results are not ready. *timeout* (default is 120 s) determines when to give up waiting for the results. """ if sequence == 'runexample': sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI' 'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN' 'DAYDIVKMKKSNISPNFNFMGQLLDFERTL') elif isinstance(sequence, Atomic): sequence = sequence.calpha.getSequence() elif isinstance(sequence, Sequence): sequence = str(sequence) elif isinstance(sequence, str): if len(sequence) in [4, 5, 6]: ag = parsePDB(sequence) sequence = ag.calpha.getSequence() sequence = ''.join(sequence.split()) else: raise TypeError( 'sequence must be Atomic, Sequence, or str not {0}'.format( type(sequence))) headers = {'User-agent': 'ProDy'} query = [ ('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'), ('PROGRAM', 'blastp'), ] expect = float(kwargs.pop('expect', 1e-10)) if expect <= 0: raise ValueError('expect must be a positive number') query.append(('EXPECT', expect)) hitlist_size = int(kwargs.pop('hitlist_size', 250)) if hitlist_size <= 0: raise ValueError('hitlist_size must be a positive integer') query.append(('HITLIST_SIZE', hitlist_size)) query.append(('QUERY', sequence)) query.append(('CMD', 'Put')) sleep = float(kwargs.pop('sleep', 2)) timeout = float(kwargs.pop('timeout', 120)) try: import urllib.parse urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8') except ImportError: from urllib import urlencode url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi' data = urlencode(query) LOGGER.timeit('_prody_blast') LOGGER.info('Blast searching NCBI PDB database for "{0}..."'.format( sequence[:5])) handle = openURL(url, data=data, headers=headers) html = handle.read() index = html.find(b'RID =') if index == -1: raise Exception('NCBI did not return expected response.') else: last = html.find(b'\n', index) rid = html[index + len('RID ='):last].strip() index = html.find(b'RTOE =') if index == -1: rtoe = None # This is not used else: last = html.find(b'\n', index) rtoe = int(html[index + len('RTOE ='):last].strip()) query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500), ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')] data = urlencode(query) while True: LOGGER.sleep(int(sleep), 'to reconnect to NCBI for search results.') LOGGER.write('Connecting to NCBI for search results...') handle = openURL(url, data=data, headers=headers) results = handle.read() index = results.find(b'Status=') LOGGER.clear() if index < 0: break last = results.index(b'\n', index) status = results[index + len('Status='):last].strip() if status.upper() == b'READY': break sleep = int(sleep * 1.5) if LOGGER.timing('_prody_blast') > timeout: LOGGER.warn('Blast search time out.') return None LOGGER.clear() LOGGER.report('Blast search completed in %.1fs.', '_prody_blast') try: ext_xml = filename.lower().endswith('.xml') except AttributeError: pass else: if not ext_xml: filename += 
'.xml' out = open(filename, 'w') if PY3K: out.write(results.decode()) else: out.write(results) out.close() LOGGER.info('Results are saved as {0}.'.format(repr(filename))) return PDBBlastRecord(results, sequence)
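A hedged usage sketch; ``getBest`` and ``getHits`` are the accessors a ProDy-style :class:`PDBBlastRecord` is assumed to provide::

    # search and keep the raw XML for later re-parsing
    record = blastPDB('runexample', filename='blast_results.xml',
                      hitlist_size=100, timeout=240)
    if record is not None:          # None signals a polling timeout
        best = record.getBest()     # e.g. best['pdb_id'], best['percent_identity']
        hits = record.getHits(percent_identity=90, percent_overlap=70)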
def calcPerturbResponse(model, atoms=None, **kwargs): """Returns a matrix of profiles from scanning the response of the structure to random perturbations at specific atom (or node) positions. The function implements the perturbation response scanning (PRS) method described in [CA09]_. Rows of the matrix are the average magnitude of the responses obtained by perturbing the atom/node position at that row index, i.e. ``prs_profile[i,j]`` will give the response of residue/node *j* to perturbations in residue/node *i*. PRS is performed using the covariance matrix from *model*, e.g. :class:`.ANM` instance. When an *atoms* instance is given, the PRS matrix will be added as data, which can be retrieved with ``atoms.getData('prs_matrix')``. *model* and *atoms* must have the same number of atoms. *atoms* must be an :class:`.AtomGroup` instance. .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning Reveals Ligand Entry-Exit Mechanisms of Ferric Binding Protein. *PLoS Comput Biol* **2009** 5(10):e1000544. """ if not isinstance(model, (NMA, ModeSet, Mode)): raise TypeError('model must be an NMA, ModeSet, or Mode instance') if isinstance(model, NMA) and len(model) == 0: raise ValueError('model must have normal modes calculated') atoms = kwargs.get('atoms', None) if atoms is not None: if isinstance(atoms, Selection): atoms = atoms.copy() if not isinstance(atoms, AtomGroup): raise TypeError('atoms must be an AtomGroup instance') elif atoms.numAtoms() != model.numAtoms(): raise ValueError('model and atoms must have the same number atoms') n_atoms = model.numAtoms() LOGGER.timeit('_prody_prs_all') LOGGER.info('Calculating covariance matrix') LOGGER.timeit('_prody_cov') cov = model.getCovariance() LOGGER.clear() LOGGER.report('Covariance matrix calculated in %.1fs.', '_prody_cov') LOGGER.info('Calculating perturbation response') LOGGER.timeit('_prody_prs_mat') if not model.is3d(): prs_matrix = cov**2 else: cov_squared = cov**2 n_by_3n_cov_squared = np.zeros((n_atoms, 3 * n_atoms)) prs_matrix = np.zeros((n_atoms, n_atoms)) i3 = -3 i3p3 = 0 for i in range(n_atoms): i3 += 3 i3p3 += 3 n_by_3n_cov_squared[i, :] = (cov_squared[i3:i3p3, :]).sum(0) j3 = -3 j3p3 = 0 for j in range(n_atoms): j3 += 3 j3p3 += 3 prs_matrix[:, j] = (n_by_3n_cov_squared[:, j3:j3p3]).sum(1) LOGGER.clear() LOGGER.report('Perturbation response matrix calculated in %.1fs.', '_prody_prs_mat') no_diag = kwargs.get('no_diag', False) #filename = kwargs.get('filename', None) norm_prs_matrix = np.zeros((n_atoms, n_atoms)) self_dp = np.diag(prs_matrix) self_dp = self_dp.reshape(n_atoms, 1) norm_prs_matrix = prs_matrix / np.repeat(self_dp, n_atoms, axis=1) effectiveness = np.mean(norm_prs_matrix, axis=1) sensitivity = np.mean(norm_prs_matrix, axis=0) if no_diag: # suppress the diagonal (self displacement) to facilitate # visualizing the response profile norm_prs_matrix = norm_prs_matrix - np.diag(np.diag(norm_prs_matrix)) #if filename: # np.savetxt(filename, norm_prs_matrix, delimiter='\t', fmt='%8.6f') LOGGER.report('Perturbation response scanning completed in %.1fs.', '_prody_prs_all') if atoms is not None: try: ag = atoms.getAtomGroup() defdata = np.zeros(ag.numAtoms(), dtype=float) ag.setData('effectiveness', defdata.copy()) ag.setData('sensitivity', defdata.copy()) except AttributeError: pass atoms.setData('effectiveness', effectiveness) atoms.setData('sensitivity', sensitivity) #atoms.setData('prs_matrix', norm_prs_matrix) return norm_prs_matrix, effectiveness, sensitivity
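Because this variant also returns per-residue effectiveness and sensitivity profiles, callers can rank residues directly; ``gnm`` and ``calphas`` below stand for any calculated model and matching atoms::

    import numpy as np

    prs_matrix, effectiveness, sensitivity = calcPerturbResponse(
        gnm, atoms=calphas)

    # residues that most effectively propagate perturbations, and the
    # residues most sensitive to perturbations elsewhere
    top_effectors = np.argsort(effectiveness)[::-1][:10]
    top_sensors = np.argsort(sensitivity)[::-1][:10]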
def writeDCD(filename, trajectory, start=None, stop=None, step=None, align=False): """Write 32-bit CHARMM format DCD file (also NAMD 2.1 and later). *trajectory can be an :class:`Trajectory`, :class:`DCDFile`, or :class:`Ensemble` instance. *filename* is returned upon successful output of file.""" if not isinstance(trajectory, (TrajBase, Ensemble, Atomic)): raise TypeError('{0:s} is not a valid type for trajectory' .format(type(trajectory))) irange = range(*slice(start, stop, step).indices(trajectory.numCoordsets())) n_csets = len(irange) if n_csets == 0: raise ValueError('trajectory does not have any coordinate sets, or ' 'no coordinate sets are selected') if isinstance(trajectory, Atomic): isEnsemble = False isAtomic = True n_atoms = trajectory.numAtoms() else: isEnsemble = True isAtomic = False n_atoms = trajectory.numSelected() if n_atoms == 0: raise ValueError('no atoms are selected in the trajectory') if isinstance(trajectory, TrajBase): isTrajectory = True unitcell = trajectory.hasUnitcell() nfi = trajectory.nextIndex() trajectory.reset() pack_i_48 = pack('i', 48) if isinstance(trajectory, Trajectory): timestep = trajectory.getTimestep()[0] first_ts = trajectory.getFirstTimestep()[0] framefreq = trajectory.getFrameFreq()[0] n_fixed = trajectory.numFixed()[0] else: timestep = trajectory.getTimestep() first_ts = trajectory.getFirstTimestep() framefreq = trajectory.getFrameFreq() n_fixed = trajectory.numFixed() else: isTrajectory = False unitcell = False if isinstance(trajectory, Ensemble): frame = trajectory[0] else: frame = trajectory acsi = trajectory.getACSIndex() timestep = 1 first_ts = 0 framefreq = 1 n_fixed = 0 dcd = DCDFile(filename, mode='w') LOGGER.progress('Writing DCD', len(irange), '_prody_writeDCD') prev = -1 uc = None time_ = time() for j, i in enumerate(irange): diff = i - prev if diff > 1: trajectory.skip(diff-1) prev = i if isTrajectory: frame = trajectory.next() if frame is None: break if unitcell: uc = frame._getUnitcell() uc[3:] = np.sin((PISQUARE/90) * (90-uc[3:])) uc = uc[[0,3,1,4,5,2]] elif isEnsemble: frame._index = i else: frame.setACSIndex(i) if align: frame.superpose() if j == 0: dcd.write(frame._getCoords(), uc, timestep=timestep, firsttimestep=first_ts, framefreq=framefreq) else: dcd.write(frame._getCoords(), uc) LOGGER.update(i, '_prody_writeDCD') if isAtomic: trajectory.setACSIndex(acsi) j += 1 LOGGER.clear() dcd.close() time_ = time() - time_ or 0.01 dcd_size = 1.0 * (56 + (n_atoms * 3 + 6) * 4 ) * n_csets / (1024*1024) LOGGER.info('DCD file was written in {0:.2f} seconds.'.format(time_)) LOGGER.info('{0:.2f} MB written at input rate {1:.2f} MB/s.' .format(dcd_size, dcd_size/time_)) LOGGER.info('{0:d} coordinate sets written at output rate {1:d} frame/s.' .format(n_csets, int(n_csets/time_))) if j != n_csets: LOGGER.warn('Warning: {0:d} frames expected, {1:d} written.' .format(n_csets, j)) if isTrajectory: trajectory.goto(nfi) return filename
def calcPerturbResponse(model, atoms=None, repeats=100): """Returns a matrix of profiles from scanning the response of the structure to random perturbations at specific atom (or node) positions. The function implements the perturbation response scanning (PRS) method described in [CA09]_. Rows of the matrix are the average magnitude of the responses obtained by perturbing the atom/node position at that row index, i.e. ``prs_profile[i,j]`` will give the response of residue/node *j* to perturbations in residue/node *i*. PRS is performed using the covariance matrix from *model*, e.g. a :class:`.ANM` instance. Each residue/node is perturbed *repeats* times with a random unit force vector. When an *atoms* instance is given, the PRS profile for residues will be added as an attribute which can then be retrieved as ``atoms.getData('prs_profile')``. *model* and *atoms* must have the same number of atoms. *atoms* must be an :class:`.AtomGroup` instance. .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning Reveals Ligand Entry-Exit Mechanisms of Ferric Binding Protein. *PLoS Comput Biol* **2009** 5(10):e1000544. The PRS matrix can be saved as follows:: prs_matrix = calcPerturbResponse(p38_anm) writeArray('prs_matrix.txt', prs_matrix, format='%8.6f', delimiter='\t') """ if not isinstance(model, NMA): raise TypeError('model must be an NMA instance') elif not model.is3d(): raise TypeError('model must be a 3-dimensional NMA instance') elif len(model) == 0: raise ValueError('model must have normal modes calculated') if atoms is not None: if not isinstance(atoms, AtomGroup): raise TypeError('atoms must be an AtomGroup instance') elif atoms.numAtoms() != model.numAtoms(): raise ValueError('model and atoms must have the same number of atoms') assert isinstance(repeats, int), 'repeats must be an integer' cov = calcCovariance(model) if cov is None: raise ValueError('model did not return a covariance matrix') n_atoms = model.numAtoms() response_matrix = np.zeros((n_atoms, n_atoms)) LOGGER.progress('Calculating perturbation response', n_atoms, '_prody_prs') i3 = -3 i3p3 = 0 for i in range(n_atoms): i3 += 3 i3p3 += 3 forces = np.random.rand(repeats * 3).reshape((repeats, 3)) forces /= ((forces**2).sum(1)**0.5).reshape((repeats, 1)) for force in forces: response_matrix[i] += ( np.dot(cov[:, i3:i3p3], force) ** 2).reshape((n_atoms, 3)).sum(1) LOGGER.update(i, '_prody_prs') response_matrix /= repeats LOGGER.clear() LOGGER.report('Perturbation response scanning completed in %.1fs.', '_prody_prs') if atoms is not None: atoms.setData('prs_profile', response_matrix) return response_matrix # save the original PRS matrix np.savetxt('orig_PRS_matrix', response_matrix, delimiter='\t', fmt='%8.6f') # calculate the normalized PRS matrix self_dp = np.diag(response_matrix) # using self displacement (diagonal of # the original matrix) as a # normalization factor self_dp = self_dp.reshape(n_atoms, 1) norm_PRS_mat = response_matrix / np.repeat(self_dp, n_atoms, axis=1) # suppress the diagonal (self displacement) to facilitate # visualizing the response profile norm_PRS_mat = norm_PRS_mat - np.diag(np.diag(norm_PRS_mat)) np.savetxt('norm_PRS_matrix', norm_PRS_mat, delimiter='\t', fmt='%8.6f') return response_matrix
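The inner loop applies each random unit force *f* at node *i* and accumulates the squared displacements ``cov[:, 3*i:3*i+3] @ f`` per node. A self-contained numpy sketch of one row of the response matrix, using a synthetic covariance::

    import numpy as np

    n_atoms, repeats, i = 5, 100, 2
    rng = np.random.default_rng(0)
    a = rng.standard_normal((3 * n_atoms, 3 * n_atoms))
    cov = a @ a.T                             # synthetic symmetric covariance

    forces = rng.standard_normal((repeats, 3))
    forces /= np.linalg.norm(forces, axis=1, keepdims=True)   # unit forces

    row = np.zeros(n_atoms)
    for f in forces:
        disp = cov[:, 3 * i:3 * i + 3] @ f    # 3N-vector response
        row += (disp ** 2).reshape(n_atoms, 3).sum(1)
    row /= repeats   # average response of every node to perturbations at i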
def blastPDB(sequence, filename=None, **kwargs): """Return a :class:`PDBBlastRecord` instance that contains results from blast searching of ProteinDataBank database *sequence* using NCBI blastp. :arg sequence: single-letter code amino acid sequence of the protein without any gap characters, all white spaces will be removed :type sequence: str :arg filename: a *filename* to save the results in XML format :type filename: str *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``) search parameters can be adjusted by the user. *sleep* keyword argument (default is ``2`` seconds) determines how long to wait to reconnect for results. Sleep time is doubled when results are not ready. *timeout* (default is 30 seconds) determines when to give up waiting for the results. """ if sequence == 'runexample': sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI' 'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN' 'DAYDIVKMKKSNISPNFNFMGQLLDFERTL') elif isinstance(sequence, str): sequence = ''.join(sequence.split()) if not checkSequence(sequence): raise ValueError(repr(sequence) + ' is not a valid sequence') else: raise TypeError('sequence must be a string') query = [('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'), ('PROGRAM', 'blastp'),] expect = kwargs.pop('expect', 10e-10) assert isinstance(expect, (float, int)), 'expect must be a float' assert expect > 0, 'expect must be a positive number' query.append(('EXPECT', expect)) hitlist_size = kwargs.pop('hitlist_size', 250) assert isinstance(hitlist_size, int), 'hitlist_size must be an integer' assert hitlist_size > 0, 'expect must be a positive integer' query.append(('HITLIST_SIZE', hitlist_size)) query.append(('QUERY', sequence)) query.append(('CMD', 'Put')) sleep = float(kwargs.pop('sleep', 2)) timeout = float(kwargs.pop('timeout', 20)) if kwargs: LOGGER.warning("Keyword argument(s) '{0:s}' are not used." 
.format("', '".join(kwargs.keys()))) import urllib, urllib2 url = 'http://blast.ncbi.nlm.nih.gov/Blast.cgi' data = urllib.urlencode(query) LOGGER.timeit() LOGGER.info('Blast searching NCBI PDB database for "{0:s}..."' .format(sequence[:5])) request = urllib2.Request(url, data, {'User-agent': 'ProDy'}) handle = urllib2.urlopen(request) html = handle.read() index = html.find('RID =') if index == -1: raise Exception('NCBI did not return expected response.') else: last = html.find('\n', index) rid = html[index + len('RID ='):last].strip() index = html.find('RTOE =') if index == -1: rtoe = None # This is not used else: last = html.find('\n', index) rtoe = int(html[index + len('RTOE ='):last].strip()) query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500), ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')] data = urllib.urlencode(query) while True: LOGGER.sleep(int(sleep), ' to connect NCBI for search results.') LOGGER.write('Connecting NCBI for search results...') request = urllib2.Request(url, data, {'User-agent': 'ProDy'}) handle = urllib2.urlopen(request) results = handle.read() index = results.find('Status=') LOGGER.clear() if index < 0: break last = results.index('\n', index) status = results[index+len('Status='):last].strip() if status.upper() == 'READY': break sleep *= 2 if LOGGER.timing() > timeout: LOGGER.warning('Blast search time out.') return None LOGGER.clear() LOGGER.timing('Blast search completed in %.1fs.') if isinstance(filename, str): if not filename.lower().endswith('.xml'): filename += '.xml' out = open(filename, 'w') out.write(results) out.close() LOGGER.info('Results are saved as {0:s}.'.format(filename)) return PDBBlastRecord(results, sequence)
def searchEmsurfer(emd, **kwargs): """Search with the EM-Surfer server with input of EMD ID (or local EMD file). EM-Surfer server: http://kiharalab.org/em-surfer/ :arg emd: EMD code or local EMD map file for the query protein """ import requests from requests.models import Request LOGGER.timeit('_emsurfer') # timeout = 120 timeout = kwargs.pop('timeout', 120) emsurferURL = "http://kiharalab.org/em-surfer/cgi-bin/listResults.cgi" volumeFilter = kwargs.get('volumeFilter', 'on') representation = kwargs.get('representation','recommend') minResolution = kwargs.get('minResolution', 0.5) maxResolution = kwargs.get('maxResolution', 30.) if isinstance(emd, EMDMAP): emdmap = emd stream = createStringIO() writeEMD(stream, emdmap) data = stream.getvalue() stream.close() files = {"file1" : data} emdId = emdmap.getTitle() emdId = '' # reset: the map itself is uploaded, so no emdbid is sent emsurfer_title = 'Title_'+emdId elif isinstance(emd, str): if os.path.isfile(emd): emdmap = parseEMD(emd) stream = createStringIO() writeEMD(stream, emdmap) data = stream.getvalue() stream.close() filename = os.path.basename(emd) filename, ext = os.path.splitext(filename) if ext.lower() == '.gz': filename2, ext2 = os.path.splitext(filename) if ext2.lower() == '.emd': filename = filename2 emdId = filename files = {"file1" : data} emdId = '' # reset likewise: uploaded maps are submitted without an emdbid emsurfer_title = 'Title_' + emdId else: emdId = emd emsurfer_title = 'Title_' + emdId files = None data = None method='post' url=emsurferURL params = { 'emdbid' : emdId, 'volumefilter' : volumeFilter, 'representation' : representation, 'minresolution': minResolution, 'maxresolution': maxResolution } # Generate request url deep inside; *files*, *data*, and the popped *timeout* are kept as set above headers=None; cookies=None; auth=None; allow_redirects=True; proxies=None hooks=None; stream=None; verify=None; cert=None; json=None req = Request( method=method.upper(), url=url, headers=headers, files=files, data=data or {}, json=json, params=params or {}, auth=auth, cookies=cookies, hooks=hooks, ) session = requests.sessions.Session() prep = session.prepare_request(req) resp = session.send(prep) url = resp.url LOGGER.debug('Submitted Emsurfer search for EMD "{0}".'.format(emdId)) LOGGER.info(url) LOGGER.clear() obj = EmsurferRecord(url, emdId, timeout=timeout, **kwargs) return obj
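A usage sketch; the EMDB accession and file path are illustrative::

    # query by EMDB accession
    rec = searchEmsurfer('1884', volumeFilter='on', maxResolution=10.)

    # or submit a local map, which is parsed and serialized before upload
    rec = searchEmsurfer('maps/emd_1884.map')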
def blastPDBUniProtKB(sequence, filename=None, **kwargs): """Returns a :class:`PDBBlastRecord` instance that contains results from blast searching of ProteinDataBank database *sequence* using NCBI blastp. :arg sequence: single-letter code amino acid sequence of the protein without any gap characters, all white spaces will be removed :type sequence: str :arg filename: a *filename* to save the results in XML format :type filename: str *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``) search parameters can be adjusted by the user. *sleep* keyword argument (default is ``2`` seconds) determines how long to wait to reconnect for results. Sleep time is doubled when results are not ready. *timeout* (default is 120s) determines when to give up waiting for the results. *num_sequences (default is ``1``) """ num_sequences = int(kwargs.pop('num_sequences', 1)) if sequence == 'runexample': sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI' 'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN' 'DAYDIVKMKKSNISPNFNFMGQLLDFERTL') else: if num_sequences == 1: try: sequence = ''.join(sequence.split()) _ = sequence.isalpha() except AttributeError: raise TypeError('sequence must be a string') else: if not _: raise ValueError('not a valid protein sequence') headers = {'User-agent': 'ProDy'} query = [('DATABASE', 'swissprot'), ('ENTREZ_QUERY', '(none)'), ('PROGRAM', 'blastp'),] expect = float(kwargs.pop('expect', 10e-5)) if expect <= 0: raise ValueError('expect must be a positive number') query.append(('EXPECT', expect)) hitlist_size = int(kwargs.pop('hitlist_size', 250)) if hitlist_size <= 0: raise ValueError('expect must be a positive integer') psiblast = 'true' step_number = 3 query.append(('RUN_PSIBLAST', psiblast)) query.append(('HITLIST_SIZE', hitlist_size)) query.append(('QUERY', sequence)) query.append(('CMD', 'Put')) query.append(('STEP_NUMBER', step_number)) sleep = float(kwargs.pop('sleep', 2)) timeout = float(kwargs.pop('timeout', 120)) if kwargs: LOGGER.warn('Keyword argument(s) {0} are not used.' 
.format(', '.join([repr(key) for key in kwargs]))) try: import urllib.parse urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8') except ImportError: from urllib import urlencode url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi' data = urlencode(query) LOGGER.timeit('_prody_blast') LOGGER.info('Blast searching NCBI PDB database for "{0}..."' .format(sequence[:5])) handle = openURL(url, data=data, headers=headers) html = handle.read() index = html.find(b'name="RID" type="hidden" value="') if index == -1: raise Exception('NCBI did not return expected response.') else: last = html.find(b'>',index) rid = html[index + len('name="RID" type="hidden" value="'):last-1].strip() index = html.find(b'name="RTOE" type="hidden" value="') if index == -1: rtoe = None # This is not used else: last = html.find(b'>', index) rtoe = html[index + len('name="RTOE" type="hidden" value="'):last-1].strip() query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500), ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')] data = urlencode(query) while True: LOGGER.sleep(int(sleep), 'to reconnect to NCBI for search results.') LOGGER.write('Connecting to NCBI for search results...') handle = openURL(url, data=data, headers=headers) results = handle.read() index = results.find(b'Status=') LOGGER.clear() if index < 0: break last = results.index(b'\n', index) status = results[index+len('Status='):last].strip() if status.upper() == b'READY': break sleep = int(sleep * 1.5) if LOGGER.timing('_prody_blast') > timeout: LOGGER.warn('Blast search time out.') return None LOGGER.clear() LOGGER.report('Blast search completed in %.1fs.', '_prody_blast') try: ext_xml = filename.lower().endswith('.xml') except AttributeError: pass else: if not ext_xml: filename += '.xml' out = open(filename, 'w') if PY3K: out.write(results.decode()) else: out.write(results) out.close() LOGGER.info('Results are saved as {0}.'.format(repr(filename))) return SwissProtBlastRecord(results, sequence)
def calcPerturbResponse(model, atoms=None, **kwargs): """Returns a matrix of profiles from scanning the response of the structure to random perturbations at specific atom (or node) positions. The function implements the perturbation response scanning (PRS) method described in [CA09]_. Rows of the matrix are the average magnitude of the responses obtained by perturbing the atom/node position at that row index, i.e. ``prs_profile[i,j]`` will give the response of residue/node *j* to perturbations in residue/node *i*. PRS is performed using the covariance matrix from *model*, e.g. :class:`.ANM` instance. When an *atoms* instance is given, the PRS matrix will be added as data, which can be retrieved with ``atoms.getData('prs_matrix')``. *model* and *atoms* must have the same number of atoms. *atoms* must be an :class:`.AtomGroup` instance. .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning Reveals Ligand Entry-Exit Mechanisms of Ferric Binding Protein. *PLoS Comput Biol* **2009** 5(10):e1000544. """ if not isinstance(model, (NMA, ModeSet, Mode)): raise TypeError('model must be an NMA, ModeSet, or Mode instance') if isinstance(model, NMA) and len(model) == 0: raise ValueError('model must have normal modes calculated') atoms = kwargs.get('atoms',None) if atoms is not None: if isinstance(atoms, Selection): atoms = atoms.copy() if not isinstance(atoms, AtomGroup): raise TypeError('atoms must be an AtomGroup instance') elif atoms.numAtoms() != model.numAtoms(): raise ValueError('model and atoms must have the same number atoms') n_atoms = model.numAtoms() LOGGER.timeit('_prody_prs_all') LOGGER.info('Calculating covariance matrix') LOGGER.timeit('_prody_cov') cov = model.getCovariance() LOGGER.clear() LOGGER.report('Covariance matrix calculated in %.1fs.', '_prody_cov') LOGGER.info('Calculating perturbation response') LOGGER.timeit('_prody_prs_mat') if not model.is3d(): prs_matrix = cov**2 else: cov_squared = cov**2 n_by_3n_cov_squared = np.zeros((n_atoms, 3 * n_atoms)) prs_matrix = np.zeros((n_atoms, n_atoms)) i3 = -3 i3p3 = 0 for i in range(n_atoms): i3 += 3 i3p3 += 3 n_by_3n_cov_squared[i,:] = (cov_squared[i3:i3p3,:]).sum(0) j3 = -3 j3p3 = 0 for j in range(n_atoms): j3 += 3 j3p3 += 3 prs_matrix[:,j] = (n_by_3n_cov_squared[:,j3:j3p3]).sum(1) LOGGER.clear() LOGGER.report('Perturbation response matrix calculated in %.1fs.', '_prody_prs_mat') no_diag = kwargs.get('no_diag', True) #filename = kwargs.get('filename', None) norm_prs_matrix = np.zeros((n_atoms, n_atoms)) self_dp = np.diag(prs_matrix) self_dp = self_dp.reshape(n_atoms, 1) norm_prs_matrix = prs_matrix / np.repeat(self_dp, n_atoms, axis=1) if no_diag: # suppress the diagonal (self displacement) to facilitate # visualizing the response profile norm_prs_matrix = norm_prs_matrix - np.diag(np.diag(norm_prs_matrix)) W = 1 - np.eye(n_atoms) effectiveness = np.average(norm_prs_matrix, weights=W, axis=1) sensitivity = np.average(norm_prs_matrix, weights=W, axis=0) #if filename: # np.savetxt(filename, norm_prs_matrix, delimiter='\t', fmt='%8.6f') LOGGER.report('Perturbation response scanning completed in %.1fs.', '_prody_prs_all') if atoms is not None: try: ag = atoms.getAtomGroup() defdata = np.zeros(ag.numAtoms(), dtype=float) ag.setData('effectiveness', defdata.copy()) ag.setData('sensitivity', defdata.copy()) except AttributeError: pass atoms.setData('effectiveness', effectiveness) atoms.setData('sensitivity', sensitivity) #atoms.setData('prs_matrix', norm_prs_matrix) return norm_prs_matrix, effectiveness, sensitivity
def calcPerturbResponse(model, **kwargs): """This function implements the perturbation response scanning (PRS) method described in [CA09]_ and [IG14]_. It returns a PRS matrix, and effectiveness and sensitivity profiles. Rows of the matrix are the average magnitude of the responses obtained by perturbing the atom/node position at that row index, i.e. ``prs_matrix[i,j]`` will give the response of residue/node *j* to perturbations in residue/node *i*. PRS is performed using the covariance matrix from a *model*, e.g. a :class:`.ANM` instance. To use an external matrix, please provide it to a :class:`.PCA` instance using the :meth:`.PCA.setCovariance`. When an *atoms* instance is given, the PRS matrix will be added as data, which can be retrieved with ``atoms.getData('prs_matrix')``. *model* and *atoms* must have the same number of atoms. *atoms* must be an :class:`.AtomGroup` instance. .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning Reveals Ligand Entry-Exit Mechanisms of Ferric Binding Protein. *PLoS Comput Biol* **2009** 5(10):e1000544. .. [IG14] General IJ, Liu Y, Blackburn ME, Mao W, Gierasch LM, Bahar I. ATPase subdomain IA is a mediator of interdomain allostery in Hsp70 molecular chaperones. *PLoS Comput. Biol.* **2014** 10:e1003624. If *turbo* is **True** (default), then PRS is approximated by the limit of large numbers of forces and no perturbation forces are explicitly applied. If set to **False**, then each residue/node is perturbed *repeats* times (default 100) with a random unit force vector as in ProDy v1.8 and earlier. """ if not isinstance(model, (NMA, ModeSet, Mode)): raise TypeError('model must be an NMA, ModeSet, or Mode instance') if isinstance(model, NMA) and len(model) == 0: raise ValueError('model must have normal modes calculated') atoms = kwargs.get('atoms', None) suppress_diag = kwargs.get('suppress_diag', False) no_diag = kwargs.get('no_diag', suppress_diag) if atoms is not None: if isinstance(atoms, Selection): atoms = atoms.copy() if not isinstance(atoms, AtomGroup): raise TypeError('atoms must be an AtomGroup instance') elif atoms.numAtoms() != model.numAtoms(): raise ValueError('model and atoms must have the same number atoms') n_atoms = model.numAtoms() # LOGGER.timeit('_prody_prs_all') # LOGGER.info('Calculating covariance matrix') # LOGGER.timeit('_prody_cov') cov = model.getCovariance() turbo = kwargs.get('turbo', True) if turbo: if not model.is3d(): prs_matrix = cov**2 else: cov_squared = cov**2 n_by_3n_cov_squared = np.zeros((n_atoms, 3 * n_atoms)) prs_matrix = np.zeros((n_atoms, n_atoms)) i3 = -3 i3p3 = 0 for i in range(n_atoms): i3 += 3 i3p3 += 3 n_by_3n_cov_squared[i, :] = (cov_squared[i3:i3p3, :]).sum(0) j3 = -3 j3p3 = 0 for j in range(n_atoms): j3 += 3 j3p3 += 3 prs_matrix[:, j] = (n_by_3n_cov_squared[:, j3:j3p3]).sum(1) else: repeats = kwargs.pop('repeats', 100) LOGGER.info( 'Calculating perturbation response with {0} repeats'.format( repeats)) LOGGER.timeit('_prody_prs_mat') response_matrix = np.zeros((n_atoms, n_atoms)) LOGGER.progress('Calculating perturbation response', n_atoms, '_prody_prs') i3 = -3 i3p3 = 0 for i in range(n_atoms): i3 += 3 i3p3 += 3 forces = np.random.rand(repeats * 3).reshape((repeats, 3)) forces /= ((forces**2).sum(1)**0.5).reshape((repeats, 1)) for force in forces: response_matrix[i] += (np.dot(cov[:, i3:i3p3], force)**2).reshape( (n_atoms, 3)).sum(1) LOGGER.update(i, '_prody_prs') response_matrix /= repeats LOGGER.clear() LOGGER.report('Perturbation response matrix calculated in %.1fs.', '_prody_prs_mat') 
if not turbo: # the sampled loop stores its result in response_matrix prs_matrix = response_matrix norm_prs_matrix = np.zeros((n_atoms, n_atoms)) self_dp = np.diag(prs_matrix) self_dp = self_dp.reshape(n_atoms, 1) re_self_dp = np.repeat(self_dp, n_atoms, axis=1) norm_prs_matrix = div0(prs_matrix, re_self_dp) if no_diag: # suppress the diagonal (self displacement) to facilitate # visualizing the response profile norm_prs_matrix = norm_prs_matrix - np.diag(np.diag(norm_prs_matrix)) W = 1 - np.eye(n_atoms) effectiveness = np.average(norm_prs_matrix, weights=W, axis=1) sensitivity = np.average(norm_prs_matrix, weights=W, axis=0) # LOGGER.report('Perturbation response scanning completed in %.1fs.', # '_prody_prs_all') if atoms is not None: try: ag = atoms.getAtomGroup() defdata = np.zeros(ag.numAtoms(), dtype=float) ag.setData('effectiveness', defdata.copy()) ag.setData('sensitivity', defdata.copy()) except AttributeError: pass atoms.setData('effectiveness', effectiveness) atoms.setData('sensitivity', sensitivity) #atoms.setData('prs_matrix', norm_prs_matrix) return norm_prs_matrix, effectiveness, sensitivity
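The *turbo* path is the closed-form limit of the sampled loop: for random unit forces, E[*ff*^T] = I/3, so the averaged squared response converges to one third of the summed squared covariance couplings, and the constant cancels in the diagonal normalization above. A toy numpy check of that claim on a synthetic covariance::

    import numpy as np

    rng = np.random.default_rng(1)
    n, repeats = 4, 20000
    a = rng.standard_normal((3 * n, 3 * n))
    cov = a @ a.T                              # synthetic covariance

    # turbo: summed squared couplings between the 3x3 blocks of i and j
    turbo = (cov ** 2).reshape(n, 3, n, 3).sum(axis=(1, 3))

    # sampled: average squared response to random unit forces at each node
    sampled = np.zeros((n, n))
    for i in range(n):
        f = rng.standard_normal((repeats, 3))
        f /= np.linalg.norm(f, axis=1, keepdims=True)
        disp = cov[:, 3 * i:3 * i + 3] @ f.T   # shape (3n, repeats)
        sampled[i] = (disp ** 2).reshape(n, 3, repeats).sum(1).mean(1)

    # E[f f^T] = I/3, so sampled -> turbo / 3 as repeats grows
    print(np.abs(3 * sampled / turbo - 1).max())   # ~1e-2 at this sample size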
def writeDCD(filename, trajectory, start=None, stop=None, step=None, align=False): """Write 32-bit CHARMM format DCD file (also NAMD 2.1 and later). *trajectory can be an :class:`Trajectory`, :class:`DCDFile`, or :class:`Ensemble` instance. *filename* is returned upon successful output of file.""" if not isinstance(trajectory, (TrajBase, Ensemble, Atomic)): raise TypeError('{0} is not a valid type for trajectory'.format( type(trajectory))) irange = list( range(*slice(start, stop, step).indices(trajectory.numCoordsets()))) n_csets = len(irange) if n_csets == 0: raise ValueError('trajectory does not have any coordinate sets, or ' 'no coordinate sets are selected') if isinstance(trajectory, Atomic): isEnsemble = False isAtomic = True n_atoms = trajectory.numAtoms() else: isEnsemble = True isAtomic = False n_atoms = trajectory.numSelected() if n_atoms == 0: raise ValueError('no atoms are selected in the trajectory') if isinstance(trajectory, TrajBase): isTrajectory = True unitcell = trajectory.hasUnitcell() nfi = trajectory.nextIndex() trajectory.reset() pack_i_48 = pack('i', 48) if isinstance(trajectory, Trajectory): timestep = trajectory.getTimestep()[0] first_ts = trajectory.getFirstTimestep()[0] framefreq = trajectory.getFrameFreq()[0] n_fixed = trajectory.numFixed()[0] else: timestep = trajectory.getTimestep() first_ts = trajectory.getFirstTimestep() framefreq = trajectory.getFrameFreq() n_fixed = trajectory.numFixed() else: isTrajectory = False unitcell = False if isinstance(trajectory, Ensemble): frame = trajectory[0] else: frame = trajectory acsi = trajectory.getACSIndex() timestep = 1 first_ts = 0 framefreq = 1 n_fixed = 0 dcd = DCDFile(filename, mode='w') LOGGER.progress('Writing DCD', len(irange), '_prody_writeDCD') prev = -1 uc = None time_ = time() for j, i in enumerate(irange): diff = i - prev if diff > 1: trajectory.skip(diff - 1) prev = i if isTrajectory: frame = next(trajectory) if frame is None: break if unitcell: uc = frame._getUnitcell() uc[3:] = np.sin((PISQUARE / 90) * (90 - uc[3:])) uc = uc[[0, 3, 1, 4, 5, 2]] elif isEnsemble: frame._index = i else: frame.setACSIndex(i) if align: frame.superpose() if j == 0: dcd.write(frame._getCoords(), uc, timestep=timestep, firsttimestep=first_ts, framefreq=framefreq) else: dcd.write(frame._getCoords(), uc) LOGGER.update(i, '_prody_writeDCD') if isAtomic: trajectory.setACSIndex(acsi) j += 1 LOGGER.clear() dcd.close() time_ = time() - time_ or 0.01 dcd_size = 1.0 * (56 + (n_atoms * 3 + 6) * 4) * n_csets / (1024 * 1024) LOGGER.info('DCD file was written in {0:.2f} seconds.'.format(time_)) LOGGER.info('{0:.2f} MB written at input rate {1:.2f} MB/s.'.format( dcd_size, dcd_size / time_)) LOGGER.info( '{0} coordinate sets written at output rate {1} frame/s.'.format( n_csets, int(n_csets / time_))) if j != n_csets: LOGGER.warn('Warning: {0} frames expected, {1} written.'.format( n_csets, j)) if isTrajectory: trajectory.goto(nfi) return filename
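A usage sketch for the Python 3 variant above; the file names are illustrative, and the linked-trajectory calls follow ProDy's trajectory API::

    from prody import parsePDB, Trajectory

    structure = parsePDB('mdm2.pdb')           # illustrative input files
    traj = Trajectory('mdm2.dcd')
    traj.link(structure)
    traj.setCoords(structure)
    traj.setAtoms(structure.select('calpha'))  # write only the C-alphas

    # every 10th frame, superposed onto the reference coordinates
    writeDCD('mdm2_calpha.dcd', traj, step=10, align=True)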
def psiBlastCycle(sequence=None, filename=None, **kwargs): """Returns a :class:`PDBBlastRecord` instance that contains results from a single cycle of EBI psiblast. :arg sequence: an object with an associated sequence string or a sequence string itself :type sequence: :class:`Atomic`, :class:`Sequence`, or str :arg filename: a *filename* to save the results in XML format :type filename: str The following search parameters can be adjusted by the user. We use the same default values as http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/ wherever applicable. :arg email: email address for reporting problems default is [email protected] :type email: str with an @ before a . :arg matrix: The comparison matrix to be used to score alignments when searching the database possible values are 'BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30' and 'PAM70' default is 'BLOSUM62' :type matrix: str :arg gapopen: Penalty taken away from the score when a gap is created in sequence alignments. Increasing the gap opening penalty will decrease the number of gaps in the final alignment. Possible values range from 8 to 16 inclusive, default is 11 :type gapopen: int :arg gapext: Penalty taken away from the score for each base or residue in the gap. Increasing the gap extension penalty favors short gaps in the final alignment, conversly decreasing the gap extension penalty favors long gaps in the final alignment. Possible values range from 0 to 3, default is 1 :type gapext: int :arg expthr: Expectation threshold that limits the number of scores and alignments reported. This is the maximum number of times the match is expected to occur by chance. Possible values are 1.0e-200, 1.0e-100, 1.0e-50, 1.0e-10, 1.0e-5, 1.0e-4, 1.0e-3, 1.0e-2, 0.1, 1.0, 10.0, 100, 1000 default is 10.0 :type expthr: float :arg psithr: Expectation value threshold for automatic selection of matched sequences for inclusion in the PSSM at each iteration. Possible values are 1.0e-6, 1.0e-5, 1.0e-4, 2.0e-4, 5.0e-4, 1.0e-3, 2.0e-3, 5.0e-3, 1.0e-2, 2.0e-2, 0.1, 0.3, 0.5, 1.0, 3.0, 10.0 default is 1.0e-3 :type psithr: float :arg scores: Maximum number of match score summaries reported in the result output. Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000 Default is 500 :type scores: int :arg alignments: Maximum number of match alignments reported in the result output. Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000 Default is 500 :type alignmets: int :arg dropoff: The amount a score can drop before extension of word hits is halted Possible values are 0, 2, 4, 6, 8, 10, 15, 20, 25, or 30 Default is 15 :type dropoff: int :arg finaldropoff: Dropoff value for final gapped alignment Possible values are 10, 12, 14, 16, 18, 20, 22, 24, 25, 26, 28, or 30 Default is 25 :type finaldropoff: int :arg filter: Filter regions of low sequence complexity. This can avoid issues with low complexity sequences where matches are found due to composition rather than meaningful sequence similarity. However, in some cases filtering also masks regions of interest and so should be used with caution. Possible values are T and F, default is F :type filter: str :arg seqrange: Specify a range or section of the input sequence to use in the search. Example: Specifying '34-89' in an input sequence of total length 100, will tell BLAST to only use residues 34 to 89, inclusive. :type seqrange: str of form START-END :arg database: a database name from those available. 
See http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/database default is pdb :type database: str :arg previousjobid: The job identifier for the previous PSI-BLAST iteration. default is None You can change this if you want to continue from a previous run :type previousjobid: str :arg selectedHits: Name of a file containing a list of identifiers of the hits from the previous iteration to use to construct the search PSSM for this iteration. default is None :type selectedHits: str :arg cpfile: Name of a Checkpoint file from the previous iteration. default is None :type cpfile: str :arg sleep: how long to wait to reconnect for status Sleep time is multiplied by 1.5 when results are not ready. default is 2 seconds :type sleep: float :arg timeout: when to give up waiting for the results default is 120 seconds :type timeout: float :arg cycle: cycle number :type cycle: int """ cycle = kwargs.get('cycle',0) if sequence == 'runexample': sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI' 'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN' 'DAYDIVKMKKSNISPNFNFMGQLLDFERTL') elif isinstance(sequence, Atomic): sequence = sequence.calpha.getSequence() elif isinstance(sequence, Sequence): sequence = str(sequence) elif isinstance(sequence, str): if len(sequence) in [4, 5, 6]: ag = parsePDB(sequence) sequence = ag.calpha.getSequence() sequence = ''.join(sequence.split()) elif sequence is None: if cycle == 0: cycle = 1 else: raise TypeError('sequence must be Atomic, Sequence, or str not {0}' .format(type(sequence))) if cycle == 0: query = [('sequence', sequence)] else: query = [] email = kwargs.get('email','*****@*****.**') if not isinstance(email, str): raise TypeError('email must be a string') elif email.find('@') == -1 or email.find('.') == -1 or len(email.split('@')) != 2: raise ValueError('email must be a valid email address with at least one . and exactly one @ sign') elif not email.find('@') < email.find(email.split('.')[-1]): raise ValueError('email must be a valid email address with a . after the @ sign') query.append(('email', email)) query.append(('title', 'ProDy psiBlastPDB request')) previousjobid = kwargs.get('previousjobid','') if previousjobid is not '': query.append(('previousjobid',previousjobid)) selectedHits = kwargs.get('selectedHits','') if selectedHits is not '': query.append(('selectedHits',selectedHits)) database = kwargs.get('database','pdb') checkPsiBlastParameter('database', database) query.append(('database',database)) matrix = kwargs.get('matrix', 'BLOSUM62') checkPsiBlastParameter('matrix', matrix) query.append(('matrix',matrix)) gapopen = kwargs.get('gapopen',11) checkPsiBlastParameter('gapopen', gapopen) query.append(('gapopen',gapopen)) gapext = kwargs.get('gapext',1) checkPsiBlastParameter('gapext', gapext) query.append(('gapext',gapext)) expthr = kwargs.get('expthr', 10.) 
checkPsiBlastParameter('expthr', expthr) query.append(('expthr',expthr)) psithr = kwargs.get('psithr',1.0e-3) checkPsiBlastParameter('psithr', psithr) query.append(('psithr',psithr)) scores = kwargs.get('scores',500) checkPsiBlastParameter('scores', scores) query.append(('scores',scores)) alignments = kwargs.get('alignments',500) checkPsiBlastParameter('alignments', alignments) query.append(('alignments',alignments)) query.append(('alignView',0)) dropoff = kwargs.get('dropoff',15) checkPsiBlastParameter('dropoff', dropoff) query.append(('dropoff',dropoff)) finaldropoff = kwargs.get('finaldropoff',25) checkPsiBlastParameter('finaldropoff', finaldropoff) query.append(('finaldropoff',finaldropoff)) filter = kwargs.get('filter','F') checkPsiBlastParameter('filter', filter) query.append(('filter',filter)) if previousjobid == '' and selectedHits == '': seqrange = kwargs.get('seqrange', None) if seqrange is None: seqrange = '0-' + str(len(sequence)) elif not isinstance(seqrange, str): raise TypeError('seqrange should be a string') elif len(seqrange.split('-')) != 2: raise ValueError('seqrange should take the form START-END') try: start = int(seqrange.split('-')[0]) end = int(seqrange.split('-')[1]) except: raise ValueError('seqrange should be START-END with START and END being integers') query.append(('seqrange',seqrange)) headers = { 'User-Agent' : 'ProDy' } try: import urllib.parse urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8') except ImportError: from urllib import urlencode sleep = float(kwargs.pop('sleep', 2)) timeout = float(kwargs.pop('timeout', 120)) data = urlencode(query) # submit the job base_url = 'http://www.ebi.ac.uk/Tools/services/rest/psiblast/' url = base_url + 'run/' LOGGER.timeit('_prody_psi-blast') if cycle == 0: LOGGER.info('PSI-Blast searching PDB database for "{0}..."' .format(sequence[:5])) else: LOGGER.info('PSI-Blast searching PDB database, cycle={0}' .format(cycle)) handle = openURL(url, data=data, headers=headers) job_id = handle.read() if PY3K: job_id = job_id.decode() handle.close() # check the status url = base_url + 'status/' + job_id handle = openURL(url) status = handle.read() if PY3K: status = status.decode() handle.close() # keep checking the status until it's no longer running while status == 'RUNNING': LOGGER.sleep(int(sleep), 'to reconnect to EBI for status.') LOGGER.write('Connecting to EBI for status...') handle = openURL(url) status = handle.read() if PY3K: status = status.decode() LOGGER.clear() sleep = int(sleep * 1.5) if LOGGER.timing('_prody_psi-blast') > timeout: LOGGER.warn('PSI-Blast search time out.') return None LOGGER.info('The status is {0}'.format(status)) LOGGER.clear() LOGGER.report('PSI-Blast search completed in %.1fs.', '_prody_psi-blast') if cycle != 1: # get the results url = base_url + 'result/' + job_id + '/xml' handle = openURL(url) results = handle.read() if PY3K: results = results.decode() handle.close() try: ext_xml = filename.lower().endswith('.xml') except AttributeError: pass else: if not ext_xml: filename += '.xml' f_out = open(filename, 'w') f_out.write(results) f_out.close() LOGGER.info('Results are saved as {0}.'.format(repr(filename))) return job_id, PsiBlastRecord(results, sequence) else: return job_id
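A sketch of chaining cycles through the function above; the first call submits the sequence, polls, and parses one cycle, and the returned job id seeds the next::

    # first cycle: submit, poll, and parse the results
    job_id, record = psiBlastCycle('runexample', filename='cycle1.xml',
                                   database='pdb', timeout=300)

    # later cycle: reuse the PSSM from the finished job (returns the id only)
    next_id = psiBlastCycle(previousjobid=job_id, cycle=1)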
def fetch(self, xml=None, sequence=None, **kwargs): """Get Blast record from url or file. :arg sequence: an object with an associated sequence string or a sequence string itself :type sequence: :class:`Atomic`, :class:`Sequence`, or str :arg xml: blast search results in XML format or an XML file that contains the results or a filename for saving the results or None :type xml: str :arg timeout: amount of time until the query times out in seconds default value is 120 :type timeout: int """ if self.isSuccess: LOGGER.warn( "The record already exists so not further search is performed") return True if sequence == None: sequence = self._sequence if xml == None: xml = self._xml import xml.etree.cElementTree as ET if xml is not None and len(xml) < 100: if os.path.isfile(xml): xml = ET.parse(xml) root = xml.getroot() else: raise ValueError('xml is not a filename and does not look like' ' a valid XML string') else: headers = {'User-agent': 'ProDy'} query = [ ('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'), ('PROGRAM', 'blastp'), ] expect = float(kwargs.pop('expect', 10e-10)) if expect <= 0: raise ValueError('expect must be a positive number') query.append(('EXPECT', expect)) hitlist_size = int(kwargs.pop('hitlist_size', 250)) if hitlist_size <= 0: raise ValueError('expect must be a positive integer') query.append(('HITLIST_SIZE', hitlist_size)) query.append(('QUERY', sequence)) query.append(('CMD', 'Put')) sleep = float(kwargs.pop('sleep', 2)) timeout = float(kwargs.pop('timeout', self._timeout)) self._timeout = timeout try: import urllib.parse urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8') except ImportError: from urllib import urlencode url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi' data = urlencode(query) LOGGER.timeit('_prody_blast') LOGGER.info( 'Blast searching NCBI PDB database for "{0}..."'.format( sequence[:5])) handle = openURL(url, data=data, headers=headers) html = handle.read() index = html.find(b'RID =') if index == -1: raise Exception('NCBI did not return expected response.') else: last = html.find(b'\n', index) rid = html[index + len('RID ='):last].strip() query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500), ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')] data = urlencode(query) while True: LOGGER.sleep(int(sleep), 'to reconnect to NCBI for search results.') LOGGER.write('Connecting to NCBI for search results...') handle = openURL(url, data=data, headers=headers) results = handle.read() index = results.find(b'Status=') LOGGER.clear() if index < 0: break last = results.index(b'\n', index) status = results[index + len('Status='):last].strip() if status.upper() == b'READY': break sleep = int(sleep * 1.5) if LOGGER.timing('_prody_blast') > timeout: LOGGER.warn('Blast search time out.') return False LOGGER.clear() LOGGER.report('Blast search completed in %.1fs.', '_prody_blast') filename = xml root = ET.XML(results) try: ext_xml = filename.lower().endswith('.xml') except AttributeError: pass else: if not ext_xml: filename += '.xml' out = open(filename, 'w') if PY3K: out.write(results.decode()) else: out.write(results) out.close() LOGGER.info('Results are saved as {0}.'.format(repr(filename))) root = dictElement(root, 'BlastOutput_') if root['db'] != 'pdb': raise ValueError('blast search database in xml must be "pdb"') if root['program'] != 'blastp': raise ValueError( 'blast search program in xml must be "blastp"') self._param = dictElement(root['param'][0], 'Parameters_') query_len = int(root['query-len']) if sequence and len(sequence) != query_len: raise 
ValueError( 'query-len and the length of the sequence do not ' 'match, xml data may not be for given sequence') hits = [] for iteration in root['iterations']: for hit in dictElement(iteration, 'Iteration_')['hits']: hit = dictElement(hit, 'Hit_') data = dictElement(hit['hsps'][0], 'Hsp_') for key in [ 'align-len', 'gaps', 'hit-frame', 'hit-from', 'hit-to', 'identity', 'positive', 'query-frame', 'query-from', 'query-to' ]: data[key] = int(data[key]) data['query-len'] = query_len for key in ['evalue', 'bit-score', 'score']: data[key] = float(data[key]) p_identity = 100.0 * data['identity'] / ( data['query-to'] - data['query-from'] + 1) data['percent_identity'] = p_identity p_overlap = (100.0 * (data['align-len'] - data['gaps']) / query_len) data['percent_coverage'] = p_overlap for item in (hit['id'] + hit['def']).split('>gi'): head, title = item.split(None, 1) head = head.split('|') pdb_id = head[-2].lower() chain_id = head[-1][:1] pdbch = dict(data) pdbch['pdb_id'] = pdb_id pdbch['chain_id'] = chain_id pdbch['title'] = (head[-1][1:] + title).strip() hits.append((p_identity, p_overlap, pdbch)) hits.sort(key=lambda hit: hit[0], reverse=True) self._hits = hits return True
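A hedged sketch of driving :meth:`fetch` on an existing record object; ``record`` stands for any instance exposing this method::

    # re-parse results saved earlier instead of querying NCBI again
    ok = record.fetch(xml='blast_results.xml')

    # or repeat the network search with the stored sequence;
    # False signals a polling timeout
    ok = record.fetch(timeout=300)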
def psiBlastCycle(sequence=None, filename=None, **kwargs): """Returns a :class:`PDBBlastRecord` instance that contains results from a single cycle of EBI psiblast. :arg sequence: an object with an associated sequence string or a sequence string itself :type sequence: :class:`Atomic`, :class:`Sequence`, or str :arg filename: a *filename* to save the results in XML format :type filename: str The following search parameters can be adjusted by the user. We use the same default values as http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/ wherever applicable. :arg email: email address for reporting problems default is [email protected] :type email: str with an @ before a . :arg matrix: The comparison matrix to be used to score alignments when searching the database possible values are 'BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30' and 'PAM70' default is 'BLOSUM62' :type matrix: str :arg gapopen: Penalty taken away from the score when a gap is created in sequence alignments. Increasing the gap opening penalty will decrease the number of gaps in the final alignment. Possible values range from 8 to 16 inclusive, default is 11 :type gapopen: int :arg gapext: Penalty taken away from the score for each base or residue in the gap. Increasing the gap extension penalty favors short gaps in the final alignment, conversly decreasing the gap extension penalty favors long gaps in the final alignment. Possible values range from 0 to 3, default is 1 :type gapext: int :arg expthr: Expectation threshold that limits the number of scores and alignments reported. This is the maximum number of times the match is expected to occur by chance. Possible values are 1.0e-200, 1.0e-100, 1.0e-50, 1.0e-10, 1.0e-5, 1.0e-4, 1.0e-3, 1.0e-2, 0.1, 1.0, 10.0, 100, 1000 default is 10.0 :type expthr: float :arg psithr: Expectation value threshold for automatic selection of matched sequences for inclusion in the PSSM at each iteration. Possible values are 1.0e-6, 1.0e-5, 1.0e-4, 2.0e-4, 5.0e-4, 1.0e-3, 2.0e-3, 5.0e-3, 1.0e-2, 2.0e-2, 0.1, 0.3, 0.5, 1.0, 3.0, 10.0 default is 1.0e-3 :type psithr: float :arg scores: Maximum number of match score summaries reported in the result output. Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000 Default is 500 :type scores: int :arg alignments: Maximum number of match alignments reported in the result output. Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000 Default is 500 :type alignmets: int :arg dropoff: The amount a score can drop before extension of word hits is halted Possible values are 0, 2, 4, 6, 8, 10, 15, 20, 25, or 30 Default is 15 :type dropoff: int :arg finaldropoff: Dropoff value for final gapped alignment Possible values are 10, 12, 14, 16, 18, 20, 22, 24, 25, 26, 28, or 30 Default is 25 :type finaldropoff: int :arg filter: Filter regions of low sequence complexity. This can avoid issues with low complexity sequences where matches are found due to composition rather than meaningful sequence similarity. However, in some cases filtering also masks regions of interest and so should be used with caution. Possible values are T and F, default is F :type filter: str :arg seqrange: Specify a range or section of the input sequence to use in the search. Example: Specifying '34-89' in an input sequence of total length 100, will tell BLAST to only use residues 34 to 89, inclusive. :type seqrange: str of form START-END :arg database: a database name from those available. 
See http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/database default is pdb :type database: str :arg previousjobid: The job identifier for the previous PSI-BLAST iteration. default is None You can change this if you want to continue from a previous run :type previousjobid: str :arg selectedHits: Name of a file containing a list of identifiers of the hits from the previous iteration to use to construct the search PSSM for this iteration. default is None :type selectedHits: str :arg cpfile: Name of a Checkpoint file from the previous iteration. default is None :type cpfile: str :arg sleep: how long to wait to reconnect for status Sleep time is multiplied by 1.5 when results are not ready. default is 2 seconds :type sleep: float :arg timeout: when to give up waiting for the results default is 120 seconds :type timeout: float :arg cycle: cycle number :type cycle: int """ cycle = kwargs.get('cycle', 0) if sequence == 'runexample': sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI' 'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN' 'DAYDIVKMKKSNISPNFNFMGQLLDFERTL') elif isinstance(sequence, Atomic): sequence = sequence.calpha.getSequence() elif isinstance(sequence, Sequence): sequence = str(sequence) elif isinstance(sequence, str): if len(sequence) in [4, 5, 6]: ag = parsePDB(sequence) sequence = ag.calpha.getSequence() sequence = ''.join(sequence.split()) elif sequence is None: if cycle == 0: cycle = 1 else: raise TypeError( 'sequence must be Atomic, Sequence, or str not {0}'.format( type(sequence))) if cycle == 0: query = [('sequence', sequence)] else: query = [] email = kwargs.get('email', '*****@*****.**') if not isinstance(email, str): raise TypeError('email must be a string') elif email.find('@') == -1 or email.find('.') == -1 or len( email.split('@')) != 2: raise ValueError( 'email must be a valid email address with at least one . and exactly one @ sign' ) elif not email.find('@') < email.find(email.split('.')[-1]): raise ValueError( 'email must be a valid email address with a . after the @ sign') query.append(('email', email)) query.append(('title', 'ProDy psiBlastPDB request')) previousjobid = kwargs.get('previousjobid', '') if previousjobid != '': query.append(('previousjobid', previousjobid)) selectedHits = kwargs.get('selectedHits', '') if selectedHits != '': query.append(('selectedHits', selectedHits)) database = kwargs.get('database', 'pdb') checkPsiBlastParameter('database', database) query.append(('database', database)) matrix = kwargs.get('matrix', 'BLOSUM62') checkPsiBlastParameter('matrix', matrix) query.append(('matrix', matrix)) gapopen = kwargs.get('gapopen', 11) checkPsiBlastParameter('gapopen', gapopen) query.append(('gapopen', gapopen)) gapext = kwargs.get('gapext', 1) checkPsiBlastParameter('gapext', gapext) query.append(('gapext', gapext)) expthr = kwargs.get('expthr', 10.) 
    checkPsiBlastParameter('expthr', expthr)
    query.append(('expthr', expthr))

    psithr = kwargs.get('psithr', 1.0e-3)
    checkPsiBlastParameter('psithr', psithr)
    query.append(('psithr', psithr))

    scores = kwargs.get('scores', 500)
    checkPsiBlastParameter('scores', scores)
    query.append(('scores', scores))

    alignments = kwargs.get('alignments', 500)
    checkPsiBlastParameter('alignments', alignments)
    query.append(('alignments', alignments))

    query.append(('alignView', 0))

    dropoff = kwargs.get('dropoff', 15)
    checkPsiBlastParameter('dropoff', dropoff)
    query.append(('dropoff', dropoff))

    finaldropoff = kwargs.get('finaldropoff', 25)
    checkPsiBlastParameter('finaldropoff', finaldropoff)
    query.append(('finaldropoff', finaldropoff))

    filter = kwargs.get('filter', 'F')  # 'T' or 'F'; default 'F' per docstring
    checkPsiBlastParameter('filter', filter)
    query.append(('filter', filter))

    if previousjobid == '' and selectedHits == '':
        seqrange = kwargs.get('seqrange', None)
        if seqrange is None:
            seqrange = '0-' + str(len(sequence))
        elif not isinstance(seqrange, str):
            raise TypeError('seqrange should be a string')
        elif len(seqrange.split('-')) != 2:
            raise ValueError('seqrange should take the form START-END')
        try:
            # parsed only to validate the format; the string is sent as-is
            start = int(seqrange.split('-')[0])
            end = int(seqrange.split('-')[1])
        except ValueError:
            raise ValueError('seqrange should be START-END with START and '
                             'END being integers')
        query.append(('seqrange', seqrange))

    headers = {'User-Agent': 'ProDy'}

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    data = urlencode(query)

    # submit the job
    base_url = 'http://www.ebi.ac.uk/Tools/services/rest/psiblast/'
    url = base_url + 'run/'
    LOGGER.timeit('_prody_psi-blast')
    if cycle == 0:
        LOGGER.info('PSI-Blast searching PDB database for "{0}..."'
                    .format(sequence[:5]))
    else:
        LOGGER.info('PSI-Blast searching PDB database, cycle={0}'
                    .format(cycle))

    handle = openURL(url, data=data, headers=headers)
    job_id = handle.read()
    if PY3K:
        job_id = job_id.decode()
    handle.close()

    # check the status
    url = base_url + 'status/' + job_id
    handle = openURL(url)
    status = handle.read()
    if PY3K:
        status = status.decode()
    handle.close()

    # keep checking the status until it's no longer running
    while status == 'RUNNING':
        LOGGER.sleep(int(sleep), 'to reconnect to EBI for status.')
        LOGGER.write('Connecting to EBI for status...')
        handle = openURL(url)
        status = handle.read()
        if PY3K:
            status = status.decode()
        LOGGER.clear()
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_psi-blast') > timeout:
            LOGGER.warn('PSI-Blast search time out.')
            return None

    LOGGER.info('The status is {0}'.format(status))
    LOGGER.clear()
    LOGGER.report('PSI-Blast search completed in %.1fs.', '_prody_psi-blast')

    if cycle != 1:
        # get the results
        url = base_url + 'result/' + job_id + '/xml'
        handle = openURL(url)
        results = handle.read()
        handle.close()

        try:
            ext_xml = filename.lower().endswith('.xml')
        except AttributeError:
            pass
        else:
            if not ext_xml:
                filename += '.xml'
            f_out = open(filename, 'wb')  # results are bytes under Python 3
            f_out.write(results)
            f_out.close()
            LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

        return job_id, PsiBlastRecord(results, sequence)
    else:
        return job_id
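
# Usage sketch (illustrative, not part of the library): chain two PSI-BLAST
# cycles.  The helper name and flow are hypothetical; 'previousjobid' and
# 'cycle' are the keywords documented in the docstring above.  Requires
# network access to the EBI service.
def _example_psiblast_two_cycles(seq='runexample'):
    # First cycle: submit the sequence itself.  With the default cycle=0 a
    # (job_id, PsiBlastRecord) pair is returned unless the search times out.
    result = psiBlastCycle(seq, expthr=1.0e-5, scores=100)
    if result is None:  # the search timed out
        return None
    job_id, record = result
    # Second cycle: continue from the finished job instead of resubmitting
    # the sequence; cycles other than 1 also return a (job_id, record) pair.
    return psiBlastCycle(previousjobid=job_id, cycle=2)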
def blastPDB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching of ProteinDataBank database *sequence* using NCBI blastp.

    :arg sequence: single-letter code amino acid sequence of the protein
        without any gap characters, all white spaces will be removed
    :type sequence: str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is multiplied by 1.5 when results are not ready.
    *timeout* (default is 120 s) determines when to give up waiting for the
    results.
    """

    if sequence == "runexample":
        sequence = ("ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI"
                    "SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN"
                    "DAYDIVKMKKSNISPNFNFMGQLLDFERTL")
    else:
        try:
            sequence = "".join(sequence.split())
            _ = sequence.isalpha()
        except AttributeError:
            raise TypeError("sequence must be a string")
        else:
            if not _:
                raise ValueError("not a valid protein sequence")

    headers = {"User-agent": "ProDy"}

    query = [("DATABASE", "pdb"), ("ENTREZ_QUERY", "(none)"),
             ("PROGRAM", "blastp")]

    expect = float(kwargs.pop("expect", 1e-10))  # default documented as 1e-10
    if expect <= 0:
        raise ValueError("expect must be a positive number")
    query.append(("EXPECT", expect))

    hitlist_size = int(kwargs.pop("hitlist_size", 250))
    if hitlist_size <= 0:
        raise ValueError("hitlist_size must be a positive integer")
    query.append(("HITLIST_SIZE", hitlist_size))

    query.append(("QUERY", sequence))
    query.append(("CMD", "Put"))

    sleep = float(kwargs.pop("sleep", 2))
    timeout = float(kwargs.pop("timeout", 120))

    if kwargs:
        LOGGER.warn("Keyword argument(s) {0} are not used."
                    .format(", ".join([repr(key) for key in kwargs])))

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), "utf-8")
    except ImportError:
        from urllib import urlencode

    url = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"

    data = urlencode(query)
    LOGGER.timeit("_prody_blast")
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'
                .format(sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b"RID =")
    if index == -1:
        raise Exception("NCBI did not return expected response.")
    else:
        last = html.find(b"\n", index)
        rid = html[index + len("RID ="):last].strip()

    index = html.find(b"RTOE =")
    if index == -1:
        rtoe = None  # This is not used
    else:
        last = html.find(b"\n", index)
        rtoe = int(html[index + len("RTOE ="):last].strip())

    query = [("ALIGNMENTS", 500), ("DESCRIPTIONS", 500),
             ("FORMAT_TYPE", "XML"), ("RID", rid), ("CMD", "Get")]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), "to reconnect NCBI for search results.")
        LOGGER.write("Connecting NCBI for search results...")
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b"Status=")
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b"\n", index)
        status = results[index + len("Status="):last].strip()
        if status.upper() == b"READY":  # status is bytes under Python 3
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing("_prody_blast") > timeout:
            LOGGER.warn("Blast search time out.")
            return None

    LOGGER.clear()
    LOGGER.report("Blast search completed in %.1fs.", "_prody_blast")

    try:
        ext_xml = filename.lower().endswith(".xml")
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += ".xml"
        out = open(filename, "wb")  # results are bytes under Python 3
        out.write(results)
        out.close()
        LOGGER.info("Results are saved as {0}.".format(repr(filename)))

    return PDBBlastRecord(results, sequence)