def download_family(name: str) -> str:
    """Download the Pfam MSA for family *name* in FASTA format.

    Delegates to ``prody.fetchPfamMSA`` asking for the ``'fasta'`` format,
    an output name of ``fs/<name>.fasta`` and a timeout of 10.

    Returns whatever ``prody.fetchPfamMSA`` returns (annotated as ``str``;
    presumably the path of the downloaded file -- confirm against ProDy).
    """
    fetch_options = {
        'format': 'fasta',
        'outname': f'fs/{name}.fasta',
        'timeout': 10,
    }
    return prody.fetchPfamMSA(name, **fetch_options)
def evol_fetch(acc, **kwargs):
    """Fetch the Pfam MSA for accession *acc* through ProDy.

    All keyword arguments are forwarded unchanged to
    ``prody.fetchPfamMSA``. The value returned by that call is discarded,
    so this function always returns ``None``.
    """
    # Imported lazily so that prody is only loaded when the function runs.
    from prody import fetchPfamMSA

    fetchPfamMSA(acc, **kwargs)
def calcEvolProperties(self, resid='all', refresh=False, folder=None,
                       max_cols=None, max_seqs=25000, **kwargs):
    ''' Computes Evol properties, i.e. Shannon entropy and Mutual
    Information, from Pfam Multiple Sequence Alignments, for a given
    residue (or for every Pfam domain when resid is 'all').

    For each selected Pfam family, the entry in self.Pfam is updated in
    place with the keys 'mapping', 'ref_MSA', 'entropy', 'MutInfo' and
    'DirInfo'. Direct Information is currently not computed, so 'DirInfo'
    is left as NaN.

    Parameters:
        resid: 'all', or a residue number compared against the 'start'
            and 'end' of each family's 'locations' segments.
        refresh: must be a bool. When False, families whose 'mapping' is
            already set (including a stored error string) are skipped.
            Also forwarded to self._searchPfam.
        folder: currently unused; kept for backward compatibility.
        max_cols: currently unused; kept for backward compatibility.
        max_seqs: row-occupancy refinement is repeated with an
            increasing threshold until the MSA has at most this many
            sequences (or the threshold reaches 1).
        **kwargs: forwarded to both parseMSA and the final refineMSA call.

    Returns:
        dict mapping each selected Pfam family to its entry in self.Pfam.

    Raises:
        RuntimeError: if resid is not 'all' and no Pfam domain contains it.

    Failures while fetching, parsing or analysing an MSA are logged as
    warnings and do not propagate; if the failure happens before the
    mapping is obtained, the error message is stored in 'mapping'.
    '''
    assert type(refresh) is bool
    # recover Pfam mapping (if not found already)
    self._searchPfam(refresh=refresh)
    if resid == 'all':
        PF_list = list(self.Pfam)
    else:
        # get list of Pfam domains containing resid
        PF_list = [
            k for k in self.Pfam
            if any(int(segment['start']) <= resid <= int(segment['end'])
                   for segment in self.Pfam[k]['locations'])
        ]
        if not PF_list:
            raise RuntimeError(
                'No Pfam domain for resid {}.'.format(resid))
        if len(PF_list) > 1:
            LOGGER.warn(
                'Residue {} is found in multiple ({}) Pfam domains.'.format(
                    resid, len(PF_list)))
    # iterate over Pfam families
    for PF in PF_list:
        d = self.Pfam[PF]
        # skip if properties are pre-computed
        if not refresh and d.get('mapping') is not None:
            continue
        d['mapping'] = None
        d['ref_MSA'] = None
        d['entropy'] = np.nan
        d['MutInfo'] = np.nan
        d['DirInfo'] = np.nan
        try:
            LOGGER.info('Processing {}...'.format(PF))
            # fetch & parse MSA without keeping the downloaded file;
            # the file is removed even when parsing fails
            f = fetchPfamMSA(PF)
            try:
                msa = parseMSA(f, **kwargs)
            finally:
                os.remove(f)
            # slice MSA to match all segments of the Uniprot sequence
            sliced_msa, indexes = self._sliceMSA(msa)
            # get mapping between Uniprot sequence and Pfam domain
            d['mapping'] = self._mapUniprot2Pfam(PF, sliced_msa, indexes)
        except Exception as e:
            LOGGER.warn('{}: {}'.format(PF, e))
            # store the error message so the failure is visible to callers
            d['mapping'] = str(e)
            continue
        try:
            # refine MSA ('seqid' param. is set as in PolyPhen-2)
            rowocc = 0.6
            while True:
                sliced_msa = refineMSA(sliced_msa, rowocc=rowocc)
                rowocc += 0.02
                if sliced_msa.numSequences() <= max_seqs or rowocc >= 1:
                    break
            ref_msa = refineMSA(sliced_msa, seqid=0.94, **kwargs)
            d['ref_MSA'] = ref_msa
            # compute evolutionary properties
            d['entropy'] = calcShannonEntropy(ref_msa)
            d['MutInfo'] = buildMutinfoMatrix(ref_msa)
        except Exception as e:
            # best effort: properties not yet assigned keep their defaults
            LOGGER.warn('{}: {}'.format(PF, e))
    return {k: self.Pfam[k] for k in PF_list}