def queryPolyPhen2(filename, dump=True, prefix='pph2', fasta_file=None,
                   fix_isoforms=False, ignore_errors=False, **kwargs):
    """Submit a batch of SAVs to the PolyPhen-2 web server and retrieve
    the output files.

    Parameters
    ----------
    filename : str
        path to a PolyPhen-2 batch query file (one SAV per line; lines
        starting with '#' are comments).
    dump : bool
        if True, each retrieved output file is also written to disk as
        '<prefix>-<key>.txt'.
    prefix : str
        prefix used for dumped output files.
    fasta_file : str or None
        optional FASTA file with custom sequences to upload with the query.
    fix_isoforms : bool
        if True and PolyPhen-2 reports isoform conflicts in its log, the
        query is resubmitted once with freshly downloaded Uniprot sequences.
    ignore_errors : bool
        if True, errors found in the PolyPhen-2 log are only warned about.
    kwargs : dict
        extra form fields forwarded to the server (MODELNAME, UCSCDB,
        SNPFUNC).

    Returns
    -------
    output : dict
        maps 'started', 'completed', 'short', 'full', 'log' and 'snps' to
        the text of the corresponding server output file.
    """
    # original PolyPhen-2 curl command (see:
    # http://genetics.bwh.harvard.edu/pph2/dokuwiki/faq ):
    #
    # curl -F _ggi_project=PPHWeb2 -F _ggi_origin=query \
    #      -F _ggi_target_pipeline=1 -F MODELNAME=HumDiv \
    #      -F UCSCDB=hg19 -F SNPFUNC=m -F [email protected] \
    #      -F _ggi_batch_file=@example_batch.txt \
    #      -D - http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi

    assert type(dump) is bool
    assert type(prefix) is str

    LOGGER.info('Submitting query to PolyPhen-2...')
    # count SAVs, skipping comment lines. NOTE: indexing a bytes line
    # yields an int in Python 3, so the test must be a 1-byte slice
    # against b'#' (the original "line[0] != '#'" was always True and
    # counted comment lines too); the file is also closed now
    with open(filename, 'rb') as f:
        num_lines = sum(1 for line in f if line[:1] != b'#')

    # submit query
    address = 'http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi'
    input_file = open(filename, 'rb')
    custom_fasta = None
    try:
        files = {
            '_ggi_project': (None, 'PPHWeb2'),
            '_ggi_origin': (None, 'query'),
            '_ggi_target_pipeline': (None, '1'),
            '_ggi_batch_file': ('query.txt', input_file),
            'MODELNAME': (None, kwargs.get('MODELNAME', 'HumDiv')),
            'UCSCDB': (None, kwargs.get('UCSCDB', 'hg19')),
            'SNPFUNC': (None, kwargs.get('SNPFUNC', 'm'))
        }
        if fasta_file is not None:
            # upload custom sequences
            custom_fasta = open(fasta_file, 'rb')
            files['uploaded_sequences_1'] = ('sequences.fa', custom_fasta)
        response = requests.post(address, files=files)
    finally:
        # close uploaded files even if the POST fails (the original code
        # leaked both handles)
        input_file.close()
        if custom_fasta is not None:
            custom_fasta.close()

    # parse job ID from response page
    jobID = response.cookies['polyphenweb2']

    # results and semaphore files
    results_dir = f'http://genetics.bwh.harvard.edu/ggi/pph2/{jobID}/1/'
    files = {
        'started': results_dir + 'started.txt',
        'completed': results_dir + 'completed.txt',
        'short': results_dir + 'pph2-short.txt',
        'full': results_dir + 'pph2-full.txt',
        'log': results_dir + 'pph2-log.txt',
        'snps': results_dir + 'pph2-snps.txt'
    }

    # keep checking if the job has started/completed and,
    # when done, fetch output files
    output = {}
    exts = ['started', 'completed', 'short', 'full', 'log', 'snps']
    for k in exts:
        # delay = timeout + backoff_factor*[2^(total_retries - 1)]
        if k == 'started':
            LOGGER.timeit('_started')
            r = _requests_retry_session(retries=16).get(files[k])
            LOGGER.report('Query to PolyPhen-2 started in %.1fs.', '_started')
            LOGGER.info('PolyPhen-2 is running...')
        elif k == 'completed':
            LOGGER.timeit('_queryPP2')
            # scale the polling timeout with the size of the query.
            # NOTE(review): log(num_lines) is 0 for a single SAV and
            # undefined for an empty query — confirm the retry backoff
            # alone is sufficient in those cases
            r = _requests_retry_session(
                retries=200, timeout=log(num_lines) / 2).get(files[k])
            LOGGER.report('Query to PolyPhen-2 completed in %.1fs.',
                          '_queryPP2')
        else:
            r = _requests_retry_session(retries=12).get(files[k])
        output[k] = r.text
        # print to file, if requested
        if dump:
            with open(prefix + '-' + k + '.txt', 'w', 1) as f:
                print(r.text, file=f)

    # check for conflicts between Uniprot sequences and isoforms used
    # by Polyhen-2 (which are sometimes outdated)
    Uniprot_accs = _check_log_errors(output['log'])
    if Uniprot_accs:
        if fix_isoforms:
            LOGGER.info('PolyPhen-2 may have picked the wrong isoforms.')
            LOGGER.info('Resubmitting query with correct isoforms --- '
                        'it may take up to a few hours to complete...')
            # print file with freshly downloaded Uniprot sequences
            fasta_fname, new_accs = _print_fasta_file(Uniprot_accs)
            # replace accession numbers in list of SAVs
            tmp_fname = filename + '.tmp'
            _replace_strings_in_file(filename, tmp_fname, new_accs)
            # resubmit query by manually uploading fasta sequences;
            # ignore_errors is now forwarded (it was silently dropped)
            output = queryPolyPhen2(tmp_fname, dump=dump, prefix=prefix,
                                    fasta_file=fasta_fname,
                                    fix_isoforms=False,
                                    ignore_errors=ignore_errors, **kwargs)
            os.remove(tmp_fname)
            # restore original accession numbers in output
            orig_accs = {v: k for k, v in new_accs.items()}
            for k in exts:
                output[k] = _replace_strings_in_text(output[k], orig_accs)
                if dump:
                    # use the caller-supplied prefix: the original
                    # hard-coded 'pph2' and missed files dumped with a
                    # custom prefix
                    outfile = f'{prefix}-{k}.txt'
                    _replace_strings_in_file(outfile, outfile, orig_accs)
        elif not ignore_errors:
            # log-file errors are serious unless the caller opted to
            # ignore them (the original had warn/error swapped)
            LOGGER.error('Please check PolyPhen-2 log file')
        else:
            LOGGER.warn('Please check PolyPhen-2 log file')

    return output
def alignCustomPDB(self, PDB, chain='all', title=None, recover=False):
    """Aligns the Uniprot sequence with the sequence from the given PDB.

    Parameters
    ----------
    PDB : str or Atomic
        a PDB ID / PDB filename, or a ProDy Atomic instance
        (e.g. AtomGroup).
    chain : str or list of str
        chain identifier(s) to align; 'all' (or '@') selects every chain.
    title : str or None
        label for the stored record; defaults to the PDB file basename or
        the Atomic instance's title.
    recover : bool
        if True, reuse an existing record with the same title instead of
        creating a new one.

    Returns
    -------
    customPDBrecord : dict
        record with keys 'PDB', 'chain_res', 'chain_seq' and 'warnings'.

    Raises
    ------
    ValueError
        if the PDB cannot be parsed or an invalid chain is requested.
    RuntimeError
        if ``recover`` is True and multiple records share the same title.
    """
    assert isinstance(PDB, (str, pd.Atomic)), \
        'PDB must be a PDBID or an Atomic instance (e.g. AtomGroup).'
    assert isinstance(chain, str) or all(
        isinstance(s, str) for s in chain), \
        "'chain' must be a string or a list of strings."
    assert isinstance(title, str) or title is None

    # parse/import pdb and assign title
    if isinstance(PDB, str):
        try:
            pdb = pd.parsePDB(PDB, subset='calpha')
        except Exception as e:
            msg = ('Unable to import PDB: PDB ID might be invalid or '
                   f'PDB file might be corrupted. Error message: {e}')
            LOGGER.error(msg)
            # re-raise: the original fell through here with 'pdb' unbound
            # and crashed later with a confusing UnboundLocalError
            raise ValueError(msg) from e
        if title is None:
            title = os.path.basename(PDB.strip())
            title = title.replace(' ', '_')
    else:
        pdb = PDB.ca
        if title is None:
            title = PDB.getTitle()

    # check if a record is already present
    rec = [d for d in self.customPDBmappings if d['PDB'] == title]
    if recover and len(rec) > 1:
        raise RuntimeError('Multiple records found with same ID.')
    elif recover and len(rec) == 1:
        customPDBrecord = rec[0]
    else:
        # create record for custom PDB
        customPDBrecord = {
            'PDB': title,
            'chain_res': {},
            'chain_seq': {},
            'warnings': []
        }
        self.customPDBmappings.append(customPDBrecord)

    # check given chain list
    all_chains = set(pdb.getChids())
    if chain in ('all', '@'):
        chains_to_align = list(all_chains)
    elif isinstance(chain, list):
        chains_to_align = chain
    else:
        chains_to_align = [chain]
    invalid_chIDs = [c for c in chains_to_align if c not in all_chains]
    if invalid_chIDs:
        raise ValueError('Invalid chain: {}.'.format(invalid_chIDs))

    # store resids and sequence of selected chains
    for c in chains_to_align:
        if c in customPDBrecord['chain_res']:
            # already mapped (e.g. on a recovered record): skip
            continue
        customPDBrecord['chain_res'][c] = pdb[c].getResnums()
        customPDBrecord['chain_seq'][c] = pdb[c].getSequence()

    # align selected chains with BioPython module pairwise2
    self._calcCustomAlignments(title, chains_to_align)
    return customPDBrecord