示例#1
0
def queryPolyPhen2(filename,
                   dump=True,
                   prefix='pph2',
                   fasta_file=None,
                   fix_isoforms=False,
                   ignore_errors=False,
                   **kwargs):
    """Submit a batch of SAVs to the PolyPhen-2 web service and fetch the
    resulting output files.

    :arg filename: file listing SAVs in PolyPhen-2 batch format, one per
        line; lines starting with ``#`` are treated as comments
    :arg dump: if **True**, write each fetched output file to disk as
        ``<prefix>-<name>.txt``
    :arg prefix: prefix used for dumped output files
    :arg fasta_file: optional fasta file with custom sequences to upload
    :arg fix_isoforms: if **True** and the PolyPhen-2 log reports isoform
        conflicts, resubmit the query once with freshly downloaded Uniprot
        sequences (may take hours)
    :arg ignore_errors: if **True**, errors found in the PolyPhen-2 log are
        reported as warnings instead of errors
    :arg kwargs: extra form fields, e.g. ``MODELNAME`` (default 'HumDiv'),
        ``UCSCDB`` (default 'hg19'), ``SNPFUNC`` (default 'm')
    :return: dict mapping each output name ('started', 'completed', 'short',
        'full', 'log', 'snps') to the text of the corresponding file
    """
    # original PolyPhen-2 curl command (see:
    # http://genetics.bwh.harvard.edu/pph2/dokuwiki/faq ):
    #
    # curl  -F _ggi_project=PPHWeb2  -F _ggi_origin=query         \
    # -F _ggi_target_pipeline=1  -F MODELNAME=HumDiv              \
    # -F UCSCDB=hg19  -F SNPFUNC=m  -F [email protected] \
    # -F _ggi_batch_file=@example_batch.txt                       \
    # -D - http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi

    assert type(dump) is bool
    assert type(prefix) is str

    LOGGER.info('Submitting query to PolyPhen-2...')
    # count SAV lines, skipping '#' comments. NOTE: the file must be read
    # in text mode -- in binary mode line[0] is an int, so a comparison
    # with the str '#' would always be True and comments would be counted.
    with open(filename, 'r') as f:
        num_lines = sum(1 for line in f if not line.startswith('#'))
    # submit query
    address = 'http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi'
    input_file = open(filename, 'rb')
    custom_fasta = None
    try:
        files = {
            '_ggi_project': (None, 'PPHWeb2'),
            '_ggi_origin': (None, 'query'),
            '_ggi_target_pipeline': (None, '1'),
            '_ggi_batch_file': ('query.txt', input_file),
            'MODELNAME': (None, kwargs.get('MODELNAME', 'HumDiv')),
            'UCSCDB': (None, kwargs.get('UCSCDB', 'hg19')),
            'SNPFUNC': (None, kwargs.get('SNPFUNC', 'm'))
        }
        if fasta_file is not None:
            # upload custom sequences
            custom_fasta = open(fasta_file, 'rb')
            files['uploaded_sequences_1'] = ('sequences.fa', custom_fasta)
        response = requests.post(address, files=files)
    finally:
        # close uploaded files (they were previously leaked)
        input_file.close()
        if custom_fasta is not None:
            custom_fasta.close()
    # parse job ID from response page
    jobID = response.cookies['polyphenweb2']
    # results and semaphore files
    results_dir = f'http://genetics.bwh.harvard.edu/ggi/pph2/{jobID}/1/'
    files = {
        'started': results_dir + 'started.txt',
        'completed': results_dir + 'completed.txt',
        'short': results_dir + 'pph2-short.txt',
        'full': results_dir + 'pph2-full.txt',
        'log': results_dir + 'pph2-log.txt',
        'snps': results_dir + 'pph2-snps.txt'
    }
    # keep checking if the job has started/completed and,
    # when done, fetch output files
    output = {}
    exts = ['started', 'completed', 'short', 'full', 'log', 'snps']
    for k in exts:
        # delay = timeout + backoff_factor*[2^(total_retries - 1)]
        if k == 'started':
            LOGGER.timeit('_started')
            r = _requests_retry_session(retries=16).get(files[k])
            LOGGER.report('Query to PolyPhen-2 started in %.1fs.', '_started')
            LOGGER.info('PolyPhen-2 is running...')
        elif k == 'completed':
            LOGGER.timeit('_queryPP2')
            # scale the polling timeout with the size of the batch; the
            # max() guard avoids log(0) = -inf / log(1) = 0 for tiny inputs
            r = _requests_retry_session(
                retries=200,
                timeout=log(max(num_lines, 2)) / 2).get(files[k])
            LOGGER.report('Query to PolyPhen-2 completed in %.1fs.',
                          '_queryPP2')
        else:
            r = _requests_retry_session(retries=12).get(files[k])
        output[k] = r.text
        # print to file, if requested
        if dump:
            with open(prefix + '-' + k + '.txt', 'w', 1) as f:
                print(r.text, file=f)

    # check for conflicts between Uniprot sequences and isoforms used
    # by Polyhen-2 (which are sometimes outdated)
    Uniprot_accs = _check_log_errors(output['log'])
    if Uniprot_accs:
        if fix_isoforms:
            LOGGER.info('PolyPhen-2 may have picked the wrong isoforms.')
            LOGGER.info('Resubmitting query with correct isoforms --- '
                        'it may take up to a few hours to complete...')
            # print file with freshly downloaded Uniprot sequences
            fasta_fname, new_accs = _print_fasta_file(Uniprot_accs)
            # replace accession numbers in list of SAVs
            tmp_fname = filename + '.tmp'
            _replace_strings_in_file(filename, tmp_fname, new_accs)
            # resubmit query by manually uploading fasta sequences;
            # fix_isoforms=False prevents infinite recursion, and the
            # caller's ignore_errors preference is forwarded
            output = queryPolyPhen2(tmp_fname,
                                    dump=dump,
                                    prefix=prefix,
                                    fasta_file=fasta_fname,
                                    fix_isoforms=False,
                                    ignore_errors=ignore_errors,
                                    **kwargs)
            os.remove(tmp_fname)
            # restore original accession numbers in output
            orig_accs = dict([[v, k] for k, v in new_accs.items()])
            for k in exts:
                output[k] = _replace_strings_in_text(output[k], orig_accs)
                if dump:
                    # use the same naming scheme as the dump above
                    # (the old code hardcoded 'pph2-' and broke for any
                    # non-default prefix)
                    outfile = f'{prefix}-{k}.txt'
                    _replace_strings_in_file(outfile, outfile, orig_accs)
        elif not ignore_errors:
            # errors are significant and the user did not opt out:
            # report them as errors (the old code had these two branches
            # swapped, warning when errors mattered and vice versa)
            LOGGER.error('Please check PolyPhen-2 log file')
        else:
            LOGGER.warn('Please check PolyPhen-2 log file')

    return output
示例#2
0
    def alignCustomPDB(self, PDB, chain='all', title=None, recover=False):
        """Aligns the Uniprot sequence with the sequence from the given PDB.

        :arg PDB: a PDB ID / PDB file path, or an Atomic instance
            (e.g. AtomGroup)
        :arg chain: chain identifier(s) to align; 'all' (or '@') selects
            every chain, otherwise a single chain ID or a list of IDs
        :arg title: label for the stored record; defaults to the PDB
            file/ID basename or the Atomic instance's title
        :arg recover: if **True**, reuse an existing record with the same
            title instead of creating a new one
        :return: the record dict stored in ``self.customPDBmappings``
        :raises ValueError: if any requested chain is not in the structure
        :raises RuntimeError: if ``recover`` finds multiple matching records
        :raises LookupError: if the PDB cannot be parsed/imported
        """
        assert isinstance(PDB, (str, pd.Atomic)), \
            'PDB must be a PDBID or an Atomic instance (e.g. AtomGroup).'
        assert isinstance(chain, str) or all(
            isinstance(s, str) for s in chain), \
            "'chain' must be a string or a list of strings."
        assert isinstance(title, str) or title is None
        # parse/import pdb and assign title
        if isinstance(PDB, str):
            try:
                pdb = pd.parsePDB(PDB, subset='calpha')
            except Exception as e:
                msg = ('Unable to import PDB: PDB ID might be invalid or '
                       f'PDB file might be corrupted. Error message: {e}')
                LOGGER.error(msg)
                # re-raise: the old code only logged and fell through,
                # which led to a confusing NameError on 'pdb' below
                raise LookupError(msg) from e
            if title is None:
                title = os.path.basename(PDB.strip())
                title = title.replace(' ', '_')
        else:
            pdb = PDB.ca
            if title is None:
                title = PDB.getTitle()
        # check if a record is already present
        rec = [d for d in self.customPDBmappings if d['PDB'] == title]
        if recover and len(rec) > 1:
            raise RuntimeError('Multiple records found with same ID.')
        elif recover and len(rec) == 1:
            customPDBrecord = rec[0]
        else:
            # create record for custom PDB
            customPDBrecord = {
                'PDB': title,
                'chain_res': {},
                'chain_seq': {},
                'warnings': []
            }
            self.customPDBmappings.append(customPDBrecord)
        # check given chain list
        all_chains = set(pdb.getChids())
        if chain == 'all' or chain == '@':
            chains_to_align = list(all_chains)
        elif type(chain) is list:
            chains_to_align = chain
        else:
            chains_to_align = [chain]
        invalid_chIDs = [c for c in chains_to_align if c not in all_chains]
        if invalid_chIDs:
            raise ValueError('Invalid chain: {}.'.format(invalid_chIDs))
        # store resids and sequence of selected chains
        for c in chains_to_align:
            if c in customPDBrecord['chain_res']:
                # chain already aligned in a recovered record
                continue
            customPDBrecord['chain_res'][c] = pdb[c].getResnums()
            customPDBrecord['chain_seq'][c] = pdb[c].getSequence()
        # align selected chains with BioPython module pairwise2
        self._calcCustomAlignments(title, chains_to_align)
        return customPDBrecord