Example #1
def getSeqres(pdbid, chids):
    import prody
    sequences = []
    polymers = prody.parsePDBHeader(pdbid, 'polymers')
    for polymer in polymers:
        if polymer.chid in chids:
            sequences.append(polymer.sequence)
    return sequences
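A quick usage sketch, assuming ProDy is installed and the header can be fetched by ID over the network (the PDB ID and chain letter are illustrative):

# SEQRES sequences for chain A of an illustrative entry
print(getSeqres('1mkp', ['A']))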
Example #2
def iterate_over_pairs(pdbid, ref_chid):
    """
    Yield (ref_chain, other_chain) pairs for every chain other than ref_chid.
    """
    import prody
    header = prody.parsePDBHeader(pdbid)
    for p in header['polymers']:
        if p.chid == ref_chid:
            continue
        yield (ref_chid, p.chid)
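The generator is consumed the same way; a minimal sketch with an illustrative entry and reference chain:

for ref_chid, other_chid in iterate_over_pairs('1mkp', 'A'):
    print(ref_chid, other_chid)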
Example #3
def split_receptor(bucket, table_idx, param,
                   datum):  # todo (maksym) param = params;
    try:  # todo (maksym) datum = pdb_name
        if isinstance(datum, (tuple, list)):
            datum = datum[0]

        receptor = datum  # todo receptor = pdb_name !!!!!
        output_folder = param['output_folder']
        output_folder = '{}_{}'.format(table_idx, output_folder)
        input_download_folder = param['input_download_folder']

        input_pdb_dir = os.path.join(data_dir, input_download_folder)
        input_pdb_path = os.path.join(input_pdb_dir, receptor + '.pdb')

        parsed_pdb = prody.parsePDB(input_pdb_path)
        parsed_header = prody.parsePDBHeader(input_pdb_path)

        output_rec_dir = os.path.join(data_dir, output_folder, receptor)
        _makedir(output_rec_dir)

        ligands = []
        for chem in parsed_header['chemicals']:
            chain, resnum, resname = chem.chain, chem.resnum, chem.resname
            ligands.append([chain, str(resnum), resname])

        for chain, resnum, resname in ligands:
            try:
                rec = parsed_pdb.select('not (chain {} resnum {})'.format(
                    chain, resnum))
                rec = rec.select('not water')
                heavy_atom = rec.select('not hydrogen').numAtoms()
                rec_name = '_'.join(
                    [receptor, chain, resnum, resname, 'receptor']) + '.pdb'
                prody.writePDB(os.path.join(output_rec_dir, rec_name), rec)

                record = [
                    receptor, chain, resnum, resname, heavy_atom,
                    parsed_header['experiment'], parsed_header['resolution'],
                    1, 'success'
                ]
                records = [record]
                db.insert(table_idx, records, bucket=bucket)
            except Exception as e:
                record = [
                    receptor, chain, resnum, resname, 0, 0, 0, 0,
                    str(e)
                ]  # datum = failure_message
                records = [record]
                print(records)
                db.insert(table_idx, records, bucket=bucket)

    # TODO: (maksym) I believe this is controllable with logging
    except Exception as e:
        print(e)
        raise
Example #4
def split(receptor, pdb_outpath, init='split_init'):

    init = eval(init)
    rec_dir = os.path.join(init.data_dir, init.rec_folder)
    lig_dir = os.path.join(init.data_dir, init.lig_folder)

    pdb_path = os.path.join(init.data_dir, pdb_outpath)
    parsed_pdb = prody.parsePDB(pdb_path)
    parsed_header = prody.parsePDBHeader(pdb_path)

    ligands = []
    for chem in parsed_header['chemicals']:
        ligands.append([chem.chain, str(chem.resnum), chem.resname])

    splited = []

    for chain, resnum, resname in ligands:

        lig = parsed_pdb.select('chain {} resnum {}'.format(chain, resnum))
        rec = parsed_pdb.select('not (chain {} resnum {})'.format(
            chain, resnum))
        if lig is None:
            continue
        resid = next(lig.getHierView().iterResidues()).getResindex()
        resid = str(resid)
        heavy_lig = lig.select('not hydrogen')
        heavy_atom = heavy_lig.numAtoms()
        heavy_coord = heavy_lig.getCoords()
        #max_size_on_axis = max(heavy_coord.max(axis=0) - heavy_coord.min(axis=0))
        #Changing max_size_on_axis to max pairwise distance between coords
        #max_size_on_axis = max(scipy.spatial.distance.pdist(heavy_coord).tolist())
        lig_name = '_'.join([receptor, chain, resnum, resname, 'ligand']) + '.pdb'
        if not os.path.exists(os.path.join(lig_dir, receptor)):
            os.makedirs(os.path.join(lig_dir, receptor))
        prody.writePDB(os.path.join(lig_dir, receptor, lig_name), lig)

        rec_name = '_'.join([receptor, chain, resnum, resname, 'receptor']) + '.pdb'
        if not os.path.exists(os.path.join(rec_dir, receptor)):
            os.makedirs(os.path.join(rec_dir, receptor))
        prody.writePDB(os.path.join(rec_dir, receptor, rec_name), rec)

        splited.append([
            receptor,
            str(resname),
            os.path.join(init.rec_folder, receptor, rec_name),
            os.path.join(init.lig_folder, receptor, lig_name)
        ])

    return splited
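Because split resolves its init argument with eval, a module-level object named split_init must already exist. A hypothetical stand-in (the attribute names are inferred from the function body):

class split_init:  # hypothetical; real projects define this elsewhere in the module
    data_dir = '/tmp/data'
    rec_folder = 'receptors'
    lig_folder = 'ligands'

# expects /tmp/data/1mkp.pdb to exist (illustrative ID)
pairs = split('1mkp', '1mkp.pdb')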
Example #5
def split_pdb(split_pdb,
              cuttoff_dist=None,
              discard_h=True,
              init="split_pdb_init"):
    """
    Iterates through every molecule of the ligand in the receptor. Crops atoms of the receptor (and any chemicals,
     but not water) within within the cutoff  distance of any atoms of the ligand. Saves ligand + binding site crop
     as pairs into folders.

    :param split_pdb: string (relative path the the file to split)
    :param cuttoff_dist: float (distance of any atoms in the binding site from any atom of the ligand to be saved)
    :param discard_h: Bool T/F (discard all hydrogens ?)
    :param init: string (init function in this module)
    :return:
    """
    init = eval(init)
    pdb_path = os.path.join(init.db_path, split_pdb)

    # parse header and coordinates from the PDB file
    pr_pdb = pr.parsePDB(pdb_path)
    pr_header = pr.parsePDBHeader(pdb_path)
    pdb_id = pr_header["identifier"]
    ligs = []

    # retrieve names of the chemicals from the header of the PDB file
    for chem in pr_header['chemicals']:
        ligs.append([chem.chain, str(chem.resnum), chem.resname])
    out_filenames = []
    for chain, resnum, resname in ligs:
        if discard_h:
            lig = pr_pdb.select('noh chain {} and resnum {}'.format(
                chain, resnum))
            rec = pr_pdb.select(
                'noh and not water not (chain {} resnum {})'.format(
                    chain, resnum))
        else:
            lig = pr_pdb.select('chain {} resnum {}'.format(chain, resnum))
            rec = pr_pdb.select('not water not (chain {} resnum {})'.format(
                chain, resnum))

        # write the ligand file and the receptor file
        pair_name = '_'.join([pdb_id, chain, resnum, resname])
        lig_name = pair_name + "_ligand.pdb"
        rec_name = pair_name + "_receptor.pdb"
        os.makedirs(os.path.join(init.db_path, init.split_dir, pair_name))
        lig_outpath = os.path.join(init.split_dir, pair_name, lig_name)
        rec_outpath = os.path.join(init.split_dir, pair_name, rec_name)
        pr.writePDB(os.path.join(init.db_path, lig_outpath), lig)
        pr.writePDB(os.path.join(init.db_path, rec_outpath), rec)
        out_filenames.append([lig_outpath, rec_outpath])
    return out_filenames
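The split in these examples leans on ProDy's selection grammar, where adjacent conditions are implicitly ANDed. A minimal sketch of the pattern in isolation (local file path, chain, and resnum are illustrative):

import prody as pr

pdb = pr.parsePDB('105M.pdb')  # illustrative local file
# 'chain A resnum 155' selects atoms in chain A AND residue number 155
lig = pdb.select('chain A resnum 155')
rec = pdb.select('not water and not (chain A resnum 155)')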
Example #6
def download(bucket, table_idx, param,
             input_data):  # todo(maksym) input_data = pdb_ids
    '''
    Download a PDB file from RCSB.

    Args:
        table_idx: id for the download table
        param: dict
            {
                'output_folder': '...',
                ...
            }
        input_data: str, PDB id

    Returns:
        None; a success or failure record is inserted into the table.
    '''
    try:
        receptor = input_data[1:]  # todo(maksym) pdb_id
        output_folder = param['output_folder']
        output_folder_name = '{}_{}'.format(table_idx, output_folder)
        dest_dir = os.path.join(data_dir, output_folder_name)
        _makedir(dest_dir)
        pdb_path = os.path.join(dest_dir, receptor + '.pdb')

        #pdb_path = '/home/maksym/ryan/labeled_pdb/crystal_ligands/'+receptor+'/'+receptor+'.pdb'
        #print ('pdb',pdb_path)
        if not os.path.exists(pdb_path):
            download_address = 'https://files.rcsb.org/download/{}.pdb'.format(
                receptor)
            os.system('wget --no-check-certificate -P {} {}'.format(
                dest_dir, download_address))
        header = prody.parsePDBHeader(pdb_path)
        record = [
            receptor, header['experiment'], header['resolution'], 1, 'success'
        ]
        records = [record]
        db.insert(table_idx, records, bucket=bucket)
    except Exception as e:
        # exception causing non-success
        print(e)
        record = [input_data, 'unk', 0, 0,
                  str(e)]  # todo maksym (unk) = failed
        records = [record]
        db.insert(table_idx, records, bucket=bucket)
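For reference, ProDy can also fetch entries directly, which avoids shelling out to wget; a hedged one-liner (not what this pipeline does):

import prody
path = prody.fetchPDB('1mkp', folder='/tmp')  # downloads the entry (possibly gzipped) into /tmp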
Example #7
def split_pdb(pdb_outpath,
              cutoff_dist=None,
              discard_h=True,
              init='split_pdb_init'):

    init = eval(init)
    pdb_path = os.path.join(init.data_dir, pdb_outpath)

    pr_pdb = pr.parsePDB(pdb_path)
    pr_header = pr.parsePDBHeader(pdb_path)
    pdb_id = pr_header['identifier']
    ligs = []

    for chem in pr_header['chemicals']:
        ligs.append([chem.chain, str(chem.resnum), chem.resname])

    out_filenames = []
    for chain, resnum, resname in ligs:
        if discard_h:
            lig = pr_pdb.select('noh chain {} and resnum {}'.format(
                chain, resnum))
            rec = pr_pdb.select(
                'noh and not water not (chain {} resnum {})'.format(
                    chain, resnum))
        else:
            lig = pr_pdb.select('chain {} resnum {}'.format(chain, resnum))
            rec = pr_pdb.select('not water not (chain {} resnum {})'.format(
                chain, resnum))

        pair_name = '_'.join([pdb_id, chain, resnum, resname])
        lig_name = pair_name + '_ligand.pdb'
        rec_name = pair_name + '_receptor.pdb'
        split_dir = os.path.join(init.data_dir, init.split_folder, pair_name)
        if not os.path.exists(split_dir):
            os.makedirs(split_dir)

        lig_outpath = os.path.join(init.split_folder, pair_name, lig_name)
        rec_outpath = os.path.join(init.split_folder, pair_name, rec_name)

        pr.writePDB(os.path.join(init.data_dir, lig_outpath), lig)
        pr.writePDB(os.path.join(init.data_dir, rec_outpath), rec)
        out_filenames.append([rec_outpath, lig_outpath])

    return out_filenames
Example #8
File: nano_action.py Project: flyuuu/core
def split(receptor, pdb_path, rec_dir, lig_dir):
    """
    split pdb into receptor and ligand

    args:
        receptor:: str
            4 letters pdb id

        pdb_path:: str
            path of download pdb file

        rec_dir:: str
            dir for output

        lig_dir:: str
            dir for output

    returns:
        receptor:: str
            4 letters pdb id

        resname:: str
            res id  

        rec_path:: str
            path of splited receptor

        lig_path:: str
            path of splited ligand      
    """

    parsed_pdb = prody.parsePDB(pdb_path)
    parsed_header = prody.parsePDBHeader(pdb_path)

    ligands = []
    for chem in parsed_header['chemicals']:
        ligands.append([chem.chain, str(chem.resnum), chem.resname])

    splited = []

    for chain, resnum, resname in ligands:

        lig = parsed_pdb.select('chain {} resnum {}'.format(chain, resnum))
        rec = parsed_pdb.select('not (chain {} resnum {})'.format(chain, resnum))
        if lig is None:
            continue
        resid = next(lig.getHierView().iterResidues()).getResindex()
        resid = str(resid)
        heavy_lig = lig.select('not hydrogen')
        heavy_atom = heavy_lig.numAtoms()
        heavy_coord = heavy_lig.getCoords()
        #max_size_on_axis = max(heavy_coord.max(axis=0) - heavy_coord.min(axis=0))
        #Changing max_size_on_axis to max pairwise distance between coords
        #max_size_on_axis = max(scipy.spatial.distance.pdist(heavy_coord).tolist())
        lig_name = '_'.join([receptor, chain, resnum, resname, 'ligand']) + '.pdb'
        if not os.path.exists(os.path.join(lig_dir, receptor)):
            os.makedirs(os.path.join(lig_dir, receptor))
        prody.writePDB(os.path.join(lig_dir, receptor, lig_name), lig)

        rec_name = '_'.join([receptor, chain, resnum, resname, 'receptor']) + '.pdb'
        if not os.path.exists(os.path.join(rec_dir, receptor)):
            os.makedirs(os.path.join(rec_dir, receptor))
        prody.writePDB(os.path.join(rec_dir, receptor, rec_name), rec)

        splited.append(['"'+receptor+'"','"'+str(resname)+'"','"'+os.path.join(rec_dir,receptor, rec_name)+'"','"'+os.path.join(lig_dir,receptor, lig_name)+'"'])


    return splited
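A hedged usage sketch (paths and ID are illustrative; output directories are created on demand):

rows = split('105M', '/tmp/105M.pdb', '/tmp/rec', '/tmp/lig')
for row in rows:
    print(row)  # ["105M", resname, rec_path, lig_path], each value double-quoted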
Example #9
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file; sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = "{http://pfam.xfam.org/}"
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile

        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = "".join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError("could not parse a sequence without gaps from " + query)
    else:
        seq = "".join(query.split())

    import xml.etree.cElementTree as ET

    LOGGER.timeit("_pfam")
    timeout = int(kwargs.get("timeout", 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + " is not a valid sequence")

        fseq = ">Seq\n" + seq
        parameters = {"hmmdb": "pfam", "seq": fseq}
        enc_params = urllib.urlencode(parameters)
        request = urllib2.Request("http://hmmer.janelia.org/search/hmmscan", enc_params)

        url = urllib2.urlopen(request).geturl() + "?output=xml"
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception as err:
            raise ValueError("failed to parse results XML, check URL: " + url)

        matches = {}
        for child in root[0]:
            if child.tag == "hits":
                accession = child.get("acc")
                pfam_id = accession.split(".")[0]
                matches[pfam_id] = {}
                matches[pfam_id]["accession"] = accession
                matches[pfam_id]["class"] = "Domain"
                matches[pfam_id]["id"] = child.get("name")
                matches[pfam_id]["locations"] = {}
                matches[pfam_id]["locations"]["ali_end"] = child[0].get("alisqto")
                matches[pfam_id]["locations"]["ali_start"] = child[0].get("alisqfrom")
                matches[pfam_id]["locations"]["bitscore"] = child[0].get("bitscore")
                matches[pfam_id]["locations"]["end"] = child[0].get("alisqto")
                matches[pfam_id]["locations"]["evalue"] = child.get("evalue")
                matches[pfam_id]["locations"]["evidence"] = "hmmer v3.0"
                matches[pfam_id]["locations"]["hmm_end"] = child[0].get("alihmmto")
                matches[pfam_id]["locations"]["hmm_start"] = child[0].get("alihmmfrom")
                matches[pfam_id]["locations"]["significant"] = child[0].get("significant")
                matches[pfam_id]["locations"]["start"] = child[0].get("alisqfrom")
                matches[pfam_id]["type"] = "Pfam-A"
                return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader

            try:
                polymers = parsePDBHeader(seq[:4], "polymers")
            except Exception as err:
                LOGGER.warn("failed to parse header for {0} ({1})".format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != "UniProt":
                            continue
                        idcode = dbref.idcode
                        LOGGER.info(
                            "UniProt ID code {0} for {1} chain " "{2} will be used.".format(idcode, seq[:4], poly.chid)
                        )
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn("A UniProt ID code for PDB {0} could not be " "parsed.".format(repr(seq)))
                url = "http://pfam.xfam.org/protein/" + seq + "?output=xml"
            else:
                url = "http://pfam.xfam.org/protein/" + idcode + "?output=xml"

        else:
            url = "http://pfam.xfam.org/protein/" + seq + "?output=xml"

    LOGGER.debug("Retrieving Pfam search results: " + url)
    xml = None
    while LOGGER.timing("_pfam") < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError("Pfam search timed out or failed to parse results " "XML, check URL: " + url)
    else:
        LOGGER.report("Pfam search completed in %.2fs.", "_pfam")

    if xml.find(b"There was a system error on your last request.") > 0:
        LOGGER.warn("No Pfam matches found for: " + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError("failed to parse results XML, check URL: " + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError("failed to parse results XML, check URL: " + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results["matches"]
        except KeyError:
            raise ValueError("failed to parse results XML, check URL: " + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib["accession"][:7]
        except KeyError:
            raise ValueError("failed to parse results XML, check URL: " + url)

        if not re.search("^P(F|B)[0-9]{5}$", accession):
            raise ValueError("{0} does not match pfam accession" " format".format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault("locations", [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = "Query " + repr(query)
    else:
        query = "Query sequence"

    if matches:
        LOGGER.info(query + " matched {0} Pfam families.".format(len(matches)))
    else:
        LOGGER.info(query + " did not match any Pfam families.")
    return matches
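A hedged usage sketch for searchPfam (the sibling variants in this collection behave similarly), assuming network access; the ID comes from the docstring:

matches = searchPfam('1mkp')
for acc, info in (matches or {}).items():  # matches is None when nothing is found
    print(acc, info.get('id'))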
Example #10
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file; sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = '{http://pfam.xfam.org/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters)
        request = urllib2.Request('http://hmmer.janelia.org/search/hmmscan',
                                  enc_params)

        url = urllib2.urlopen(request).geturl() + '?output=xml'
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception as err:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        matches = {}
        for child in root[0]:
            if child.tag == 'hits':
                accession = child.get('acc')
                pfam_id = accession.split('.')[0]
                matches[pfam_id] = {}
                matches[pfam_id]['accession'] = accession
                matches[pfam_id]['class'] = 'Domain'
                matches[pfam_id]['id'] = child.get('name')
                matches[pfam_id]['locations'] = {}
                matches[pfam_id]['locations']['ali_end'] = child[0].get('alisqto')
                matches[pfam_id]['locations']['ali_start'] = child[0].get('alisqfrom')
                matches[pfam_id]['locations']['bitscore'] = child[0].get('bitscore')
                matches[pfam_id]['locations']['end'] = child[0].get('alisqto')
                matches[pfam_id]['locations']['evalue'] = child.get('evalue')
                matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
                matches[pfam_id]['locations']['hmm_end'] = child[0].get('alihmmto')
                matches[pfam_id]['locations']['hmm_start'] = child[0].get('alihmmfrom')
                matches[pfam_id]['locations']['significant'] = child[0].get('significant')
                matches[pfam_id]['locations']['start'] = child[0].get('alisqfrom')
                matches[pfam_id]['type'] = 'Pfam-A'
                return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'
            else:
                url = ('http://pfam.xfam.org/protein/' +
                       idcode + '?output=xml')

        else:
            url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
Example #11
File: pfam.py Project: uibcdf/ProDy
def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file. Sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request(
            'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        results_url = urllib2.urlopen(request).geturl()

        #res_params = { 'output' : 'xml' }
        res_params = {'format': 'tsv'}
        enc_res_params = urllib.urlencode(res_params)
        #modified_res_url = results_url + '?' + enc_res_params
        modified_res_url = results_url.replace(
            'results', 'download') + '?' + enc_res_params

        result_request = urllib2.Request(modified_res_url)
        # url = ( urllib2.urlopen(request).geturl() + '?output=xml')
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(
            seq[:MINSEQLEN]))

        try:
            #xml = urllib2.urlopen(result_request).read()
            tsv = urllib2.urlopen(result_request).read()
            # openURL(url, timeout=timeout).read()
        except:
            raise ValueError('No matching Pfam domains were found.')

        # try:
        #     root = ET.XML(xml)
        # except Exception as err:
        #     raise ValueError('failed to parse results XML, check URL: ' + modified_res_url)

        matches = {}
        #for child in root[0]:
        #if child.tag == 'hits':
        # accession = child.get('acc')
        # pfam_id = accession.split('.')[0]
        # matches[pfam_id]={}
        # matches[pfam_id]['accession']=accession
        # matches[pfam_id]['class']='Domain'
        # matches[pfam_id]['id']=child.get('name')
        # matches[pfam_id]['locations']={}
        # matches[pfam_id]['locations']['ali_end']=child[0].get('alisqto')
        # matches[pfam_id]['locations']['ali_start']=child[0].get('alisqfrom')
        # matches[pfam_id]['locations']['bitscore']=child[0].get('bitscore')
        # matches[pfam_id]['locations']['end']=child[0].get('alisqto')
        # matches[pfam_id]['locations']['evalue']=child.get('evalue')
        # matches[pfam_id]['locations']['evidence']='hmmer v3.0'
        # matches[pfam_id]['locations']['hmm_end']=child[0].get('alihmmto')
        # matches[pfam_id]['locations']['hmm_start']=child[0].get('alihmmfrom')
        # matches[pfam_id]['locations']['significant']=child[0].get('significant')
        # matches[pfam_id]['locations']['start']=child[0].get('alisqfrom')
        # matches[pfam_id]['type']='Pfam-A'
        # return matches

        if PY3K:
            tsv = tsv.decode()

        lines = tsv.split('\n')
        keys = lines[0].split('\t')
        root = {}
        for i, line in enumerate(lines[1:-1]):
            root[i] = {}
            for j, key in enumerate(keys):
                root[i][key] = line.split('\t')[j]

        for child in root.values():
            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id] = {}
            matches[pfam_id]['accession'] = accession
            matches[pfam_id]['class'] = 'Domain'
            matches[pfam_id]['id'] = child['Family id']
            matches[pfam_id]['locations'] = {}
            matches[pfam_id]['locations']['ali_end'] = child['Ali. End']
            matches[pfam_id]['locations']['ali_start'] = child['Ali. Start']
            matches[pfam_id]['locations']['bitscore'] = child['Bit Score']
            matches[pfam_id]['locations']['end'] = child['Env. End']
            matches[pfam_id]['locations']['cond_evalue'] = child[
                'Cond. E-value']
            matches[pfam_id]['locations']['ind_evalue'] = child['Ind. E-value']
            matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
            matches[pfam_id]['locations']['hmm_end'] = child['Model End']
            matches[pfam_id]['locations']['hmm_start'] = child['Model Start']
            #matches[pfam_id]['locations']['significant'] = child['significant']
            matches[pfam_id]['locations']['start'] = child['Env. Start']
            matches[pfam_id]['type'] = 'Pfam-A'
        return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'.format(
                    seq[:4], str(err)))
            else:
                chid = seq[4:].upper()

                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        accession = dbref.accession
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'.format(
                                        idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = prefix + 'protein/' + seq + '?output=xml'
            else:
                url = prefix + 'protein/' + idcode + '?output=xml'

        else:
            url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml not in ['PEND', 'RUN']:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif xml.find(b'No valid UniProt accession or ID') > 0:
        try:
            url = prefix + 'protein/' + accession + '?output=xml'
            xml = openURL(url, timeout=timeout).read()
        except:
            try:
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq)
            except:
                raise ValueError('No valid UniProt accession or ID for: ' +
                                 seq)

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        key = '{' + prefix + '}'
        results = dictElement(root[0], key)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
Example #12
File: pfam.py Project: npabon/ProDy
def searchPfam(query, search_b=False, skip_a=False, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file; sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg search_b: search Pfam-B families when **True**
    :type search_b: bool

    :arg skip_a: do not search Pfam-A families when **True**
    :type skip_a: bool

    :arg ga: use gathering threshold when **True**
    :type ga: bool

    :arg evalue: user specified e-value cutoff, must be smaller than 10.0
    :type evalue: float

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = '{http://pfam.sanger.ac.uk/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        urlextension = ''
        if kwargs:
            ga = int(kwargs.get('ga', 1))
            if not (ga == 1 or ga == 0):
                raise ValueError('ga must be either 0 or 1')

            evalue = kwargs.get('evalue', None)
            if evalue:
                if not float(evalue) <= 10.0:
                    raise ValueError('evalue must be a valid float < 10.0')
                urlextension = urlextension + '&evalue=' + str(evalue)
            else:
                urlextension = urlextension + '&ga=' + str(ga)

        search_b = int(bool(search_b))
        skip_a = int(bool(skip_a))
        if skip_a == 1:
            search_b = 1

        urlextension = urlextension + '&searchBs=' + str(search_b)
        urlextension = urlextension + '&skipAs='******'http://pfam.sanger.ac.uk/search/sequence?seq=' + str(seq) +
               urlextension + '&output=xml')
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception as err:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        try:
            url = dictElement(root[0], prefix)['result_url']
        except (IndexError, KeyError):
            raise ValueError('failed to parse results XML, check URL: ' + url)

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = 'http://pfam.sanger.ac.uk/protein/' + seq + '?output=xml'
            else:
                url = ('http://pfam.sanger.ac.uk/protein/' +
                       idcode + '?output=xml')

        else:
            url = 'http://pfam.sanger.ac.uk/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
Example #13
def blast(pdb_path):

    cdir = os.getcwd()
    tdir = tempfile.mkdtemp()
    os.chdir(tdir)

    receptor = os.path.basename(os.path.splitext(pdb_path)[0])
    pdbHead = prody.parsePDBHeader(pdb_path)
    pdbFile = prody.parsePDB(pdb_path)

    ligands = []
    for chem in pdbHead['chemicals']:
        ligands.append([chem.chain, str(chem.resnum), chem.resname, chem.name])

    blast_result = []
    for chain, resnum, resname, name in ligands:

        rec = pdbFile.select('not (chain {} resnum {})'.format(chain, resnum))
        ligand = pdbFile.select('chain {} resnum {}'.format(chain, resnum))

        cen_ligand = prody.calcCenter(ligand)

        res_coll = []
        ligCoords = ligand.getCoords()
        print('lig_size', len(ligCoords))

        sequence = ''
        i = 4
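        # grow the search radius one angstrom at a time until the selected
        # pocket yields a sequence of at least 100 residues (chain 'A' is
        # assumed by the getSequence() call below)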
        while len(sequence) < 100:

            for center in ligCoords:
                around_atoms = rec.select(
                    'same residue as within {} of center'.format(i),
                    center=center)
                if around_atoms is None:
                    continue
                res_coll.append(around_atoms)
                #res_indices = around_atoms.getResindices()
                #print(around_atoms.getHierView()['A'].getSequence())
                #print (res_indices)
                #res_coll = res_coll | set(res_indices)
            resindices = reduce(lambda x, y: x | y, res_coll)
            sequence = resindices.getHierView()['A'].getSequence()
            print('sequence', i, len(sequence), sequence)
            i += 1

        with open('sequence.fasta', 'w') as fout:
            fout.write(">receptor\n" + sequence + '\n')

        cmd = 'blastp -db {} -query sequence.fasta -outfmt 5 -out result'.format(
            BLASTDB)
        #print(os.getcwd())

        cl = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
        cl.wait()

        #print(os.listdir(os.getcwd()))

        dtree = xml.dom.minidom.parse("result")
        collection = dtree.documentElement
        hits = collection.getElementsByTagName("Hit")

        hit_result = []

        for hit in hits:
            hit_id = hit.getElementsByTagName('Hit_id')[0].childNodes[0].data
            hsps = hit.getElementsByTagName('Hit_hsps')[0]
            identity = hsps.getElementsByTagName(
                'Hsp_identity')[0].childNodes[0].data
            align_len = hsps.getElementsByTagName(
                'Hsp_align-len')[0].childNodes[0].data
            qseq = hsps.getElementsByTagName('Hsp_qseq')[0].childNodes[0].data
            hseq = hsps.getElementsByTagName('Hsp_hseq')[0].childNodes[0].data
            midline = hsps.getElementsByTagName(
                'Hsp_midline')[0].childNodes[0].data

            blast_result.append([
                receptor, hit_id,
                str(identity),
                str(align_len),
                str(len(sequence)), midline, hseq, sequence
            ])

    return blast_result
Example #14
File: pfam.py Project: karolamik13/ProDy
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file; sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = '{http://pfam.xfam.org/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.parse.urlencode(parameters).encode('utf-8')
        request = urllib.request.Request('http://hmmer.janelia.org/search/hmmscan',
                                         enc_params)

        url = urllib.request.urlopen(request).geturl() + '?output=xml'
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()
        
        try:
            root = ET.XML(xml)
        except Exception as err:
            raise ValueError('failed to parse results XML, check URL: ' + url)
        matches = {}
        for child in root[0]:
            if child.tag == 'hits':
                accession = child.get('acc')
                pfam_id = accession.split('.')[0]
                matches[pfam_id] = {}
                matches[pfam_id]['accession'] = accession
                matches[pfam_id]['class'] = 'Domain'
                matches[pfam_id]['id'] = child.get('name')
                matches[pfam_id]['locations'] = {}
                matches[pfam_id]['locations']['ali_end'] = child[0].get('alisqto')
                matches[pfam_id]['locations']['ali_start'] = child[0].get('alisqfrom')
                matches[pfam_id]['locations']['bitscore'] = child[0].get('bitscore')
                matches[pfam_id]['locations']['end'] = child[0].get('alisqto')
                matches[pfam_id]['locations']['evalue'] = child.get('evalue')
                matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
                matches[pfam_id]['locations']['hmm_end'] = child[0].get('alihmmto')
                matches[pfam_id]['locations']['hmm_start'] = child[0].get('alihmmfrom')
                matches[pfam_id]['locations']['significant'] = child[0].get('significant')
                matches[pfam_id]['locations']['start'] = child[0].get('alisqfrom')
                matches[pfam_id]['type'] = 'Pfam-A'
                return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'
            else:
                url = ('http://pfam.xfam.org/protein/' +
                       idcode + '?output=xml')

        else:
            url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
Example #15
def split_pdb(uid, pdb_file, init="split_pdb_init"):
    """ Iterates through every ligand molecule in the crystal structure (frequently there are a few). For each ligand,
    selects and crops its binding site (any atom of protein/DNA/cofactor within the cutoff distance). Saves pairs of
    files: ligand + this ligand's binding site.

    Example:
    <pre lang="python">
    split_pdb('105M', 'download/105M.pdb')
    </pre>

    Output:
    <pre lang="python">
    [['105M_A_155_HEM', 'split/105M_A_155_HEM/105M_A_155_HEM_ligand.pdb', 'split/105M_A_155_HEM/105M_A_155_HEM_bindsite.pdb', 30, 100]]
    </pre>

    :param uid: string (unique id used to name the output pair)
    :param pdb_file: string (relative path to the file to split)
    :param init: string (init function in this module; supplies cutoff_dist,
        min_rec_atoms, min_lig_atoms, and discard_h)
    :return:
    nested list of dimensions [num_pairs x [string, string, string, int, int]] or \
    [num_pairs x [pair_name, lig_file, bindsite_file, lig_num_atoms, bindsite_num_atoms]]
    """

    init = eval(init)
    pdb_path = os.path.join(init.db_root, pdb_file)
    # parse PDB file
    pr_pdb = pr.parsePDB(pdb_path)
    assert pr_pdb.numAtoms() > (init.min_lig_atoms+init.min_rec_atoms), \
        "not enough atoms in this pdb" + str(pr_pdb.numAtoms())

    # parse header of the PDB file
    pr_header = pr.parsePDBHeader(pdb_path)
    pdb_id = pr_header["identifier"]
    ligs = []

    # retrieve names of the chemicals from the header of the PDB file
    for chem in pr_header['chemicals']:
        ligs.append([chem.chain, str(chem.resnum), chem.resname])
    out_filenames = []

    for lig_chain, lig_resnum, lig_resname in ligs:
        if init.discard_h:
            lig = pr_pdb.select('noh chain {} and resnum {}'.format(
                lig_chain, lig_resnum))
            rec = pr_pdb.select(
                'noh and not water not (chain {} resnum {})'.format(
                    lig_chain, lig_resnum))
        else:
            lig = pr_pdb.select('chain {} resnum {}'.format(
                lig_chain, lig_resnum))
            rec = pr_pdb.select('not water not (chain {} resnum {})'.format(
                lig_chain, lig_resnum))
        lig_coords = lig.getCoords()
        lig_atom_num = lig.numAtoms()

        # escape the loop without writing anything if the number of ligand atoms is too small
        if lig_atom_num < init.min_lig_atoms:
            continue

        # select residues of the binding site
        bindsite_Segindices = []
        bindsite_Chids = []
        bindsite_Resnums = []
        for atom_coord in lig_coords:
            around_atoms = rec.select(
                'same residue as within {} of center'.format(init.cutoff_dist),
                center=atom_coord)
            bindsite_Segindices = bindsite_Segindices + list(
                around_atoms.getSegindices())
            bindsite_Chids = bindsite_Chids + list(around_atoms.getChids())
            bindsite_Resnums = bindsite_Resnums + list(
                around_atoms.getResnums())

        # select only unique atoms
        bindsite_names = [
            str(bindsite_Segindices[i]) + str(bindsite_Chids[i]) +
            str(bindsite_Resnums[i]) for i in range(len(bindsite_Segindices))
        ]
        bindsite_names, unq_idx = np.unique(np.asarray(bindsite_names),
                                            return_index=True)
        segindices = np.asarray(bindsite_Segindices)[unq_idx]
        chids = np.asarray(bindsite_Chids)[unq_idx]
        resnums = np.asarray(bindsite_Resnums)[unq_idx]

        # proofcheck that ligand and receptor residues do not overlap
        lig_names = np.unique(
            np.asarray([
                str(lig.getSegindices()[i]) + str(lig.getChids()[i]) +
                str(lig.getResnums()[i]) for i in range(lig_atom_num)
            ]))
        assert len(np.intersect1d(bindsite_names, lig_names)
                   ) == 0, "broken selection: binding site and ligand overlap"

        # select the receptor atoms to save
        num_bindsite_res = len(unq_idx)
        prody_cmd = " or ".join([
            "(segindex {} chid {} resnum {})".format(segindices[i], chids[i],
                                                     resnums[i])
            for i in range(num_bindsite_res)
        ])
        binding_site = rec.select(prody_cmd)

        # skip if the binding site is too small (see the min_rec_atoms parameter)
        if binding_site is None or binding_site.numAtoms() < init.min_rec_atoms:
            continue

        # write the ligand file and the receptor file
        pair_name = '_'.join(
            [pdb_id, str(lig_chain),
             str(lig_resnum),
             str(lig_resname)])
        lig_name = pair_name + "_ligand.pdb"
        bindsite_name = pair_name + "_bindsite.pdb"
        os.makedirs(os.path.join(init.db_root, init.split_dir, pair_name))
        lig_outpath = os.path.join(init.split_dir, pair_name, lig_name)
        bindsite_outpath = os.path.join(init.split_dir, pair_name,
                                        bindsite_name)
        pr.writePDB(os.path.join(init.db_root, lig_outpath), lig)
        pr.writePDB(os.path.join(init.db_root, bindsite_outpath), binding_site)
        out_filenames.append([
            pair_name, lig_outpath, bindsite_outpath,
            lig.numAtoms(),
            binding_site.numAtoms()
        ])
    return out_filenames
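
A minimal usage sketch based on the docstring above; the call mirrors the docstring's own example, and the unpacking order follows the documented return value:

<pre lang="python">
# Hypothetical call; assumes the module's init configuration is in place.
pairs = split_pdb('105M', 'download/105M.pdb')
for pair_name, lig_file, site_file, n_lig, n_site in pairs:
    print('{}: {} ligand atoms, {} binding-site atoms'.format(
        pair_name, n_lig, n_site))
</pre>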
Example #16
0
File: pfam.py Project: fongchun/ProDy
def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file. Sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = { 'hmmdb' : 'pfam', 'seq': fseq }
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request('https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        results_url = urllib2.urlopen(request).geturl()

        # fetch the hmmscan results as TSV from the download endpoint
        res_params = { 'format' : 'tsv' }
        enc_res_params = urllib.urlencode(res_params)
        modified_res_url = results_url.replace('results','download') + '?' + enc_res_params

        result_request = urllib2.Request(modified_res_url)
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        tsv = urllib2.urlopen(result_request).read()

        matches = {}

        # parse the TSV into one dict per hit, keyed by the header row
        lines = tsv.split('\n')
        keys = lines[0].split('\t')
        root = {}
        for i, line in enumerate(lines[1:-1]):
            root[i] = dict(zip(keys, line.split('\t')))

        for child in root.values():
            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id]={}
            matches[pfam_id]['accession'] = accession
            matches[pfam_id]['class'] = 'Domain'
            matches[pfam_id]['id'] = child['Family id']
            matches[pfam_id]['locations'] = {}
            matches[pfam_id]['locations']['ali_end'] = child['Ali. End']
            matches[pfam_id]['locations']['ali_start'] = child['Ali. Start']
            matches[pfam_id]['locations']['bitscore'] = child['Bit Score']
            matches[pfam_id]['locations']['end'] = child['Env. End']
            matches[pfam_id]['locations']['cond_evalue'] = child['Cond. E-value']
            matches[pfam_id]['locations']['ind_evalue'] = child['Ind. E-value']
            matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
            matches[pfam_id]['locations']['hmm_end'] = child['Model End']
            matches[pfam_id]['locations']['hmm_start'] = child['Model Start']
            matches[pfam_id]['locations']['start'] = child['Env. Start']
            matches[pfam_id]['type'] = 'Pfam-A'
        return matches

    else:
        if len(seq) <= 5:
            idcode = None
            # default to empty results so the loop below is safe if parsing fails
            polymers = []
            chid = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != 'UniProt':
                        continue
                    idcode = dbref.idcode
                    accession = dbref.accession
                    LOGGER.info('UniProt ID code {0} for {1} chain '
                                '{2} will be used.'
                                .format(idcode, seq[:4], poly.chid))
                    break
                if idcode is not None:
                    break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = prefix + 'protein/' + seq + '?output=xml'
            else:
                url = prefix + 'protein/' + idcode + '?output=xml'

        else:
            url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml not in ['PEND','RUN']:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif xml.find(b'No valid UniProt accession or ID') > 0:
        try:
            url = prefix + 'protein/' + accession + '?output=xml'
            xml = openURL(url, timeout=timeout).read()
        except:
            try:
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq)
            except:
                raise ValueError('No valid UniProt accession or ID for: ' + seq)

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        key = '{' + prefix + '}'
        results = dictElement(root[0], key)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
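
A hedged usage sketch for the function above; the query form (PDB ID plus chain) and the timeout keyword come straight from the docstring, and 'locations' is set for every match by the parsing code:

<pre lang="python">
# Hypothetical query by PDB ID with chain identifier.
matches = searchPfam('1mkpA', timeout=120)
if matches:
    for accession, match in matches.items():
        print(accession, match['locations'])
</pre>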
Example #17
0
def split_ligand(bucket, table_idx, param, input_data):
    '''
    Split ligands from a PDB file.

    Parses the PDB header; ligand records are listed under the key
    'chemicals'. Each ligand is selected by chain and residue number,
    written to its own PDB file, and recorded in the database.

    Args:
        table_idx: id for the split-ligand table
        param: dict
            {
                'output_folder':'...',
                'input_download_folder':'...',
            }
        input_data: [str], (str,) or str
            e.g. ['1a2b'], ('3eml',) or '3eln'

    Returns:
        None; one record per ligand is inserted into the table.
    '''
    try:
        if type(input_data).__name__ in ['tuple', 'list']:
            input_data = input_data[0]  # do not allow x = x[0]

        receptor = input_data  # todo (maksym) better representation of the input data

        output_folder = param['output_folder']
        output_folder = '{}_{}'.format(table_idx, output_folder)
        input_download_folder = param['input_download_folder']
        pdb_dir = os.path.join(
            data_dir, input_download_folder
        )  # todo (maksym) download_folder = source_folder
        pdb_path = os.path.join(pdb_dir, receptor + '.pdb')

        parsed_pdb = prody.parsePDB(pdb_path)
        parsed_header = prody.parsePDBHeader(pdb_path)

        output_lig_dir = os.path.join(
            data_dir, output_folder,
            receptor)  # todo(maksym) datadir is config.data_dir
        _makedir(output_lig_dir)

        ligands = []
        for chem in parsed_header['chemicals']:
            ligands.append([chem.chain, str(chem.resnum), chem.resname])

        for chain, resnum, resname in ligands:
            try:
                lig = parsed_pdb.select('chain {} resnum {}'.format(
                    chain, resnum))
                # next() builtin works on both Python 2 and 3, unlike .next()
                resid = next(lig.getHierView().iterResidues()).getResindex()
                resid = str(resid)
                heavy_lig = lig.select('not hydrogen')
                heavy_atom = heavy_lig.numAtoms()
                heavy_coord = heavy_lig.getCoords()
                # use the maximum pairwise distance between heavy-atom
                # coordinates as the ligand size
                max_size_on_axis = max(
                    scipy.spatial.distance.pdist(heavy_coord).tolist())
                lig_name = '_'.join(
                    [receptor, chain, resnum, resname, 'ligand']) + '.pdb'
                prody.writePDB(os.path.join(output_lig_dir, lig_name), lig)

                record = [
                    receptor, chain, resnum, resname, resid, heavy_atom,
                    max_size_on_axis, 1, 'success'
                ]
                records = [record]
                db.insert(table_idx, records, bucket=bucket)
            except Exception as e:
                record = [receptor, chain, resnum, resname, 0, 0, 0, 0, str(e)]
                records = [record]
                db.insert(table_idx, records, bucket=bucket)

    except Exception as e:
        print(e)
        raise Exception(str(e))
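
A minimal usage sketch, assuming the surrounding pipeline provides `bucket`, the `db` handle, and a populated download folder (the folder names here are hypothetical):

<pre lang="python">
# Hypothetical configuration mirroring the docstring's param dict.
param = {'output_folder': 'splited_ligand',
         'input_download_folder': 'download'}
split_ligand(bucket, 2, param, '3eml')
</pre>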