class ChEMBLTargetCofactorProvider(StashableBase):
    """Accessors for ChEMBL target cofactors."""

    def __init__(self, **kwargs):
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__dirName = "ChEMBL-cofactors"
        super(ChEMBLTargetCofactorProvider, self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__fD = self.__reload(self.__dirPath, **kwargs)
        #

    def testCache(self, minCount=1):
        logger.info("ChEMBL cofactor count %d", len(self.__fD["cofactors"]) if "cofactors" in self.__fD else 0)
        return bool(self.__fD and "cofactors" in self.__fD and len(self.__fD["cofactors"]) > minCount)

    def hasTarget(self, rcsbEntityId):
        return rcsbEntityId.upper() in self.__fD["cofactors"]

    def getTargets(self, rcsbEntityId):
        try:
            return self.__fD["cofactors"][rcsbEntityId.upper()]
        except Exception:
            return []

    def __getCofactorDataPath(self):
        return os.path.join(self.__dirPath, "ChEMBL-cofactor-data.json")

    def reload(self):
        self.__fD = self.__reload(self.__dirPath, useCache=True)
        return True

    def __reload(self, dirPath, **kwargs):
        startTime = time.time()
        fD = {}
        useCache = kwargs.get("useCache", True)
        ok = False
        cofactorPath = self.__getCofactorDataPath()
        #
        logger.info("useCache %r cofactorPath %r", useCache, cofactorPath)
        if useCache and self.__mU.exists(cofactorPath):
            fD = self.__mU.doImport(cofactorPath, fmt="json")
            ok = True
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        logger.info("Completed reload with status (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
        return fD

    def buildCofactorList(self, sequenceMatchFilePath, crmpObj=None, lnmpObj=None, maxActivity=5):
        """Build target cofactor list for the matching entities in the input sequence match file.

        Args:
            sequenceMatchFilePath (str): sequence match output file path
            crmpObj (obj, optional): instance of ChemRefMappingProviderObj()
            lnmpObj (obj, optional): instance of LigandNeighborMappingProviderObj(). Defaults to None.
            maxActivity (int, optional): maximum number of prioritized activity records per target

        Returns:
            bool: True for success or False otherwise

            Example activity record -

                        "CHEMBL3243": [
                    {
                        "assay_chembl_id": "CHEMBL655768",
                        "assay_description": "In vitro inhibitory activity against recombinant human CD45 using fluorescein diphosphate (FDP) as a substrate",
                        "assay_type": "B",
                        "canonical_smiles": "COC(=O)c1ccc(C2=CC(=O)C(=O)c3ccccc32)cc1",
                        "ligand_efficiency": {
                        "bei": "19.78",
                        "le": "0.36",
                        "lle": "3.11",
                        "sei": "9.57"
                        },
                        "molecule_chembl_id": "CHEMBL301254",
                        "parent_molecule_chembl_id": "CHEMBL301254",
                        "pchembl_value": "5.78",
                        "standard_relation": "=",
                        "standard_type": "IC50",
                        "standard_units": "nM",
                        "standard_value": "1650.0",
                        "target_chembl_id": "CHEMBL3243"
                    },
        """
        rDL = []
        mD = self.__mU.doImport(sequenceMatchFilePath, fmt="json")
        #
        chP = ChEMBLTargetProvider(cachePath=self.__cachePath, useCache=False)
        # ---
        chaP = ChEMBLTargetActivityProvider(cachePath=self.__cachePath, useCache=True)
        #
        provenanceSource = "ChEMBL"
        refScheme = "PDB entity"
        assignVersion = chP.getAssignmentVersion()
        for queryId, matchDL in mD.items():
            qCmtD = self.__decodeComment(queryId)
            unpId = qCmtD["uniprotId"]
            queryTaxId = qCmtD["taxId"] if "taxId" in qCmtD else None
            chemblIdL = qCmtD["chemblId"].split(",")
            if queryTaxId == "-1":
                logger.info("Skipping target with missing taxonomy %r (%r)", unpId, chemblIdL)
                continue
            queryName = chP.getTargetDescription(unpId)
            for chemblId in chemblIdL:
                if not chaP.hasTargetActivity(chemblId):
                    logger.debug("Skipping target %r (%r)", unpId, chemblId)
                    # continue
                # --
                chemCompNeighborsD = {}
                if lnmpObj:
                    for matchD in matchDL:
                        tCmtD = self.__decodeComment(matchD["target"])
                        entryId = tCmtD["entityId"].split("_")[0]
                        entityId = tCmtD["entityId"].split("_")[1]
                        rcsbEntityId = entryId + "_" + entityId
                        chemCompIdList = lnmpObj.getLigandNeighbors(rcsbEntityId)
                        chemCompNeighborsD.update({k: True for k in chemCompIdList})
                # --
                for matchD in matchDL:
                    tCmtD = self.__decodeComment(matchD["target"])
                    entryId = tCmtD["entityId"].split("_")[0]
                    entityId = tCmtD["entityId"].split("_")[1]
                    #
                    taDL = chaP.getTargetActivity(chemblId)
                    logger.debug("Target %r has (%d) activity records", chemblId, len(taDL))
                    # ---
                    actL = []
                    for taD in taDL:
                        if taD["assay_type"] in ["B", "F"]:
                            try:
                                if taD["standard_units"] == "nM" and taD["standard_value"] and float(taD["standard_value"]) > 0.0:
                                    pV = -math.log10(float(taD["standard_value"]) * 10.0e-9)
                                    actD = {
                                        "cofactor_id": taD["molecule_chembl_id"],
                                        "assay_id": taD["assay_chembl_id"],
                                        "assay_description": taD["assay_description"],
                                        "measurement_type": "p" + taD["standard_type"],
                                        "measurement_value": round(pV, 2),
                                        "smiles": taD["canonical_smiles"],
                                        "molecule_name": taD["molecule_name"],
                                        "inchi_key": taD["inchi_key"],
                                        "action": taD["action"],
                                        "moa": taD["moa"],
                                        "max_phase": taD["max_phase"],
                                    }
                                    actD = self.__addLocalIds(actD, crmpObj=crmpObj)
                                    actL.append(actD)
                            except Exception as e:
                                logger.debug("Failing for tAD %r with %s", taD, str(e))

                    # ---
                    actL = self.__activityListSelect(actL, chemCompNeighborsD, maxActivity=maxActivity)
                    if not actL:
                        logger.debug("No ChEMBL cofactors for %s %s", chemblId, unpId)
                    # ---
                    # aligned_target.entity_beg_seq_id (current target is PDB entity in json)
                    # aligned_target.target_beg_seq_id (current query is target seq in json)
                    # aligned_target.length
                    fpL = []
                    if "alignedRegions" in matchD:
                        fpL = [
                            {
                                "entity_beg_seq_id": arD["targetBegin"],
                                "target_beg_seq_id": arD["queryBegin"],
                                "length": arD["targetEnd"] - arD["targetBegin"],
                            }
                            for arD in matchD["alignedRegions"]
                        ]
                    else:
                        fpL = [
                            {
                                "entity_beg_seq_id": matchD["targetBegin"],
                                "target_beg_seq_id": matchD["queryBegin"],
                                "length": matchD["alignLen"],
                            }
                        ]
                    # ---
                    rD = {
                        "entry_id": entryId,
                        "entity_id": entityId,
                        "query_uniprot_id": unpId,
                        "query_id": chemblId,
                        "query_id_type": "ChEMBL",
                        "query_name": queryName,
                        "provenance_source": provenanceSource,
                        "reference_scheme": refScheme,
                        "assignment_version": assignVersion,
                        "query_taxonomy_id": int(queryTaxId) if queryTaxId else None,
                        "target_taxonomy_id": int(matchD["targetTaxId"]) if "targetTaxId" in matchD else None,
                        #
                        "aligned_target": fpL,
                        #
                        "taxonomy_match_status": matchD["taxonomyMatchStatus"] if "taxonomyMatchStatus" in matchD else None,
                        "lca_taxonomy_id": matchD["lcaTaxId"] if "lcaTaxId" in matchD else None,
                        "lca_taxonomy_name": matchD["lcaTaxName"] if "lcaTaxName" in matchD else None,
                        "lca_taxonomy_rank": matchD["lcaRank"] if "lcaRank" in matchD else None,
                        "cofactors": actL,
                    }
                    rDL.append(rD)
            #
        qD = {}
        for rD in rDL:
            eId = rD["entry_id"] + "_" + rD["entity_id"]
            qD.setdefault(eId, []).append(rD)
        #
        fp = self.__getCofactorDataPath()
        tS = datetime.datetime.now().isoformat()
        # vS = datetime.datetime.now().strftime("%Y-%m-%d")
        vS = assignVersion
        ok = self.__mU.doExport(fp, {"version": vS, "created": tS, "cofactors": qD}, fmt="json", indent=3)
        return ok

    def __addLocalIds(self, cfD, crmpObj=None):
        #
        if crmpObj:
            localIdL = crmpObj.getLocalIds("CHEMBL", cfD["cofactor_id"])
            if localIdL:
                localId = localIdL[0]
                if localId.startswith("PRD_"):
                    cfD["prd_id"] = localId
                else:
                    cfD["chem_comp_id"] = localId
        return cfD

    def __activityListSelect(self, activityDL, chemCompNeighborsD, maxActivity=5):
        retL = []
        mappedNeighborL = []
        unmappedL = activityDL
        #
        if chemCompNeighborsD:
            unmappedL = []
            # Select out any cases where the molecule maps to a neighboring chemical component.
            for activityD in activityDL:
                if "chem_comp_id" in activityD and activityD["chem_comp_id"] in chemCompNeighborsD:
                    activityD["neighbor_in_pdb"] = "Y"
                    mappedNeighborL.append(activityD)
                else:
                    activityD["neighbor_in_pdb"] = "N"
                    unmappedL.append(activityD)
        #
        numLeft = maxActivity - len(mappedNeighborL)
        if numLeft > 0:
            unmappedL = sorted(unmappedL, key=lambda k: k["measurement_value"], reverse=True)
            retL = mappedNeighborL
            retL.extend(unmappedL[:numLeft])
            retL = sorted(retL, key=lambda k: k["measurement_value"], reverse=True)
        else:
            logger.debug("Mapped neighbor cofactors (%d) excluded unmapped (%d)", len(mappedNeighborL), len(unmappedL))
            retL = sorted(mappedNeighborL, key=lambda k: k["measurement_value"], reverse=True)
        return retL

    def __decodeComment(self, comment, separator="|"):
        dD = {}
        try:
            ti = iter(comment.split(separator))
            dD = {tup[1]: tup[0] for tup in zip(ti, ti)}
        except Exception:
            pass
        return dD
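
# A minimal usage sketch (not part of the original source). The cache path and the
# sequence match file path are hypothetical placeholders, and "4HHB_1" is an
# illustrative PDB entity identifier.
def _demoChemblCofactors(cachePath, seqMatchFilePath):
    cfp = ChEMBLTargetCofactorProvider(cachePath=cachePath, useCache=True)
    if not cfp.testCache(minCount=1):
        # Build the cofactor list from a prebuilt sequence match file, then reload it.
        cfp.buildCofactorList(seqMatchFilePath, maxActivity=5)
        cfp.reload()
    return cfp.getTargets("4HHB_1") if cfp.hasTarget("4HHB_1") else []
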
class DictMethodRunnerTests(unittest.TestCase):
    def setUp(self):
        self.__export = True
        self.__numProc = 2
        self.__fileLimit = 200
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        configPath = os.path.join(mockTopPath, "config",
                                  "dbload-setup-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        #
        self.__testCaseList = [
            {
                "contentType": "pdbx_core",
                "mockLength": 50,
                "mergeContent": ["vrpt"]
            },
            {
                "contentType": "bird_chem_comp_core",
                "mockLength": 17,
                "mergeContent": None
            },
        ]
        #
        self.__modulePathMap = self.__cfgOb.get(
            "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def __runContentType(self, contentType, mockLength, mergeContent):
        """Read and process test fixture data files from the input content type."""
        try:
            dP = DictionaryApiProviderWrapper(self.__cfgOb,
                                              self.__cachePath,
                                              useCache=True)
            dictApi = dP.getApiByName(contentType)
            rP = DictMethodResourceProvider(self.__cfgOb,
                                            configName=self.__configName,
                                            cachePath=self.__cachePath,
                                            siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi,
                                   modulePathMap=self.__modulePathMap,
                                   resourceProvider=rP)
            locatorObjList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContent)
            containerList = self.__rpP.getContainerList(locatorObjList)
            #
            logger.debug("Length of locator list %d\n", len(locatorObjList))
            self.assertGreaterEqual(len(locatorObjList), mockLength)
            for container in containerList:
                cName = container.getName()
                #
                # if cName not in ["1B5F"]:
                #    continue
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output",
                                            cName + "-with-method.cif")
                    self.__mU.doExport(savePath, [container], fmt="mmcif")

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testMethodRunner(self):
        """Test method runner for multiple content types."""
        for tD in self.__testCaseList:
            self.__runContentType(tD["contentType"], tD["mockLength"],
                                  tD["mergeContent"])

    def testMethodRunnerSetup(self):
        """Test the setup methods for method runner class"""
        try:
            dP = DictionaryApiProviderWrapper(self.__cfgOb,
                                              self.__cachePath,
                                              useCache=True)
            dictApi = dP.getApiByName("pdbx")
            rP = DictMethodResourceProvider(self.__cfgOb,
                                            configName=self.__configName,
                                            cachePath=self.__cachePath,
                                            siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi,
                                   modulePathMap=self.__modulePathMap,
                                   resourceProvider=rP)
            ok = dmh is not None
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
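
# A minimal sketch for running the fixtures above directly via the standard unittest
# entry point (assumes the module-level imports and TOPDIR/HERE constants used above).
if __name__ == "__main__":
    unittest.main()
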
Example #3
    def assemble(self, maxRFactor=10.0):
        """Concatenate models into the input file path subject to the R value constraint.
        Relabel the models sequentially for each parent chemical component.

        Args:
            assembleModelPath (str): path for concatenated model file
            maxRFactor (float, optional): limiting R-value. Defaults to 10.0.

        Returns:
            (bool): True for success or False otherwise

        """
        dataContainerL = []
        mU = MarshalUtil(workPath=self.__cachePath)
        # combine CCDC and COD model build index files
        modelIndexD = self.__ccdcmb.fetchModelIndex()
        codD = self.__codmb.fetchModelIndex()
        for pId, mDL in codD.items():
            if pId in modelIndexD:
                modelIndexD[pId] += mDL
            else:
                modelIndexD[pId] = mDL
        #
        modelIndexD = self.__addPriorMatchDetails(modelIndexD)
        modelIndexD = self.__updateVariantDetails(modelIndexD)
        priorMapD = {}
        for _, mDL in modelIndexD.items():
            try:
                mDLS = sorted(mDL,
                              key=itemgetter("priorModelId", "variantType",
                                             "rFactor"),
                              reverse=False)
            except Exception:
                mDLS = sorted(mDL,
                              key=itemgetter("priorModelId", "variantType"),
                              reverse=False)
            numStd = 0
            matchIdD = {}
            for mD in mDLS:
                isStd = False
                if mD["variantType"].startswith("A"):
                    numStd += 1
                    isStd = True
                #
                if "rFactor" in mD and mD[
                        "rFactor"] and mD["rFactor"] > maxRFactor:
                    logger.info("Skipping model %s isStd (%r) rValue (%r)",
                                mD["modelId"], isStd, mD["rFactor"])
                    continue
                if numStd and not isStd:
                    logger.info("Skipping model %s isStd (%r) numStd (%d)",
                                mD["modelId"], isStd, numStd)
                    continue
                #
                # Exclude duplicate matches in priority order ...
                if mD["matchId"] in matchIdD:
                    logger.info("Skipping duplicate matchId %r in %r",
                                mD["matchId"], mD["modelId"])
                    continue
                #
                matchIdD[mD["matchId"]] = True

                cL = mU.doImport(mD["modelPath"], fmt="mmcif")
                logger.debug("Read %d from %s", len(cL), mD["modelPath"])
                dataContainerL.extend(cL)
                if not mD["priorModelId"].startswith("Z"):
                    priorMapD[mD["modelId"]] = (mD["priorModelId"],
                                                mD["priorMatchDate"])
        #
        logger.debug("priorMapD %r", priorMapD)
        fn = "chem_comp_models-%s.cif" % self.__getToday()
        assembleModelPath = os.path.join(self.__ccdcmb.getModelDirFilePath(),
                                         fn)
        # -- relabel
        parentModelCountD = defaultdict(int)
        priorIdLD = {}
        for dataContainer in dataContainerL:
            tModelId = dataContainer.getName()
            tId = self.__parseId(tModelId)[0]
            pId = tId.split("|")[0]
            if tModelId in priorMapD:
                pCount = self.__parseId(priorMapD[tModelId][0])[1]
                priorIdLD.setdefault(pId, []).append(pCount)
                self.__replaceModelId(dataContainer, tModelId,
                                      priorMapD[tModelId][0])
                self.__updateAuditDate(dataContainer, priorMapD[tModelId][1])
                parentModelCountD[pId] = max(priorIdLD[pId])
                logger.debug("%s current model %r prior model %r count %d",
                             pId, tModelId, priorMapD[tModelId][0],
                             parentModelCountD[pId])
            else:
                parentModelCountD[pId] += 1
                pModelId = self.__makePublicModelId(pId,
                                                    parentModelCountD[pId])
                self.__replaceModelId(dataContainer, tModelId, pModelId)

        ok = mU.doExport(assembleModelPath, dataContainerL, fmt="mmcif")
        logger.info("Assembled %d models status %r", len(dataContainerL), ok)
        self.__checkAssembledModels(assembleModelPath)
        return len(dataContainerL)
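
# A small sketch (illustrative records, not real data) of the prioritization used in
# assemble(): candidate models sort by prior model id, variant type, then R-factor,
# so previously assigned ids and standard ("A") variants with low R-factors come first.
def _demoModelPrioritySort():
    from operator import itemgetter

    mDL = [
        {"modelId": "M_2", "priorModelId": "Z_0", "variantType": "B", "rFactor": 4.0},
        {"modelId": "M_1", "priorModelId": "M_000001", "variantType": "A", "rFactor": 6.5},
    ]
    # M_1 sorts first: its prior id ("M_...") precedes the unassigned "Z_..." placeholder.
    return sorted(mDL, key=itemgetter("priorModelId", "variantType", "rFactor"))
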
Example #4
class ChemCompSearchIndexProvider(object):
    """Utilities to read and process the index of chemical component definitions search targets"""

    def __init__(self, **kwargs):
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__dirPath = os.path.join(self.__cachePath, "chem_comp")
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc")
        self.__searchIdx = self.__reload(**kwargs)

    def testCache(self, minCount=None, logSizes=False):
        if logSizes and self.__searchIdx:
            logger.info("searchIdxD (%.2f MB)", getObjSize(self.__searchIdx) / 1000000.0)
        if minCount:
            return bool(self.__searchIdx) and len(self.__searchIdx) >= minCount
        return self.__searchIdx is not None

    def getIndex(self):
        return self.__searchIdx

    def getIndexEntry(self, searchCcId):
        try:
            return self.__searchIdx[searchCcId]
        except Exception as e:
            logger.debug("Get index entry %r failing with %s", searchCcId, str(e))
        return None

    def getIndexFilePath(self):
        return os.path.join(self.__dirPath, "%s-search-idx-chemical-components.json" % self.__ccFileNamePrefix)

    def __reload(self, **kwargs):
        """Reload or created index of PDB chemical components.

        Args:
            cachePath (str): path to the directory containing cache files
            ccIdxFileName (str): serialized chemical component data index file name

         Returns:
            (list): chemical component data containers
        """
        #
        searchIdxD = {}
        useChemAxon = kwargs.get("useChemAxon", True)
        useCache = kwargs.get("useCache", True)
        molLimit = kwargs.get("molLimit", 0)
        numProc = kwargs.get("numProc", 1)
        maxChunkSize = kwargs.get("maxChunkSize", 20)
        limitPerceptions = kwargs.get("limitPerceptions", True)
        quietFlag = kwargs.get("quietFlag", True)
        skipObsolete = kwargs.get("skipObsolete", True)
        searchIdxFilePath = self.getIndexFilePath()
        #
        if useCache and self.__mU.exists(searchIdxFilePath):
            _, fExt = os.path.splitext(searchIdxFilePath)
            searchIdxFormat = "json" if fExt == ".json" else "pickle"
            rdCcIdxD = self.__mU.doImport(searchIdxFilePath, fmt=searchIdxFormat)
            searchIdxD = {k: rdCcIdxD[k] for k in sorted(rdCcIdxD.keys())[:molLimit]} if molLimit else rdCcIdxD
        else:
            cmpKwargs = {k: v for k, v in kwargs.items() if k not in ["cachePath", "useCache", "molLimit"]}
            ccmP = ChemCompMoleculeProvider(cachePath=self.__cachePath, useCache=True, molLimit=molLimit, skipObsolete=skipObsolete, **cmpKwargs)
            ok1 = ccmP.testCache(minCount=molLimit, logSizes=True)
            #
            descrD = {}
            ok2 = True
            if useChemAxon:
                caxP = ChemAxonDescriptorProvider(cachePath=self.__cachePath, useCache=True, **cmpKwargs)
                ok2 = caxP.testCache(minCount=molLimit)
                descrD = caxP.getDescriptorIndex()
            #
            if ok1 and ok2:
                searchIdxD = self.__updateChemCompSearchIndex(ccmP.getMolD(), descrD, searchIdxFilePath, molLimit, limitPerceptions, numProc, maxChunkSize, quietFlag)
                logger.info("Storing %s with data for %d search candidates (status=%r)", searchIdxFilePath, len(searchIdxD), ok1 and ok2)
        # logger.info("Using Chemaxon descriptors for (%d) components", descrD)
        #
        for idxD in searchIdxD.values():
            idxD["atom-types"] = set(idxD["type-counts"].keys()) if "type-counts" in idxD else set()

        return searchIdxD

    def __updateChemCompSearchIndex(self, ccObjD, descrD, filePath, molLimit, limitPerceptions, numProc, maxChunkSize, quietFlag):
        searchIdxD = {}
        try:
            # Serialized index of chemical component search targets
            startTime = time.time()
            _, fExt = os.path.splitext(filePath)
            fileFormat = "json" if fExt == ".json" else "pickle"
            if numProc <= 1:
                searchIdxD = self.__buildChemCompSearchIndex(ccObjD, descrD, limitPerceptions=limitPerceptions, molLimit=molLimit)
            else:
                searchIdxD = self.__buildChemCompSearchIndexMulti(
                    ccObjD, descrD, limitPerceptions=limitPerceptions, molLimit=molLimit, numProc=numProc, maxChunkSize=maxChunkSize, quietFlag=quietFlag
                )

            ok = self.__mU.doExport(filePath, searchIdxD, fmt=fileFormat)
            endTime = time.time()
            logger.info("Storing %s (%s) with %d search definitions (status=%r) (%.4f seconds)", filePath, fileFormat, len(searchIdxD), ok, endTime - startTime)
        #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return searchIdxD

    def __buildChemCompSearchIndex(self, ccObjD, descrD, limitPerceptions=False, molLimit=None):
        """Internal method return a dictionary of extracted chemical component descriptors and formula."""
        rD = {}
        try:
            for ii, ccId in enumerate(ccObjD, 1):
                if molLimit and ii > molLimit:
                    break
                # ----
                oemf = OeMoleculeFactory()
                oemf.setQuiet()
                tId = oemf.setChemCompDef(ccObjD[ccId])
                if tId != ccId:
                    logger.error("%s chemical component definition import error", ccId)
                # ----
                oemf.clearExternalDescriptors()
                for smi in descrD.get(ccId, []):
                    oemf.addExternalDescriptor("smiles", smi, "chemaxon-smiles")
                # ----
                smiD = oemf.buildRelated(limitPerceptions=limitPerceptions)
                logger.debug("%s related molecular forms %d", ccId, len(smiD))
                rD.update(smiD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return rD

    def __buildChemCompSearchIndexMulti(self, ccObjD, descrD, limitPerceptions=False, molLimit=None, numProc=2, maxChunkSize=20, quietFlag=False):
        #
        ccIdList = sorted(ccObjD.keys())[:molLimit] if molLimit else sorted(ccObjD.keys())
        logger.info("Input definition length %d numProc %d limitPerceptions %r", len(ccIdList), numProc, limitPerceptions)
        #
        rWorker = ChemCompSearchIndexWorker(ccObjD)
        # mpu = MultiProcPoolUtil(verbose=True)
        mpu = MultiProcUtil(verbose=True)
        optD = {"maxChunkSize": maxChunkSize, "limitPerceptions": limitPerceptions, "quietFlag": quietFlag, "descrD": descrD}
        mpu.setOptions(optD)
        mpu.set(workerObj=rWorker, workerMethod="buildRelatedList")
        ok, failList, resultList, _ = mpu.runMulti(dataList=ccIdList, numProc=numProc, numResults=1, chunkSize=maxChunkSize)
        if failList:
            logger.info("Index definitions with failures (%d): %r", len(failList), failList)
        logger.info("Multi-proc status %r failures %r result length %r", ok, len(failList), len(resultList[0]))
        # JDW
        rD = {vD["name"]: vD for vD in resultList[0]}
        return rD

    def matchMolecularFormulaRange(self, typeRangeD, matchSubset=False):
        """Find matching formula for the input atom type range query (evaluates min <= ff <= max).

        Args:
            typeRangeD (dict): dictionary of element ranges {'<element_name>': {'min': <int>, 'max': <int>}}
            matchSubset (bool, optional): test for formula subset (default: False)

        Returns:
            (list):  chemical component identifiers with matching formula (MatchResults)
        """
        rL = []
        try:
            if not typeRangeD:
                return rL
            myTypeRangeD = {k.upper(): v for k, v in typeRangeD.items()}
            queryTypeS = set(myTypeRangeD.keys())
            for ccId, idxD in self.__searchIdx.items():
                tD = idxD["type-counts"]
                # targetTypeS = set(tD.keys())
                if not matchSubset and idxD["atom-types"] != queryTypeS:
                    continue
                #
                if not queryTypeS.issubset(idxD["atom-types"]):
                    continue
                match = True
                for atomType, rangeD in myTypeRangeD.items():
                    try:
                        if ("min" in rangeD and rangeD["min"] > tD[atomType]) or ("max" in rangeD and rangeD["max"] < tD[atomType]):
                            match = False
                            break
                    except Exception:
                        match = False
                        break
                if match:
                    # logger.info("%s formula %r query %r", ccId, idxD["type-counts"], typeRangeD)
                    rL.append(MatchResults(ccId=ccId, searchType="formula", formula=idxD["formula"]))
        except Exception as e:
            logger.exception("Failing for %r with %s", typeRangeD, str(e))
        return rL

    def filterMinimumMolecularFormula(self, typeCountD):
        """Find molecules with the minimum formula composition for the input atom type query (evaluates min <= ff).

        Args:
            typeCountD (dict): dictionary of element minimum values {'<element_name>': <int>}

        Returns:
            (list):  chemical component identifiers
        """
        rL = []
        try:
            if not typeCountD:
                return list(self.__searchIdx.keys())

            queryTypeS = set(typeCountD.keys())
            for ccId, idxD in self.__searchIdx.items():
                tD = idxD["type-counts"]
                if not queryTypeS.issubset(tD):
                    continue
                match = True
                for atomType, minCount in typeCountD.items():
                    try:
                        if minCount > tD[atomType]:
                            match = False
                            break
                    except Exception:
                        match = False
                        break
                if match:
                    rL.append(ccId)
        except Exception as e:
            logger.exception("Failing for %r with %s", typeCountD, str(e))
        return rL

    def filterMinimumFormulaAndFeatures(self, typeCountD, featureCountD):
        """Find molecules with the minimum formula and feature composition.

        Args:
            typeCountD (dict): dictionary of element minimum values {'<element_name>': <int>}
            featureCountD (dict): dictionary of feature minimum values {'<feature_name>': <int>}

        Returns:
            (list):  chemical component identifiers
        """
        rL = []
        try:
            if not typeCountD or not featureCountD:
                return list(self.__searchIdx.keys())
            # ----
            featureQueryS = set(featureCountD.keys())
            typeQueryS = set(typeCountD.keys())
            #
            for ccId, idxD in self.__searchIdx.items():
                tD = idxD["type-counts"]
                fD = idxD["feature-counts"]
                #
                if not typeQueryS.issubset(tD) or not featureQueryS.issubset(fD):
                    continue

                match = True
                for atomType, minCount in typeCountD.items():
                    try:
                        if minCount > tD[atomType]:
                            match = False
                            break
                    except Exception:
                        match = False
                        break

                if not match:
                    continue
                #
                for featureType, minCount in featureCountD.items():
                    try:
                        if minCount > fD[featureType]:
                            match = False
                            break
                    except Exception:
                        match = False
                        break
                #
                if match:
                    rL.append(ccId)
        except Exception as e:
            logger.exception("Failing for %r with %s", typeCountD, str(e))
        return rL
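
# A hedged usage sketch: query the search index for components whose formula falls in
# an element-count range. The cache path and ranges are illustrative; matchSubset=True
# admits formulas containing element types beyond those queried.
def _demoFormulaRangeQuery(cachePath="."):
    ccsiP = ChemCompSearchIndexProvider(cachePath=cachePath, useCache=True)
    if not ccsiP.testCache(minCount=1):
        return []
    typeRangeD = {"C": {"min": 9, "max": 12}, "N": {"min": 1, "max": 4}}
    return ccsiP.matchMolecularFormulaRange(typeRangeD, matchSubset=True)
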
Example #5
class ChemCompDepictWrapper(SingletonClass):
    """Wrapper for chemical component depiction operations."""
    def __init__(self):
        self.__startTime = time.time()
        # ---
        self.__workPath = "."
        self.__mU = MarshalUtil(workPath=self.__workPath)
        self.__configD = None
        self.__cachePath = None
        # ---
        self.__statusDescriptorError = -100
        self.__searchError = -200
        self.__searchSuccess = 0
        self.__imageCount = 0

    def readConfig(self, resetImagePath=True):
        #
        ok = False
        try:
            self.__cachePath = os.environ.get("CHEM_DEPICT_CACHE_PATH", ".")
            configFileName = os.environ.get("CHEM_DEPICT_CONFIG_FILE_NAME",
                                            "depict-config.json")
            #
            configFilePath = os.path.join(self.__cachePath, "config",
                                          configFileName)
            configD = {}
            if self.__mU.exists(configFilePath):
                configD = self.__mU.doImport(configFilePath, fmt="json")
            logger.debug("configD: %r", configD)
            if configD and (len(configD) >= 2) and float(configD["versionNumber"]) > 0.1:
                logger.info("Read version %r sections %r from %s",
                            configD["versionNumber"], list(configD.keys()),
                            configFilePath)
                ok = True
                #
                if resetImagePath:
                    # Allow the configuration to be relocatable.
                    tS = configD["imageDir"] if "imageDir" in configD else "images"
                    configD["imageDirPath"] = os.path.join(self.__cachePath, tS)
                    configD["versionNumber"] = "0.2"
            else:
                # Handle missing config for now
                configD["imageDir"] = "images"
                configD["imageDirPath"] = os.path.join(self.__cachePath,
                                                       configD["imageDir"])
                logger.warning("Reading config file fails from path %r",
                               configFilePath)
                logger.warning("Using config %r", configD)
                ok = True
            #
            self.__configD = configD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            ok = False
        return ok

    def setConfig(self, cachePath, **kwargs):
        """Provide dependencies for rebuilding depict file dependencies.

        Args:
            cachePath (str): path to cache data files.

            Other options are propagated to configurations of the wrapped classes in __bootstrapDepictConfig()

        """
        self.__configD = self.__makeBootstrapDepictConfig(cachePath, **kwargs)
        return len(self.__configD) >= 2

    def __makeBootstrapDepictConfig(self, cachePath, **kwargs):
        """Create depict configuration bootstrap file"""
        configD = {}
        try:
            storeConfig = kwargs.get("storeConfig", True)
            os.environ["CHEM_DEPICT_CACHE_PATH"] = os.path.join(cachePath)
            configDirPath = os.path.join(cachePath, "config")
            configFilePath = os.path.join(configDirPath, "depict-config.json")
            #
            logger.info("Updating depict configuration using %s",
                        configFilePath)
            #
            imageDirPath = os.path.join(cachePath, "images")
            self.__mU.mkdir(imageDirPath)
            configD = {"versionNumber": 0.20, "imageDir": "images"}
            if storeConfig:
                self.__mU.mkdir(configDirPath)
                self.__mU.doExport(configFilePath,
                                   configD,
                                   fmt="json",
                                   indent=3)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return configD
        #

    def setImageCount(self, imageCount):
        self.__imageCount = imageCount

    def getImageCount(self):
        return self.__imageCount

    def __makeImagePath(self):
        imageDirPath = self.__configD["imageDirPath"] if self.__configD and "imageDirPath" in self.__configD else "."
        fileRotateIncrement = self.__configD["fileRotateIncrement"] if self.__configD and "fileRotateIncrement" in self.__configD else 50
        ic = self.__imageCount % fileRotateIncrement
        imagePath = os.path.join(imageDirPath, "image-%s.svg" % ic)
        return imagePath

    def depictMolecule(self,
                       identifier,
                       identifierType,
                       imagePath=None,
                       **kwargs):
        """Create depiction from InChI, SMILES descriptors or PDB identifier."""
        try:
            imagePath = imagePath if imagePath else self.__makeImagePath()
            oeio = OeIoUtils()
            if identifierType.lower() in ["smiles"]:
                oeMol = oeio.smilesToMol(identifier)
            elif identifierType.lower() in ["inchi"]:
                oeMol = oeio.inchiToMol(identifier)
            elif identifierType.lower() in ["identifierpdb"]:
                ccsw = ChemCompSearchWrapper()
                oesmP = ccsw.getSearchMoleculeProvider()
                oeMol = oesmP.getMol(identifier)
            #
            ok = self.__depictOne(oeMol, imagePath, **kwargs)
            return imagePath if ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def __depictOne(self, oeMol, imagePath, **kwargs):
        """Single

        Args:
            oeMol (object): instance of an OE graph molecule
            imagePath (string): file path for image

        Returns:
            bool: True for success or False otherwise
        """
        try:
            title = kwargs.get("title", None)
            oed = OeDepict()
            oed.setMolTitleList([("Target", oeMol, title)])

            # ---
            bondDisplayWidth = 10.0
            numAtoms = oeMol.NumAtoms()
            if numAtoms > 100 and numAtoms <= 200:
                bondDisplayWidth = 6.0
            elif numAtoms > 200:
                bondDisplayWidth = 4.0
            # ---
            oed.setDisplayOptions(
                imageSizeX=kwargs.get("imageSizeX", 2500),
                imageSizeY=kwargs.get("imageSizeX", 2500),
                labelAtomName=kwargs.get("labelAtomName", False),
                labelAtomCIPStereo=kwargs.get("labelAtomCIPStereo", True),
                labelAtomIndex=kwargs.get("labelAtomIndex", False),
                labelBondIndex=kwargs.get("labelBondIndex", False),
                labelBondCIPStereo=kwargs.get("labelBondCIPStereo", True),
                cellBorders=kwargs.get("cellBorders", True),
                bondDisplayWidth=bondDisplayWidth,
            )
            oed.setGridOptions(rows=1, cols=1, cellBorders=False)
            oed.prepare()
            oed.write(imagePath)
            self.__imageCount += 1
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def status(self):
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 10**6,
                    unitS)
        endTime = time.time()
        logger.info("Status at %s (up %.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def alignMoleculePair(self,
                          refIdentifier,
                          refIdentifierType,
                          fitIdentifier,
                          fitIdentifierType,
                          imagePath=None,
                          **kwargs):
        """Create aligned depiction for a target molecule InChI, SMILES descriptors or PDB identifier."""
        try:
            imagePath = imagePath if imagePath else self.__makeImagePath()
            oeio = OeIoUtils()
            ccsw = ChemCompSearchWrapper()
            oesmP = ccsw.getSearchMoleculeProvider()
            # ---
            if refIdentifierType.lower() in ["smiles"]:
                oeMolRef = oeio.smilesToMol(refIdentifier)
            elif refIdentifierType.lower() in ["inchi"]:
                oeMolRef = oeio.inchiToMol(refIdentifier)
            elif refIdentifierType.lower() in ["identifierpdb"]:
                oeMolRef = oesmP.getMol(refIdentifier)
            #
            if fitIdentifierType.lower() in ["smiles"]:
                oeMolFit = oeio.smilesToMol(fitIdentifier)
            elif fitIdentifierType.lower() in ["inchi"]:
                oeMolFit = oeio.inchiToMol(fitIdentifier)
            elif fitIdentifierType.lower() in ["identifierpdb"]:
                oeMolFit = oesmP.getMol(fitIdentifier)
            # ---
            logger.info("oeMolRef atoms %r", oeMolRef.NumAtoms())
            logger.info("oeMolFit atoms %r", oeMolFit.NumAtoms())

            displayIdRef = "Ref"
            displayIdFit = "Fit"
            ok = self.__depictAlignedPair(oeMolRef, displayIdRef, oeMolFit,
                                          displayIdFit, imagePath, **kwargs)
            return imagePath if ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def __depictAlignedPair(self, oeMolRef, displayIdRef, oeMolFit,
                            displayIdFit, imagePath, **kwargs):
        """Depict pairwise MCSS alignment"""
        try:
            #
            oed = OeDepictMCSAlignPage()
            oed.setSearchType(sType="relaxed")
            #
            oed.setRefMol(oeMolRef, displayIdRef)
            oed.setFitMol(oeMolFit, displayIdFit)
            #
            # imagePath = self.__makeImagePath()
            # ---
            bondDisplayWidth = 10.0
            numAtomsRef = oeMolRef.NumAtoms()
            if numAtomsRef > 100 and numAtomsRef <= 200:
                bondDisplayWidth = 6.0
            elif numAtomsRef > 200:
                bondDisplayWidth = 4.0
            # ---
            oed.setDisplayOptions(
                imageSizeX=kwargs.get("imageSizeX", 2500),
                imageSizeY=kwargs.get("imageSizeX", 2500),
                labelAtomName=kwargs.get("labelAtomName", False),
                labelAtomCIPStereo=kwargs.get("labelAtomCIPStereo", True),
                labelAtomIndex=kwargs.get("labelAtomIndex", False),
                labelBondIndex=kwargs.get("labelBondIndex", False),
                labelBondCIPStereo=kwargs.get("labelBondCIPStereo", True),
                cellBorders=kwargs.get("cellBorders", True),
                bondDisplayWidth=bondDisplayWidth,
                highlightStyleFit=kwargs.get("highlightStyleFit",
                                             "ballAndStickInverse"),
            )
            #
            aML = oed.alignPair(imagePath=imagePath)
            logger.info("Aligned atom count %d", len(aML))
            #
            # self.assertGreater(len(aML), 1)
            # if aML:
            #    for (rCC, rAt, tCC, tAt) in aML:
            #        logger.debug("%5s %-5s %5s %-5s", rCC, rAt, tCC, tAt)
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def toMolFile(self,
                  identifier,
                  identifierType,
                  molfilePath=None,
                  fmt="mol",
                  **kwargs):
        """Create molfile (fmt) from InChI, SMILES descriptors or PDB identifier."""
        try:
            molfilePath = molfilePath if molfilePath else self.__makeMolfilePath(fmt=fmt)
            oeio = OeIoUtils()
            if identifierType.lower() in ["smiles"]:
                oeMol = oeio.smilesToMol(identifier)
                oeMol.SetTitle("From SMILES")
            elif identifierType.lower() in ["inchi"]:
                oeMol = oeio.inchiToMol(identifier)
                oeMol.SetTitle("From InChI")
            elif identifierType.lower() in ["identifierpdb"]:
                ccsw = ChemCompSearchWrapper()
                oesmP = ccsw.getSearchMoleculeProvider()
                oeMol = oesmP.getMol(identifier)
            #
            ok = self.__toMolFile(oeMol, molfilePath, **kwargs)
            return molfilePath if ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def __toMolFile(self, oeMol, molfilePath, **kwargs):
        """Write the

        Args:
            oeMol (object): instance of an OE graph molecule
            molfilePath (string): file path for molfile (type determined by extension)

        Returns:
            bool: True for success or False otherwise
        """
        try:
            _ = kwargs
            oeio = OeIoUtils()
            oeio.write(molfilePath, oeMol, constantMol=True)
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def __makeMolfilePath(self, fmt="mol"):
        imageDirPath = self.__configD[
            "imageDirPath"] if self.__configD and "imageDirPath" in self.__configD else "."
        fileRotateIncrement = self.__configD[
            "fileRotateIncrement"] if self.__configD and "fileRotateIncrement" in self.__configD else 50
        ic = self.__imageCount % fileRotateIncrement
        molPath = os.path.join(imageDirPath, "molfile-%s.%s" % (ic, fmt))
        return molPath
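
# A hedged usage sketch: configure the depiction wrapper from its cache/config
# environment and render a SMILES string. The SMILES string and output path are
# illustrative placeholders.
def _demoDepiction():
    ccdw = ChemCompDepictWrapper()
    ccdw.readConfig()
    return ccdw.depictMolecule("c1ccccc1O", "smiles", imagePath="./phenol.svg", labelAtomName=True)
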
Example #6
class MarshalUtilTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        self.__pathPdbxDictionaryFile = os.path.join(TOPDIR, "rcsb",
                                                     "mock-data",
                                                     "dictionaries",
                                                     "mmcif_pdbx_v5_next.dic")
        self.__pathJsonTestFile = os.path.join(TOPDIR, "rcsb", "mock-data",
                                               "dictionaries",
                                               "vrpt_dictmap.json")
        self.__pathIndexFile = os.path.join(TOPDIR, "rcsb", "mock-data",
                                            "MOCK_EXCHANGE_SANDBOX",
                                            "update-lists", "all-pdb-list")
        self.__pathCifFile = os.path.join(TOPDIR, "rcsb", "mock-data",
                                          "MOCK_BIRD_CC_REPO", "0",
                                          "PRDCC_000010.cif")
        #
        self.__workPath = os.path.join(HERE, "test-output")
        self.__pathSaveDictionaryFile = os.path.join(self.__workPath,
                                                     "mmcif_pdbx_v5_next.dic")
        self.__pathSaveJsonTestFile = os.path.join(self.__workPath,
                                                   "json-content.json")
        self.__pathSaveIndexFile = os.path.join(self.__workPath,
                                                "all-pdb-list")
        self.__pathSaveCifFile = os.path.join(self.__workPath,
                                              "cif-content.cif")
        #
        self.__pathFastaFile = os.path.join(TOPDIR, "rcsb", "mock-data",
                                            "MOCK_EXCHANGE_SANDBOX",
                                            "sequence",
                                            "pdb_seq_prerelease.fasta")
        self.__pathSaveFastaFile = os.path.join(self.__workPath,
                                                "test-pre-release.fasta")
        #

        self.__urlTarget = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz"
        self.__urlTargetBad = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump-missing.tar.gz"
        #
        self.__mU = MarshalUtil()
        self.__startTime = time.time()
        logger.debug("Running tests on version %s", __version__)
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testReadWriteInParts(self):
        """Test the case reading and writing in parts."""
        try:
            lenL = 12013
            aL = [100, 200, 300, 400, 500]
            dL = [aL for ii in range(lenL)]
            numParts = 4
            sPath = os.path.join(self.__workPath, "list-m-data.json")
            ok = self.__mU.doExport(sPath,
                                    dL,
                                    numParts=numParts,
                                    fmt="json",
                                    indent=3)
            #
            self.assertTrue(ok)
            rL = self.__mU.doImport(sPath, numParts=numParts, fmt="json")
            logger.info("Reading %d parts with total length %d", numParts,
                        len(rL))
            self.assertEqual(dL, rL)
            #
            lenD = 23411
            qD = OrderedDict([("a", 100), ("b", 100), ("c", 100)])
            dD = OrderedDict([(str(ii), qD) for ii in range(lenD)])
            numParts = 4
            sPath = os.path.join(self.__workPath, "dict-m-data.json")
            ok = self.__mU.doExport(sPath,
                                    dD,
                                    numParts=numParts,
                                    fmt="json",
                                    indent=3)
            self.assertTrue(ok)
            rD = self.__mU.doImport(sPath, numParts=numParts, fmt="json")
            logger.info("Reading %d parts with total length %d", numParts,
                        len(rD))
            self.assertEqual(dD, rD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadDictionaryFile(self):
        """Test the case read PDBx/mmCIF dictionary text file"""
        try:
            cL = self.__mU.doImport(self.__pathPdbxDictionaryFile,
                                    fmt="mmcif-dict")
            logger.debug("Dictionary container list %d", len(cL))
            self.assertGreaterEqual(len(cL), 1)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadCifFile(self):
        """Test the case read PDBx/mmCIF text file"""
        try:
            cL = self.__mU.doImport(self.__pathCifFile, fmt="mmcif")
            logger.debug("Container list %d", len(cL))
            self.assertGreaterEqual(len(cL), 1)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadListFile(self):
        """Test the case read list text file"""
        try:
            cL = self.__mU.doImport(self.__pathIndexFile, fmt="list")
            logger.debug("List length %d", len(cL))
            self.assertGreaterEqual(len(cL), 1000)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadJsonFile(self):
        """Test the case read JSON file"""
        try:
            rObj = self.__mU.doImport(self.__pathJsonTestFile, fmt="json")
            logger.debug("Object length %d", len(rObj))
            self.assertGreaterEqual(len(rObj), 1)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadWriteDictionaryFiles(self):
        """Test the case read and write PDBx/mmCIF dictionary text file"""
        try:
            cL = self.__mU.doImport(self.__pathPdbxDictionaryFile,
                                    fmt="mmcif-dict")
            logger.debug("Dictionary container list %d", len(cL))
            self.assertGreaterEqual(len(cL), 1)
            ok = self.__mU.doExport(self.__pathSaveDictionaryFile,
                                    cL,
                                    fmt="mmcif-dict")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadWriteCifFile(self):
        """Test the case read and write PDBx/mmCIF text file"""
        try:
            cL = self.__mU.doImport(self.__pathCifFile, fmt="mmcif")
            logger.debug("Container list %d", len(cL))
            self.assertGreaterEqual(len(cL), 1)
            ok = self.__mU.doExport(self.__pathSaveCifFile, cL, fmt="mmcif")
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadWriteJsonFile(self):
        """Test the case read and write JSON file"""
        try:
            rObj = self.__mU.doImport(self.__pathJsonTestFile, fmt="json")
            logger.debug("Object length %d", len(rObj))
            self.assertGreaterEqual(len(rObj), 1)
            ok = self.__mU.doExport(self.__pathSaveJsonTestFile,
                                    rObj,
                                    fmt="json")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadWriteListFile(self):
        """Test the case read and write list text file"""
        try:
            cL = self.__mU.doImport(self.__pathIndexFile, fmt="list")
            logger.debug("List element %r length %d", cL[0], len(cL))
            count = 0
            for cV in cL:
                fields = cV.split()
                count += len(fields)
            _ = count
            self.assertGreaterEqual(len(cL), 1000)
            ok = self.__mU.doExport(self.__pathSaveIndexFile, cL, fmt="list")
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadWriteFastaFile(self):
        """Test the case read and write FASTA sequence file"""
        try:
            sD = self.__mU.doImport(self.__pathFastaFile,
                                    fmt="fasta",
                                    commentStyle="prerelease")
            logger.debug("Sequence length %d", len(sD))
            self.assertGreaterEqual(len(sD), 500)
            ok = self.__mU.doExport(self.__pathSaveFastaFile, sD, fmt="fasta")
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadUrlTarfile(self):
        """Test the case to read URL target and extract a member"""
        try:
            mU = MarshalUtil(workPath=self.__workPath)
            _, fn = os.path.split(self.__urlTarget)
            #
            nmL = mU.doImport(self.__urlTarget,
                              fmt="tdd",
                              rowFormat="list",
                              tarMember="names.dmp")
            self.assertGreater(len(nmL), 2000000)
            logger.info("Names %d", len(nmL))
            ndL = mU.doImport(os.path.join(self.__workPath, fn),
                              fmt="tdd",
                              rowFormat="list",
                              tarMember="nodes.dmp")
            self.assertGreater(len(ndL), 2000000)
            logger.info("Nodes %d", len(ndL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadUrlTddfile(self):
        """Test the case to read URL target of a tdd"""
        try:
            mU = MarshalUtil(workPath=self.__workPath)
            version = "2.07-2019-07-23"
            urlTarget = "http://scop.berkeley.edu/downloads/update"
            encoding = "utf-8-sig" if sys.version_info[0] > 2 else "ascii"
            fn = "dir.des.scope.%s.txt" % version
            url = os.path.join(urlTarget, fn)
            logger.info("Fetch url %r", url)
            desL = mU.doImport(url,
                               fmt="tdd",
                               rowFormat="list",
                               uncomment=True,
                               encoding=encoding)
            logger.info("Fetched URL is %s len %d", url, len(desL))
            self.assertGreater(len(desL), 100)
            logger.info("Lines %d", len(desL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadUrlTarfileFail(self):
        """Test the case to read URL target and extract a member (failing case)"""
        try:
            mU = MarshalUtil(workPath=self.__workPath)
            rL = mU.doImport(self.__urlTargetBad,
                             fmt="tdd",
                             rowFormat="list",
                             tarMember="names.dmp")
            logger.info("Return is %r", rL)
            self.assertEqual(len(rL), 0)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
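
# A minimal MarshalUtil round-trip sketch distilled from the tests above
# (paths are illustrative; assumes the rcsb.utils.io MarshalUtil API exercised here):
#
#   mU = MarshalUtil(workPath="./work")
#   rObj = mU.doImport("./data/example.json", fmt="json")
#   ok = mU.doExport("./work/example-out.json", rObj, fmt="json")
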
class PharosTargetProvider(StashableBase):
    """Accessors for Pharos target assignments."""

    def __init__(self, **kwargs):
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__dirName = "Pharos-targets"
        super(PharosTargetProvider, self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        reloadDb = kwargs.get("reloadDb", False)
        fromDb = kwargs.get("fromDb", False)
        useCache = kwargs.get("useCache", False)
        pharosDumpUrl = kwargs.get("pharosDumpUrl", None)
        mysqlUser = kwargs.get("mysqlUser", None)
        mysqlPassword = kwargs.get("mysqlPassword", None)
        self.__version = None
        if reloadDb or fromDb:
            self.__reload(self.__dirPath, reloadDb=reloadDb, fromDb=fromDb, useCache=useCache, pharosDumpUrl=pharosDumpUrl, mysqlUser=mysqlUser, mysqlPassword=mysqlPassword)
        #

    def testCache(self):
        return True

    def getVersion(self):
        return self.__version

    def __reload(self, dirPath, reloadDb=False, fromDb=False, useCache=False, pharosDumpUrl=None, mysqlUser=None, mysqlPassword=None):
        startTime = time.time()
        pharosSelectedTables = ["drug_activity", "cmpd_activity", "target", "protein", "t2tc"]
        pharosDumpUrl = pharosDumpUrl if pharosDumpUrl else "http://juniper.health.unm.edu/tcrd/download/latest.sql.gz"
        pharosReadmeUrl = "http://juniper.health.unm.edu/tcrd/download/latest.README"
        ok = False
        fU = FileUtil()
        pharosDumpFileName = fU.getFileName(pharosDumpUrl)
        pharosDumpPath = os.path.join(dirPath, pharosDumpFileName)
        pharosUpdatePath = os.path.join(dirPath, "pharos-update.sql")
        pharosReadmePath = os.path.join(dirPath, "pharos-readme.txt")
        logPath = os.path.join(dirPath, "pharosLoad.log")
        #
        fU.mkdir(dirPath)
        #

        exU = ExecUtils()
        #
        if reloadDb:
            logger.info("useCache %r pharosDumpPath %r", useCache, pharosDumpPath)
            if useCache and self.__mU.exists(pharosDumpPath):
                ok = True
            else:
                logger.info("Fetching url %s path %s", pharosDumpUrl, pharosDumpPath)
                ok1 = fU.get(pharosDumpUrl, pharosDumpPath)
                ok2 = fU.get(pharosReadmeUrl, pharosReadmePath)
                logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok1 and ok2, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
            # ---
            readmeLines = self.__mU.doImport(pharosReadmePath, fmt="list")
            self.__version = readmeLines[0].split(" ")[1][1:] if readmeLines else "6"
            # ---
            logger.info("Filtering SQL dump %r for selected tables %r", pharosDumpFileName, pharosSelectedTables)
            doWrite = True
            # Note: the pharos dump file latest.sql.gz is not gzipped
            with open(pharosDumpPath, "r", encoding="utf-8") as ifh, open(pharosUpdatePath, "w", encoding="utf-8") as ofh:
                for line in ifh:
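                    # Table sections in the dump open with lines of the form:
                    #   -- Table structure for table `target`
                    # so the last whitespace-delimited token carries the table
                    # name; the slice below strips the backquotes and newline.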
                    if line.startswith("-- Table structure for table"):
                        tN = line.split(" ")[-1][1:-2]
                        doWrite = True if tN in pharosSelectedTables else False
                    if doWrite:
                        ofh.write(line)
            # ---
            ok = exU.run(
                "mysql",
                execArgList=["-v", "-u", mysqlUser, "--password=%s" % mysqlPassword, "-e", "create database if not exists tcrd6;"],
                outPath=logPath,
                outAppend=False,
                timeOut=None,
            )
            # ok = exU.run(
            #     "mysql",
            #     execArgList=["-u", mysqlUser, "--password=%s" % mysqlPassword, "tcrd6"],
            #     outPath=logPath,
            #     inpPath=pharosDumpPath,
            #     outAppend=True,
            #     timeOut=None,
            # )
            shellCmd = 'trap "" SIGHUP SIGINT SIGTERM; nohup mysql -u %s --password=%s tcrd6 < %s >& %s' % (mysqlUser, mysqlPassword, pharosUpdatePath, logPath)
            ok = exU.runShell(
                shellCmd,
                outPath=None,
                inpPath=None,
                outAppend=True,
                timeOut=None,
            )
            logger.info("SQL dump restore status %r", ok)
        # --
        if fromDb:
            for tbl in pharosSelectedTables:
                outPath = os.path.join(dirPath, "%s.tdd" % tbl)
                # if useCache and self.__mU.exists(outPath):
                #   continue
                ok = exU.run(
                    "mysql",
                    execArgList=["-u", mysqlUser, "--password=%s" % mysqlPassword, "-e", "use tcrd6; select * from %s;" % tbl],
                    outPath=outPath,
                    outAppend=False,
                    timeOut=None,
                    suppressStderr=True,
                )
                logger.info("SQL table %s export status %r", tbl, ok)
        return ok

    def exportProteinFasta(self, fastaPath, taxonPath, addTaxonomy=False):
        """Export Pharos protein sequences to FASTA (optionally with NCBI taxonomy mapping)."""
        ok = False
        try:
            proteinFilePath = os.path.join(self.__dirPath, "protein.tdd")
            pDL = self.__mU.doImport(proteinFilePath, fmt="tdd", rowFormat="dict")
            fD = {}
            taxonL = []
            if addTaxonomy:
                umP = UniProtIdMappingProvider(self.__cachePath)
                umP.reload(useCache=True)
                #
                for pD in pDL:
                    unpId = pD["uniprot"]
                    proteinId = pD["id"]
                    seq = pD["seq"]
                    taxId = umP.getMappedId(unpId, mapName="NCBI-taxon")
                    taxId = taxId if taxId else "-1"
                    cD = {"sequence": seq, "uniprotId": unpId, "proteinId": proteinId, "taxId": taxId}
                    seqId = ""
                    cL = []
                    for k, v in cD.items():
                        if k in ["sequence"]:
                            continue
                        cL.append(str(v))
                        cL.append(str(k))
                    seqId = "|".join(cL)
                    fD[seqId] = cD
                    taxonL.append("%s\t%s" % (seqId, taxId))
                ok = self.__mU.doExport(taxonPath, taxonL, fmt="list")
            else:
                for pD in pDL:
                    unpId = pD["uniprot"]
                    proteinId = pD["id"]
                    seq = pD["seq"]
                    cD = {"sequence": seq, "uniprotId": unpId, "proteinId": proteinId}
                    seqId = ""
                    cL = []
                    for k, v in cD.items():
                        if k in ["sequence"]:
                            continue
                        cL.append(str(v))
                        cL.append(str(k))
                    seqId = "|".join(cL)
                    fD[seqId] = cD
            #
            logger.info("Writing %d pharos targets to %s", len(fD), fastaPath)
            ok = self.__mU.doExport(fastaPath, fD, fmt="fasta", makeComment=True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok
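
# A hedged end-to-end sketch (credentials and paths are illustrative; the
# reloadDb/fromDb steps require a local MySQL server and substantial disk space):
#
#   ptP = PharosTargetProvider(cachePath="./CACHE", reloadDb=True, fromDb=True,
#                              mysqlUser="user", mysqlPassword="pass")
#   ok = ptP.exportProteinFasta("./pharos-targets.fa", "./pharos-taxon.tdd",
#                               addTaxonomy=False)
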
class RepositoryProvider(object):
    def __init__(self,
                 cfgOb,
                 cachePath=None,
                 numProc=8,
                 fileLimit=None,
                 verbose=False):
        self.__fileLimit = fileLimit
        self.__numProc = numProc
        self.__verbose = verbose
        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__topCachePath = cachePath if cachePath else "."
        self.__cachePath = os.path.join(
            self.__topCachePath,
            self.__cfgOb.get("REPO_UTIL_CACHE_DIR",
                             sectionName=self.__configName))
        #
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        #
        self.__ccPathD = None
        #
        self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s"

    def getLocatorObjList(self,
                          contentType,
                          inputPathList=None,
                          mergeContentTypes=None,
                          excludeIds=None):
        """Convenience method to get the data path list for the input repository content type.

        Args:
            contentType (str): Repository content type (e.g. pdbx, chem_comp, bird, ...)
            inputPathList (list, optional): path list that will be returned if provided.
            mergeContentTypes (list, optional): repository content types to be combined with the
                                primary content type.
            excludeIds (list or dict): exclude any locators for idCodes in this list or dictionary

        Returns:
            Obj list: data file paths or tuple of file paths

        """
        inputPathList = inputPathList if inputPathList else []
        if inputPathList:
            return self.getLocatorObjListWithInput(
                contentType,
                inputPathList=inputPathList,
                mergeContentTypes=mergeContentTypes)
        #
        if mergeContentTypes and "vrpt" in mergeContentTypes and contentType in [
                "pdbx", "pdbx_core"
        ]:
            dictPath = os.path.join(
                self.__topCachePath,
                self.__cfgOb.get(
                    "DICTIONARY_CACHE_DIR",
                    sectionName=self.__cfgOb.getDefaultSectionName()))
            os.environ["_RP_DICT_PATH_"] = dictPath
            locatorList = self.getEntryLocatorObjList(
                mergeContentTypes=mergeContentTypes)
        else:
            locatorList = self.__getLocatorList(contentType,
                                                inputPathList=inputPathList)
        #
        if excludeIds:
            fL = []
            for locator in locatorList:
                if isinstance(locator, str):
                    pth = locator
                else:
                    pth = locator[0]["locator"]
                #
                idCode = self.__getIdcodeFromLocatorPath(contentType, pth)
                if idCode in excludeIds:
                    continue
                fL.append(locator)
            locatorList = fL

        return locatorList
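
    # A hedged usage sketch (configuration object and identifiers are
    # illustrative; cfgOb is the ConfigUtil instance passed to __init__):
    #
    #   rP = RepositoryProvider(cfgOb, cachePath="./CACHE", numProc=4)
    #   locL = rP.getLocatorObjList("pdbx_core", mergeContentTypes=["vrpt"],
    #                               excludeIds=["4HHB"])
    #   pathL = rP.getLocatorPaths(locL)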

    def getLocatorObjListWithInput(self,
                                   contentType,
                                   inputPathList=None,
                                   mergeContentTypes=None):
        """Convenience method to get the data path list for the input repository content type.

        Args:
            contentType (str): Repository content type (e.g. pdbx, chem_comp, bird, ...)
            inputPathList (list, optional): path list that will be returned if provided.
            mergeContentTypes (list, optional): repository content types to be combined with the
                                primary content type.

        Returns:
            Obj list: data file paths or tuple of file paths

        """
        inputPathList = inputPathList if inputPathList else []
        locatorList = self.__getLocatorList(contentType,
                                            inputPathList=inputPathList)
        # JDW move the following to config
        if mergeContentTypes and "vrpt" in mergeContentTypes and contentType in [
                "pdbx", "pdbx_core"
        ]:
            dictPath = os.path.join(
                self.__topCachePath,
                self.__cfgOb.get(
                    "DICTIONARY_CACHE_DIR",
                    sectionName=self.__cfgOb.getDefaultSectionName()))
            os.environ["_RP_DICT_PATH_"] = dictPath
            #
            locObjL = []
            for locator in locatorList:
                if isinstance(locator, str):
                    kwD = HashableDict({})
                    oL = [
                        HashableDict({
                            "locator": locator,
                            "fmt": "mmcif",
                            "kwargs": kwD
                        })
                    ]
                    for mergeContentType in mergeContentTypes:
                        _, fn = os.path.split(locator)
                        idCode = fn[:4] if fn and len(fn) >= 8 else None
                        mergeLocator = self.__getLocator(
                            mergeContentType, idCode,
                            checkExists=True) if idCode else None
                        if mergeLocator:
                            # kwD = HashableDict({"marshalHelper": vrd.toCif})
                            kwD = HashableDict({"marshalHelper": toCifWrapper})
                            oL.append(
                                HashableDict({
                                    "locator": mergeLocator,
                                    "fmt": "xml",
                                    "kwargs": kwD
                                }))
                    lObj = tuple(oL)
                else:
                    logger.error("Unexpected output locator type %r", locator)
                    lObj = locator
                locObjL.append(lObj)
            #
            locatorList = locObjL
        # -
        return locatorList

    def getContainerList(self, locatorObjList):
        """Return the data container list obtained by parsing the input locator object list."""
        cL = []
        for locatorObj in locatorObjList:
            myContainerList = self.__mergeContainers(locatorObj,
                                                     fmt="mmcif",
                                                     mergeTarget=0)
            for cA in myContainerList:
                cL.append(cA)
        return cL

    def __mergeContainers(self, locatorObj, fmt="mmcif", mergeTarget=0):
        """Consolidate content in auxiliary files locatorObj[1:] into
        locatorObj[0] container index 'mergeTarget'.

        """
        #
        cL = []
        try:
            if isinstance(locatorObj, str):
                cL = self.__mU.doImport(locatorObj, fmt=fmt)
                return cL if cL else []
            elif isinstance(locatorObj, (list, tuple)) and locatorObj:
                dD = locatorObj[0]
                kw = dD["kwargs"]
                cL = self.__mU.doImport(dD["locator"], fmt=dD["fmt"], **kw)
                if cL:
                    for dD in locatorObj[1:]:
                        kw = dD["kwargs"]
                        rObj = self.__mU.doImport(dD["locator"],
                                                  fmt=dD["fmt"],
                                                  **kw)
                        mergeL = rObj if rObj else []
                        for mc in mergeL:
                            cL[mergeTarget].merge(mc)
                #
                return cL
            else:
                return []
        except Exception as e:
            logger.exception("Failing for %r with %s", locatorObj, str(e))

        return cL
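
    # For reference, a non-string locator object handled above is a tuple of
    # HashableDict entries: the first names the primary mmCIF file, and the
    # remainder name auxiliary content merged into it (paths are illustrative):
    #
    #   (HashableDict({"locator": "./hb/4hhb.cif.gz", "fmt": "mmcif", "kwargs": HashableDict({})}),
    #    HashableDict({"locator": "./hb/4hhb/4hhb_validation.xml.gz", "fmt": "xml",
    #                  "kwargs": HashableDict({"marshalHelper": toCifWrapper})}))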

    def getLocatorsFromPaths(self, locatorObjList, pathList, locatorIndex=0):
        """Return locator objects with paths (locatorObjIndex) matching the input pathList."""
        # index the input locatorObjList
        rL = []
        try:
            if locatorObjList and isinstance(locatorObjList[0], str):
                return pathList
            #
            locIdx = {}
            for ii, locatorObj in enumerate(locatorObjList):
                if "locator" in locatorObj[locatorIndex]:
                    locIdx[locatorObj[locatorIndex]["locator"]] = ii
            #
            for pth in pathList:
                jj = locIdx[pth] if pth in locIdx else None
                if jj is not None:
                    rL.append(locatorObjList[jj])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return rL

    def getLocatorIdcodes(self, contentType, locatorObjList, locatorIndex=0):
        try:
            if locatorObjList and isinstance(locatorObjList[0], str):
                return [
                    self.__getIdcodeFromLocatorPath(contentType, pth)
                    for pth in locatorObjList
                ]
            else:
                return [
                    self.__getIdcodeFromLocatorPath(
                        contentType, locatorObj[locatorIndex]["locator"])
                    for locatorObj in locatorObjList
                ]
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return []

    def getLocatorPaths(self, locatorObjList, locatorIndex=0):
        try:
            if locatorObjList and isinstance(locatorObjList[0], str):
                return locatorObjList
            else:
                return [
                    locatorObj[locatorIndex]["locator"]
                    for locatorObj in locatorObjList
                ]
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return []

    def __getLocatorList(self, contentType, inputPathList=None):
        """Internal convenience method to return repository path list by content type:"""
        outputPathList = []
        inputPathList = inputPathList if inputPathList else []
        try:
            if contentType in ["bird", "bird_core"]:
                outputPathList = inputPathList if inputPathList else self.getBirdPathList(
                )
            elif contentType == "bird_family":
                outputPathList = inputPathList if inputPathList else self.getBirdFamilyPathList(
                )
            elif contentType in ["chem_comp"]:
                outputPathList = inputPathList if inputPathList else self.getChemCompPathList(
                )
            elif contentType in ["bird_chem_comp"]:
                outputPathList = inputPathList if inputPathList else self.getBirdChemCompPathList(
                )
            elif contentType in ["pdbx", "pdbx_core"]:
                outputPathList = inputPathList if inputPathList else self.getEntryPathList(
                )
            elif contentType in [
                    "chem_comp_core", "bird_consolidated",
                    "bird_chem_comp_core"
            ]:
                outputPathList = inputPathList if inputPathList else self.mergeBirdAndChemCompRefData(
                )
            elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]:
                outputPathList = inputPathList if inputPathList else self.getIhmDevPathList(
                )
            elif contentType in [
                    "pdb_distro", "da_internal", "status_history"
            ]:
                outputPathList = inputPathList if inputPathList else []
            else:
                logger.warning("Unsupported contentType %s", contentType)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        if self.__fileLimit:
            outputPathList = outputPathList[:self.__fileLimit]

        return sorted(outputPathList)

    def __getLocator(self,
                     contentType,
                     idCode,
                     version="v1-0",
                     checkExists=False):
        """Convenience method to return repository path for a content type and cardinal identifier."""
        pth = None
        try:
            idCodel = idCode.lower()
            if contentType == "bird":
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCode[-1], idCode + ".cif")
            elif contentType == "bird_family":
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCode[-1], idCode + ".cif")
            elif contentType in ["chem_comp", "chem_comp_core"]:
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCode[0], idCode, idCode + ".cif")
            elif contentType in ["bird_chem_comp"]:
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCode[-1], idCode + ".cif")
            elif contentType in ["pdbx", "pdbx_core"]:
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCodel[1:3], idCodel + ".cif.gz")
            elif contentType in ["bird_consolidated", "bird_chem_comp_core"]:
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCode + ".cif")
            elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]:
                pth = os.path.join(self.__getRepoTopPath(contentType), idCode,
                                   idCode + "_model_%s.cif.gz" % version)
            elif contentType in [
                    "pdb_distro", "da_internal", "status_history"
            ]:
                pass
            elif contentType in ["vrpt"]:
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCodel[1:3], idCodel,
                                   idCodel + "_validation.xml.gz")
            else:
                logger.warning("Unsupported contentType %s", contentType)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        if checkExists:
            pth = pth if self.__mU.exists(pth) else None
        return pth

    def __getIdcodeFromLocatorPath(self, contentType, pth):
        """Convenience method to return the idcode from the locator path."""
        idCode = None
        try:
            bn = os.path.basename(pth)
            if contentType in [
                    "pdbx", "pdbx_core", "bird", "bird_family", "chem_comp",
                    "chem_comp_core", "bird_consolidated",
                    "bird_chem_comp_core"
            ]:
                idCode = bn.split(".")[0]
            elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]:
                tC = bn.split(".")[0]
                idCode = "_".join(tC.split("_")[:2])
            elif contentType in [
                    "pdb_distro", "da_internal", "status_history"
            ]:
                idCode = None
            elif contentType in ["vrpt"]:
                tC = bn.split(".")[0]
                idCode = tC.split("_")[0]
            else:
                logger.warning("Unsupported contentType %s", contentType)
            idCode = idCode.upper() if idCode else None
        except Exception as e:
            logger.exception("Failing for %r %r with %s", contentType, pth,
                             str(e))
        return idCode

    def __getRepoTopPath(self, contentType):
        """Convenience method to return repository top path from configuration data."""
        pth = None
        try:
            if contentType == "bird":
                pth = self.__cfgOb.getPath("BIRD_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType == "bird_family":
                pth = self.__cfgOb.getPath("BIRD_FAMILY_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType in ["chem_comp", "chem_comp_core"]:
                pth = self.__cfgOb.getPath("CHEM_COMP_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType in ["bird_chem_comp"]:
                pth = self.__cfgOb.getPath("BIRD_CHEM_COMP_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType in ["pdbx", "pdbx_core"]:
                pth = self.__cfgOb.getPath("PDBX_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType in ["bird_consolidated", "bird_chem_comp_core"]:
                pth = self.__cachePath
            elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]:
                pth = self.__cfgOb.getPath("IHM_DEV_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType in [
                    "pdb_distro", "da_internal", "status_history"
            ]:
                pass
            elif contentType in ["vrpt"]:
                pth = self.__cfgOb.getEnvValue("VRPT_REPO_PATH_ENV",
                                               sectionName=self.__configName,
                                               default=None)
                if pth is None:
                    pth = self.__cfgOb.getPath("VRPT_REPO_PATH",
                                               sectionName=self.__configName)
                else:
                    logger.debug(
                        "Using validation report path from environment assignment %s",
                        pth)
            else:
                logger.warning("Unsupported contentType %s", contentType)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return pth

    def _chemCompPathWorker(self, dataList, procName, optionsD, workingDir):
        """Return the list of chemical component definition file paths in the current repository."""
        _ = procName
        _ = workingDir
        topRepoPath = optionsD["topRepoPath"]
        pathList = []
        for subdir in dataList:
            dd = os.path.join(topRepoPath, subdir)
            for root, _, files in os.walk(dd, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if name.endswith(".cif") and len(name) <= 7:
                        pathList.append(os.path.join(root, name))
        return dataList, pathList, []

    def getChemCompPathList(self):
        return self.__getChemCompPathList(self.__getRepoTopPath("chem_comp"),
                                          numProc=self.__numProc)

    def __getChemCompPathList(self, topRepoPath, numProc=8):
        """Get the path list for the chemical component definition repository"""
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting at %s", ts)
        startTime = time.time()
        pathList = []
        try:
            dataS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
            dataList = [a for a in dataS]
            optD = {}
            optD["topRepoPath"] = topRepoPath
            mpu = MultiProcUtil(verbose=self.__verbose)
            mpu.setOptions(optionsD=optD)
            mpu.set(workerObj=self, workerMethod="_chemCompPathWorker")
            _, _, retLists, _ = mpu.runMulti(dataList=dataList,
                                             numProc=numProc,
                                             numResults=1)
            pathList = retLists[0]
            endTime0 = time.time()
            logger.debug("Path list length %d  in %.4f seconds", len(pathList),
                         endTime0 - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return self.__applyFileLimit(pathList)

    def _entryLocatorObjWithMergeWorker(self, dataList, procName, optionsD,
                                        workingDir):
        """Return the list of entry locator objects including merge content in the current repository."""
        _ = procName
        _ = workingDir
        topRepoPath = optionsD["topRepoPath"]
        mergeContentTypes = optionsD["mergeContentTypes"]
        locatorObjList = []
        for subdir in dataList:
            dd = os.path.join(topRepoPath, subdir)
            for root, _, files in os.walk(dd, topdown=False):
                if "REMOVE" in root:
                    continue
                for fn in files:
                    if (fn.endswith(".cif.gz")
                            and len(fn) == 11) or (fn.endswith(".cif")
                                                   and len(fn) == 8):
                        locator = os.path.join(root, fn)
                        kwD = HashableDict({})
                        oL = [
                            HashableDict({
                                "locator": locator,
                                "fmt": "mmcif",
                                "kwargs": kwD
                            })
                        ]
                        for mergeContentType in mergeContentTypes:
                            idCode = fn[:4] if fn and len(fn) >= 8 else None
                            mergeLocator = self.__getLocator(
                                mergeContentType, idCode,
                                checkExists=True) if idCode else None
                            if mergeLocator:
                                kwD = HashableDict(
                                    {"marshalHelper": toCifWrapper})
                                oL.append(
                                    HashableDict({
                                        "locator": mergeLocator,
                                        "fmt": "xml",
                                        "kwargs": kwD
                                    }))
                        lObj = tuple(oL)
                        locatorObjList.append(lObj)
        return dataList, locatorObjList, []

    def getEntryLocatorObjList(self, mergeContentTypes=None):
        return self.__getEntryLocatorObjList(
            self.__getRepoTopPath("pdbx"),
            numProc=self.__numProc,
            mergeContentTypes=mergeContentTypes)

    def __getEntryLocatorObjList(self,
                                 topRepoPath,
                                 numProc=8,
                                 mergeContentTypes=None):
        """Get the path list for structure entries in the input repository"""
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting at %s", ts)
        startTime = time.time()
        pathList = []
        try:
            dataList = []
            anL = "abcdefghijklmnopqrstuvwxyz0123456789"
            for a1 in anL:
                for a2 in anL:
                    hc = a1 + a2
                    dataList.append(hc)
                    hc = a2 + a1
                    dataList.append(hc)
            dataList = list(set(dataList))
            #
            optD = {}
            optD["topRepoPath"] = topRepoPath
            optD["mergeContentTypes"] = mergeContentTypes
            mpu = MultiProcUtil(verbose=self.__verbose)
            mpu.setOptions(optionsD=optD)
            mpu.set(workerObj=self,
                    workerMethod="_entryLocatorObjWithMergeWorker")
            _, _, retLists, _ = mpu.runMulti(dataList=dataList,
                                             numProc=numProc,
                                             numResults=1)
            pathList = retLists[0]
            endTime0 = time.time()
            logger.debug("Locator object list length %d  in %.4f seconds",
                         len(pathList), endTime0 - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return self.__applyFileLimit(pathList)

    def _entryPathWorker(self, dataList, procName, optionsD, workingDir):
        """Return the list of entry file paths in the current repository."""
        _ = procName
        _ = workingDir
        topRepoPath = optionsD["topRepoPath"]
        pathList = []
        for subdir in dataList:
            dd = os.path.join(topRepoPath, subdir)
            for root, _, files in os.walk(dd, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if (name.endswith(".cif.gz")
                            and len(name) == 11) or (name.endswith(".cif")
                                                     and len(name) == 8):
                        pathList.append(os.path.join(root, name))
        return dataList, pathList, []

    def getEntryPathList(self):
        return self.__getEntryPathList(self.__getRepoTopPath("pdbx"),
                                       numProc=self.__numProc)

    def __getEntryPathList(self, topRepoPath, numProc=8):
        """Get the path list for structure entries in the input repository"""
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting at %s", ts)
        startTime = time.time()
        pathList = []
        try:
            dataList = []
            anL = "abcdefghijklmnopqrstuvwxyz0123456789"
            for a1 in anL:
                for a2 in anL:
                    hc = a1 + a2
                    dataList.append(hc)
                    hc = a2 + a1
                    dataList.append(hc)
            dataList = list(set(dataList))
            #
            optD = {}
            optD["topRepoPath"] = topRepoPath
            mpu = MultiProcUtil(verbose=self.__verbose)
            mpu.setOptions(optionsD=optD)
            mpu.set(workerObj=self, workerMethod="_entryPathWorker")
            _, _, retLists, _ = mpu.runMulti(dataList=dataList,
                                             numProc=numProc,
                                             numResults=1)
            pathList = retLists[0]
            endTime0 = time.time()
            logger.debug("Path list length %d  in %.4f seconds", len(pathList),
                         endTime0 - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return self.__applyFileLimit(pathList)

    def getBirdPathList(self):
        return self.__getBirdPathList(self.__getRepoTopPath("bird"))

    def __getBirdPathList(self, topRepoPath):
        """Return the list of definition file paths in the current repository.

        List is ordered in increasing PRD ID numerical code.
        """
        pathList = []
        try:
            sd = {}
            for root, _, files in os.walk(topRepoPath, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if name.startswith("PRD_") and name.endswith(
                            ".cif") and len(name) <= 14:
                        pth = os.path.join(root, name)
                        sd[int(name[4:-4])] = pth
            #
            for k in sorted(sd.keys()):
                pathList.append(sd[k])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return self.__applyFileLimit(pathList)

    def getBirdFamilyPathList(self):
        return self.__getBirdFamilyPathList(
            self.__getRepoTopPath("bird_family"))

    def __getBirdFamilyPathList(self, topRepoPath):
        """Return the list of definition file paths in the current repository.

        List is ordered in increasing PRD ID numerical code.
        """
        pathList = []
        try:
            sd = {}
            for root, _, files in os.walk(topRepoPath, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if name.startswith("FAM_") and name.endswith(
                            ".cif") and len(name) <= 14:
                        pth = os.path.join(root, name)
                        sd[int(name[4:-4])] = pth
            #
            for k in sorted(sd.keys()):
                pathList.append(sd[k])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return self.__applyFileLimit(pathList)

    def getBirdChemCompPathList(self):
        return self.__getBirdChemCompPathList(
            self.__getRepoTopPath("bird_chem_comp"))

    def __getBirdChemCompPathList(self, topRepoPath):
        """Return the list of definition file paths in the current repository.

        List is ordered in increasing PRD ID numerical code.
        """
        pathList = []
        try:
            sd = {}
            for root, _, files in os.walk(topRepoPath, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if name.startswith("PRDCC_") and name.endswith(
                            ".cif") and len(name) <= 16:
                        pth = os.path.join(root, name)
                        sd[int(name[6:-4])] = pth
            #
            for k in sorted(sd.keys()):
                pathList.append(sd[k])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return self.__applyFileLimit(pathList)

    def __applyFileLimit(self, pathList):
        logger.debug("Length of file path list %d (limit %r)", len(pathList),
                     self.__fileLimit)
        if self.__fileLimit:
            return pathList[:self.__fileLimit]
        else:
            return pathList

    def __buildFamilyIndex(self):
        """Using information from the PRD family definition:
        #
        loop_
        _pdbx_reference_molecule_list.family_prd_id
        _pdbx_reference_molecule_list.prd_id
            FAM_000010 PRD_000041
            FAM_000010 PRD_000042
            FAM_000010 PRD_000043
            FAM_000010 PRD_000044
            FAM_000010 PRD_000048
            FAM_000010 PRD_000049
            FAM_000010 PRD_000051
        #
        """
        prdD = {}
        try:
            pthL = self.__getLocatorList("bird_family")
            for pth in pthL:
                containerL = self.__mU.doImport(pth, fmt="mmcif")
                for container in containerL:
                    catName = "pdbx_reference_molecule_list"
                    if container.exists(catName):
                        catObj = container.getObj(catName)
                        for ii in range(catObj.getRowCount()):
                            familyPrdId = catObj.getValue(
                                attributeName="family_prd_id", rowIndex=ii)
                            prdId = catObj.getValue(attributeName="prd_id",
                                                    rowIndex=ii)
                            if prdId in prdD:
                                logger.debug(
                                    "duplicate prdId in family index %s %s",
                                    prdId, familyPrdId)
                            prdD[prdId] = {
                                "familyPrdId": familyPrdId,
                                "c": container
                            }
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return prdD

    def __buildBirdCcIndex(self):
        """Using information from the PRD pdbx_reference_molecule category to
        index the BIRDs corresponding small molecule correspondences

        """
        prdD = {}
        ccPathD = {}
        prdStatusD = {}
        try:
            ccPathL = self.__getLocatorList("chem_comp")
            ccPathD = {}
            for ccPath in ccPathL:
                _, fn = os.path.split(ccPath)
                ccId, _ = os.path.splitext(fn)
                ccPathD[ccId] = ccPath
            logger.info("Chemical component path list (%d)", len(ccPathD))
            pthL = self.__getLocatorList("bird")
            logger.info("BIRD path list (%d)", len(pthL))
            for pth in pthL:
                containerL = self.__mU.doImport(pth, fmt="mmcif")
                for container in containerL:
                    catName = "pdbx_reference_molecule"
                    if container.exists(catName):
                        catObj = container.getObj(catName)
                        ii = 0
                        prdId = catObj.getValue(attributeName="prd_id",
                                                rowIndex=ii)
                        relStatus = catObj.getValue(
                            attributeName="release_status", rowIndex=ii)
                        prdStatusD[prdId] = relStatus
                        if relStatus != "REL":
                            continue
                        prdRepType = catObj.getValue(
                            attributeName="represent_as", rowIndex=ii)
                        logger.debug("represent as %r", prdRepType)
                        if prdRepType in ["single molecule"]:
                            ccId = catObj.getValueOrDefault(
                                attributeName="chem_comp_id",
                                rowIndex=ii,
                                defaultValue=None)
                            # prdId = catObj.getValue(attributeName="prd_id", rowIndex=ii)
                            logger.debug("mapping prdId %r ccId %r", prdId,
                                         ccId)
                            if ccId and ccId in ccPathD:
                                prdD[prdId] = {
                                    "ccId": ccId,
                                    "ccPath": ccPathD[ccId]
                                }
                                ccPathD[ccPathD[ccId]] = {
                                    "ccId": ccId,
                                    "prdId": prdId
                                }
                            else:
                                logger.error("Bad ccId %r for BIRD %r", ccId,
                                             prdId)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        logger.info(
            "Candidate Chemical Components (%d) BIRDS (%d) BIRD status details (%d)",
            len(prdD), len(ccPathD), len(prdStatusD))
        return prdD, ccPathD, prdStatusD

    # -
    def mergeBirdAndChemCompRefData(self):
        prdSmallMolCcD, ccPathD, prdStatusD = self.__buildBirdCcIndex()
        logger.info("PRD to CCD index length %d CCD map path length %d",
                    len(prdSmallMolCcD), len(ccPathD))
        outputPathList = self.mergeBirdRefData(prdSmallMolCcD, prdStatusD)
        ccOutputPathList = [
            pth for pth in self.getChemCompPathList() if pth not in ccPathD
        ]
        outputPathList.extend(ccOutputPathList)
        return outputPathList

    def mergeBirdRefData(self, prdSmallMolCcD, prdStatusD):
        """Consolidate all of the bird reference data in a single container.

        If the BIRD is a 'small molecule' type then also merge with the associated CC definition.

        Store the merged data in the REPO_UTIL cache path and ...

        Return a path list for the consolidated data files -

        """
        outPathList = []
        try:
            birdPathList = self.__getLocatorList("bird")
            birdPathD = {}
            for birdPath in birdPathList:
                _, fn = os.path.split(birdPath)
                prdId, _ = os.path.splitext(fn)
                birdPathD[prdId] = birdPath
            #
            logger.info("BIRD path length %d", len(birdPathD))
            logger.debug("BIRD keys %r", list(birdPathD.keys()))
            birdCcPathList = self.__getLocatorList("bird_chem_comp")
            birdCcPathD = {}
            for birdCcPath in birdCcPathList:
                _, fn = os.path.split(birdCcPath)
                prdCcId, _ = os.path.splitext(fn)
                prdId = "PRD_" + prdCcId[6:]
                birdCcPathD[prdId] = birdCcPath
            #
            logger.info("BIRDCC path length %d", len(birdCcPathD))
            logger.debug("BIRD CC keys %r", list(birdCcPathD.keys()))
            fD = self.__buildFamilyIndex()
            logger.info("BIRD Family index length %d", len(fD))
            logger.debug("Family index keys %r", list(fD.keys()))
            logger.info("PRD to CCD small mol index length %d",
                        len(prdSmallMolCcD))
            #
            iSkip = 0
            for prdId in birdPathD:
                if prdId in prdStatusD and prdStatusD[prdId] != "REL":
                    logger.debug("Skipping BIRD with non-REL status %s", prdId)
                    iSkip += 1
                    continue
                fp = os.path.join(self.__cachePath, prdId + ".cif")
                logger.debug("Export cache path is %r", fp)
                #
                pth2 = birdPathD[prdId]
                cL = self.__mU.doImport(pth2, fmt="mmcif")
                cFull = cL[0]
                logger.debug("Got Bird %r", cFull.getName())
                #
                #
                ccBird = None
                ccD = None
                if prdId in prdSmallMolCcD:
                    pthCc = prdSmallMolCcD[prdId]["ccPath"]
                    cL = self.__mU.doImport(pthCc, fmt="mmcif")
                    ccD = cL[0]
                    logger.debug("Got corresponding CCD %r", ccD.getName())
                elif prdId in birdCcPathD:
                    pth1 = birdCcPathD[prdId]
                    c1L = self.__mU.doImport(pth1, fmt="mmcif")
                    ccBird = c1L[0]
                    logger.debug("Got ccBird %r", ccBird.getName())
                    #
                cFam = None
                if prdId in fD:
                    cFam = fD[prdId]["c"]
                    logger.debug("Got cFam %r", cFam.getName())
                #
                if ccD:
                    for catName in ccD.getObjNameList():
                        cFull.append(ccD.getObj(catName))
                #
                if ccBird:
                    for catName in ccBird.getObjNameList():
                        cFull.append(ccBird.getObj(catName))
                if cFam:
                    for catName in cFam.getObjNameList():
                        cFull.append(cFam.getObj(catName))
                #
                self.__mU.doExport(fp, [cFull], fmt="mmcif")
                outPathList.append(fp)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        logger.info(
            "Merged BIRD/Family/CC path length %d (skipped non-released %d)",
            len(outPathList), iSkip)
        return outPathList
        #

    def __exportConfig(self, container):
        """
        - CATEGORY_NAME: diffrn_detector
          ATTRIBUTE_NAME_LIST:
              - pdbx_frequency
        - CATEGORY_NAME: pdbx_serial_crystallography_measurement
          ATTRIBUTE_NAME_LIST:
              - diffrn_id
              - pulse_energy
              - pulse_duration
              - xfel_pulse_repetition_rate
        """
        for catName in container.getObjNameList():
            cObj = container.getObj(catName)
            print("- CATEGORY_NAME: %s" % catName)
            print("  ATTRIBUTE_NAME_LIST:")
            for atName in cObj.getAttributeList():
                print("       - %s" % atName)
        return True

    def getIhmDevPathList(self):
        return self.__getIhmDevPathList(self.__getRepoTopPath("ihm_dev"))

    def __getIhmDevPathList(self, topRepoPath):
        """Return the list of I/HM entries in the current repository.

        File name template is: PDBDEV_00000020_model_v1-0.cif.gz

        List is ordered in increasing PDBDEV numerical code.
        """
        pathList = []
        logger.debug("Searching path %r", topRepoPath)
        try:
            sd = {}
            for root, _, files in os.walk(topRepoPath, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if name.startswith("PDBDEV_") and name.endswith(
                            ".cif.gz") and len(name) <= 50:
                        pth = os.path.join(root, name)
                        sd[int(name[7:15])] = pth
            #
            for k in sorted(sd.keys()):
                pathList.append(sd[k])
        except Exception as e:
            logger.exception("Failing search in %r with %s", topRepoPath,
                             str(e))
        #
        return self.__applyFileLimit(pathList)
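
# A hedged sketch of the BIRD/CC consolidation entry point above (cfgOb is a
# ConfigUtil instance; merged files land in the REPO_UTIL cache path):
#
#   rP = RepositoryProvider(cfgOb, cachePath="./CACHE")
#   mergedPathL = rP.mergeBirdAndChemCompRefData()
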
class ChemRefMappingProvider(StashableBase):
    """Accessors for chemical reference identifier mapping data."""
    def __init__(self, cachePath, useCache=True):
        #
        self.__cachePath = cachePath
        self.__useCache = useCache
        self.__dirName = "chemref-mapping"
        super(ChemRefMappingProvider, self).__init__(self.__cachePath,
                                                     [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__rD = {}
        self.__mapD = self.__reload(self.__dirPath, useCache)
        #

    def testCache(self, minCount=0):
        logger.info(
            "Mapping count %d",
            len(self.__mapD["mapping"]) if "mapping" in self.__mapD else 0)
        if minCount == 0 or (self.__mapD and "mapping" in self.__mapD
                             and len(self.__mapD["mapping"]) >= minCount):
            return True
        else:
            return False

    def getReferenceIds(self, referenceResourceName, localId):
        """Get the identifiers in the reference resource corresponding to input local
        identifiers (Chemical Component or BIRD).

        Args:
            referenceResourceName (str): chemical reference resource name (DrugBank, ChEMBL, ChEBI, PubChem, ...)
            localId (str): local identifier for a Chemical Component or BIRD definition

        Returns:
            list: list of reference identifiers
        """
        if not self.__rD:
            for rN, forwardD in self.__mapD["mapping"].items():
                # {refId :[lId, lId, ...], ...}
                reverseD = {}
                for refId, rcsbIdL in forwardD.items():
                    for rId in rcsbIdL:
                        reverseD.setdefault(rId, []).append(refId)
                self.__rD[rN] = reverseD
        #
        try:
            return self.__rD[referenceResourceName.upper()][localId]
        except Exception:
            return []

    def getLocalIds(self, referenceResourceName, referenceId):
        """Get the local identifiers (Chemical Component or BIRD) corresponding to identifiers in
        chemical reference resource.

        Args:
            referenceResourceName (str): chemical reference resource name (DrugBank, ChEMBL, ChEBI, PubChem, ...)
            referenceId (str): identifier in the chemical reference resource

        Returns:
            list: list of local Chemical Component or BIRD identifiers
        """
        try:
            return self.__mapD["mapping"][
                referenceResourceName.upper()][referenceId]
        except Exception:
            return []
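
    # A minimal usage sketch for the mapping accessors above (cache path is
    # illustrative; assumes a previously exported chemref-mapping-data.json;
    # DB00171 is the DrugBank accession for ATP):
    #
    #   crmP = ChemRefMappingProvider(cachePath="./CACHE", useCache=True)
    #   if crmP.testCache(minCount=1):
    #       refIdL = crmP.getReferenceIds("DrugBank", "ATP")      # e.g. ["DB00171"]
    #       localIdL = crmP.getLocalIds("DrugBank", "DB00171")    # e.g. ["ATP"]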

    def __getMappingDataPath(self):
        return os.path.join(self.__dirPath, "chemref-mapping-data.json")

    def __reload(self, dirPath, useCache):
        startTime = time.time()
        fD = {}
        ok = False
        mappingPath = self.__getMappingDataPath()
        #
        logger.info("useCache %r mappingPath %r", useCache, mappingPath)
        if useCache and self.__mU.exists(mappingPath):
            fD = self.__mU.doImport(mappingPath, fmt="json")
            ok = True
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        logger.info("Completed reload with status (%r) at %s (%.4f seconds)",
                    ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    time.time() - startTime)
        return fD

    def fetchChemRefMapping(self, cfgOb, referenceResourceNameList=None):
        """Fetch reference resource mapping for chemical component and BIRD definitions

        Args:
            cfgOb (obj): instance configuration class ConfigUtil()
            referenceResourceNameList (list, optional): list of chemical reference resources. Defaults to [DrugBank, ChEMBL].

        Returns:
            bool: True for success or False otherwise
        """
        try:
            rnL = referenceResourceNameList if referenceResourceNameList is not None else [
                "DrugBank", "ChEMBL"
            ]
            mD = {}
            crExt = ChemRefExtractor(cfgOb)
            for referenceResourceName in rnL:
                idD = crExt.getChemCompAccessionMapping(
                    referenceResourceName=referenceResourceName)
                logger.info("%s mapping dictionary (%d)",
                            referenceResourceName, len(idD))
                mD[referenceResourceName.upper()] = idD
            #
            fp = self.__getMappingDataPath()
            tS = datetime.datetime.now().isoformat()
            vS = datetime.datetime.now().strftime("%Y-%m-%d")
            ok = self.__mU.doExport(fp, {
                "version": vS,
                "created": tS,
                "mapping": mD
            },
                                    fmt="json",
                                    indent=3)
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False


class PharosTargetCofactorProvider(StashableBase):
    """Accessors for Pharos target cofactors."""
    def __init__(self, **kwargs):
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__dirName = "Pharos-cofactors"
        super(PharosTargetCofactorProvider,
              self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__fD = self.__reload(self.__dirPath, **kwargs)
        #

    def testCache(self, minCount=1):
        logger.info(
            "Pharos cached cofactor count %d",
            len(self.__fD["cofactors"]) if "cofactors" in self.__fD else 0)
        if self.__fD and "cofactors" in self.__fD and len(
                self.__fD["cofactors"]) > minCount:
            return True
        else:
            return False

    def hasTarget(self, rcsbEntityId):
        return rcsbEntityId.upper() in self.__fD["cofactors"]

    def getTargets(self, rcsbEntityId):
        try:
            return self.__fD["cofactors"][rcsbEntityId.upper()]
        except Exception:
            return []

    def __getCofactorDataPath(self):
        return os.path.join(self.__dirPath, "Pharos-cofactor-data.json")

    def reload(self):
        self.__fD = self.__reload(self.__dirPath, useCache=True)
        return True

    def __reload(self, dirPath, **kwargs):
        startTime = time.time()
        fD = {}
        useCache = kwargs.get("useCache", True)
        ok = False
        cofactorPath = self.__getCofactorDataPath()
        #
        logger.info("useCache %r cofactorPath %r", useCache, cofactorPath)
        if useCache and self.__mU.exists(cofactorPath):
            fD = self.__mU.doImport(cofactorPath, fmt="json")
            ok = True
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        numCofactors = len(fD["cofactors"]) if fD and "cofactors" in fD else 0
        logger.info(
            "Completed reload of (%d) cofactors with status (%r) at %s (%.4f seconds)",
            numCofactors, ok,
            time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
            time.time() - startTime)
        return fD

    def buildCofactorList(self,
                          sequenceMatchFilePath,
                          crmpObj=None,
                          lnmpObj=None,
                          maxActivity=5):
        """Build target cofactor list for the matching entities in the input sequence match file.

        Args:
            sequenceMatchFilePath (str): sequence match output file path
            crmpObj (obj, optional): instance of ChemRefMappingProviderObj(). Defaults to None.
            lnmpObj (obj, optional): instance of LigandNeighborMappingProviderObj(). Defaults to None.
            maxActivity (int, optional): maximum number of prioritized activity records per target. Defaults to 5.

        Returns:
            bool: True for success or False otherwise

            Example Pharos activity record -

            {
            "version": "2021-06-17",
            "created": "2021-06-17T11:10:54.563394",
            "activity": {
                "2232": [
                    {
                        "smiles": "CC(=CCC\\\\C(=C/Cc1c(O)cc(O)c(C(=O)CCc2ccc(O)cc2)c1O)\\\\C)C",
                        "chemblId": "CHEMBL3360923",
                        "pubChemId": "118724585",
                        "activity": 6.0,
                        "activityType": "IC50",
                        "activityUnits": "nM",
                        "name": "1-[3-(3,7-dimethylocta-2,6-dien-1-yl)-2,4,6-trihydroxyphenyl]-3-(4-hydroxyphenyl)propan-1-one",
                        "pubmedId": "25375026",
                        "patent": "USxxxxxx",
                    }, ...
        """
        rDL = []
        mD = self.__mU.doImport(sequenceMatchFilePath, fmt="json")
        # ---
        chaP = PharosTargetActivityProvider(cachePath=self.__cachePath,
                                            useCache=True)
        #
        provenanceSource = "Pharos"
        refScheme = "PDB entity"
        assignVersion = chaP.getAssignmentVersion()
        for queryId, matchDL in mD.items():
            # "O43508|uniprotId|7987|proteinId|9606|taxId"
            qCmtD = self.__decodeComment(queryId)
            unpId = qCmtD["uniprotId"]
            queryTaxId = qCmtD["taxId"] if "taxId" in qCmtD else None
            pharosId = qCmtD["proteinId"]
            if queryTaxId == "-1":
                logger.debug("Skipping target with missing taxonomy %r (%r)",
                             unpId, pharosId)
                continue
            #
            if not chaP.hasTargetActivity(pharosId):
                logger.debug("Target with no activity data %r (%r)", unpId, pharosId)
                # continue (targets without activities are currently retained)
            # --
            chemCompNeighborsD = {}
            if lnmpObj:
                for matchD in matchDL:
                    tCmtD = self.__decodeComment(matchD["target"])
                    entryId = tCmtD["entityId"].split("_")[0]
                    entityId = tCmtD["entityId"].split("_")[1]
                    rcsbEntityId = entryId + "_" + entityId
                    chemCompIdList = lnmpObj.getLigandNeighbors(rcsbEntityId)
                    chemCompNeighborsD.update(
                        {k: True
                         for k in chemCompIdList})
            # --
            queryName = chaP.getTargetInfo(pharosId, "name")
            # --
            for matchD in matchDL:
                tCmtD = self.__decodeComment(matchD["target"])
                entryId = tCmtD["entityId"].split("_")[0]
                entityId = tCmtD["entityId"].split("_")[1]
                rcsbEntityId = entryId + "_" + entityId
                #
                taDL = chaP.getTargetActivity(pharosId)
                logger.debug("Target %r has (%d) activity records", pharosId,
                             len(taDL))
                actL = []
                # cfDL = []
                chD = {}
                for taD in taDL:
                    if taD["chemblId"] in chD:
                        chD[taD["chemblId"]] = True
                        continue

                    actD = {
                        "cofactor_id": taD["chemblId"],
                        "cofactor_name": taD["name"] if "name" in taD else None,
                        # The measurement type is exported with a "p" prefix (e.g. IC50 -> pIC50).
                        "measurement_type": "p" + taD["activityType"],
                        "measurement_value": taD["activity"],
                        "pubmed_ids": [taD["pubmedId"]] if "pubmedId" in taD else None,
                        "patent_nos": taD["patents"] if "patents" in taD else None,
                        "smiles": taD["smiles"] if "smiles" in taD else None,
                        "action": taD["action"] if "action" in taD else None,
                        "pharmacology": taD["pharmacology"] if "pharmacology" in taD else None,
                    }
                    actD = self.__addLocalIds(actD, crmpObj=crmpObj)
                    actL.append(actD)
                #
                actL = self.__activityListSelect(actL,
                                                 chemCompNeighborsD,
                                                 maxActivity=maxActivity)
                if not actL:
                    logger.debug("No Pharos cofactors for %s %s", pharosId,
                                 unpId)
                # ---
                # aligned_target.entity_beg_seq_id (current target is PDB entity in json)
                # aligned_target.target_beg_seq_id (current query is target seq in json)
                # aligned_target.length
                fpL = []
                if "alignedRegions" in matchD:
                    fpL = [{
                        "entity_beg_seq_id": arD["targetBegin"],
                        "target_beg_seq_id": arD["queryBegin"],
                        "length": arD["targetEnd"] - arD["targetBegin"],
                    } for arD in matchD["alignedRegions"]]
                else:
                    fpL = [{
                        "entity_beg_seq_id": matchD["targetBegin"],
                        "target_beg_seq_id": matchD["queryBegin"],
                        "length": matchD["alignLen"],
                    }]
                # ---
                rD = {
                    "entry_id": entryId,
                    "entity_id": entityId,
                    "query_uniprot_id": unpId,
                    "query_id": pharosId,
                    "query_id_type": "Pharos",
                    "query_name": queryName,
                    "provenance_source": provenanceSource,
                    "reference_scheme": refScheme,
                    "assignment_version": assignVersion,
                    "query_taxonomy_id": int(queryTaxId) if queryTaxId else None,
                    "target_taxonomy_id": int(matchD["targetTaxId"]) if "targetTaxId" in matchD else None,
                    "aligned_target": fpL,
                    "taxonomy_match_status": matchD["taxonomyMatchStatus"] if "taxonomyMatchStatus" in matchD else None,
                    "lca_taxonomy_id": matchD["lcaTaxId"] if "lcaTaxId" in matchD else None,
                    "lca_taxonomy_name": matchD["lcaTaxName"] if "lcaTaxName" in matchD else None,
                    "lca_taxonomy_rank": matchD["lcaRank"] if "lcaRank" in matchD else None,
                    "cofactors": actL,
                }
                rDL.append(rD)
        #
        qD = {}
        for rD in rDL:
            eId = rD["entry_id"] + "_" + rD["entity_id"]
            qD.setdefault(eId, []).append(rD)
        #
        fp = self.__getCofactorDataPath()
        tS = datetime.datetime.now().isoformat()
        # vS = datetime.datetime.now().strftime("%Y-%m-%d")
        vS = assignVersion
        ok = self.__mU.doExport(fp, {
            "version": vS,
            "created": tS,
            "cofactors": qD
        },
                                fmt="json",
                                indent=3)
        return ok
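
    # Illustration (added note): the exported document groups cofactor records
    # by "entryId_entityId", e.g.
    #   {"version": ..., "created": ..., "cofactors": {"1ABC_1": [rD, ...]}}
    # where "1ABC" is a placeholder entry id.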

    def __addLocalIds(self, cfD, crmpObj=None):
        #
        if crmpObj:
            localIdL = crmpObj.getLocalIds("CHEMBL", cfD["cofactor_id"])
            if localIdL:
                localId = localIdL[0]
                if localId.startswith("PRD_"):
                    cfD["prd_id"] = localId
                else:
                    cfD["chem_comp_id"] = localId
        return cfD

    def __activityListSelect(self,
                             activityDL,
                             chemCompNeighborsD,
                             maxActivity=5):
        """Prioritizing the activity data for locally mapped neighbor ligands and the best binding examples.

        Args:
            activityDL (list): full list of activity objects
            chemCompNeighborsD (dict, optional): index of all chemical components with neighbor interactions to the query target. Defaults {}.
            maxCount (int, optional): maximum number of activity object returned. Defaults to 5.

        Returns:
            list: prioritized and trimmed list of activity objects
        """
        retL = []
        mappedNeighborL = []
        unmappedL = activityDL

        if chemCompNeighborsD:
            unmappedL = []
            # Select out any molecules that map to a neighbor chemical component.
            for activityD in activityDL:
                if "chem_comp_id" in activityD and activityD[
                        "chem_comp_id"] in chemCompNeighborsD:
                    activityD["neighbor_in_pdb"] = "Y"
                    mappedNeighborL.append(activityD)
                else:
                    activityD["neighbor_in_pdb"] = "N"
                    unmappedL.append(activityD)
        #
        numLeft = maxActivity - len(mappedNeighborL)
        if numLeft > 0:
            unmappedL = sorted(unmappedL,
                               key=lambda k: k["measurement_value"],
                               reverse=True)
            retL = mappedNeighborL
            retL.extend(unmappedL[:numLeft])
            retL = sorted(retL,
                          key=lambda k: k["measurement_value"],
                          reverse=True)
        else:
            logger.debug(
                "Mapped neighbor cofactors (%d) excluded unmapped (%d)",
                len(mappedNeighborL), len(unmappedL))
            retL = sorted(mappedNeighborL,
                          key=lambda k: k["measurement_value"],
                          reverse=True)

        return retL
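
    # Illustration (added note): with chemCompNeighborsD = {"ATP": True} and
    # maxActivity = 2, toy activity records A (chem_comp_id "ATP", value 5.0),
    # B (value 8.5), and C (value 6.1) select as follows: A is kept as a
    # mapped neighbor, B (the best remaining value) fills the single open
    # slot, and the result is sorted by measurement_value as [B, A].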

    def __decodeComment(self, comment, separator="|"):
        dD = {}
        try:
            ti = iter(comment.split(separator))
            dD = {tup[1]: tup[0] for tup in zip(ti, ti)}
        except Exception:
            pass
        return dD
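
# Standalone sketch (added for illustration): the pairwise decoding trick used
# by __decodeComment() above. zip(ti, ti) draws two items at a time from a
# single iterator, so "value|key|value|key|..." decodes to {key: value, ...}.
def _decodeCommentDemo(comment, separator="|"):
    ti = iter(comment.split(separator))
    return {tup[1]: tup[0] for tup in zip(ti, ti)}

# _decodeCommentDemo("O43508|uniprotId|7987|proteinId|9606|taxId")
# -> {"uniprotId": "O43508", "proteinId": "7987", "taxId": "9606"}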
# Example #11
    def search(self, dataList, procName, optionsD, workingDir):
        """Worker method to execute a shell to search CCDC for the input mol2 path list.

        Args:
            dataList (list): list of mol2 file paths to be searched
            procName (str): processName
            optionsD (dict): dictionary of options
            workingDir (str): path to working directory (not used)

        Returns:
            (successList, resultList, []): success and result lists of mol2 paths with CCDC matches
        """
        resultPath = optionsD["resultPath"]
        searchType = optionsD["searchType"]
        pythonRootPath = optionsD["pythonRootPath"]
        csdHome = optionsD["csdHome"]
        timeOut = optionsD["timeOut"]
        timeOut = timeOut if timeOut and timeOut > 0 else 120
        _ = workingDir
        resultList = []
        startTime = time.time()
        logger.info("starting %s at %s", procName,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
        #
        try:
            stopPath = os.path.join(resultPath, "STOP")
            logger.info("%s starting search data length %d", procName,
                        len(dataList))
            if self.__checkStop(stopPath):
                logger.info("%s stopping", procName)
                return resultList, resultList, []
            #
            queryListFilePath = os.path.join(resultPath, procName,
                                             "queryFileList.list")
            mU = MarshalUtil()
            ok = mU.doExport(queryListFilePath, dataList, fmt="list")
            if not ok:
                return resultList, resultList, []
            #
            exU = ExecUtils()
            logger.debug("%s executing shell for %s", procName,
                         queryListFilePath)
            cmdPath = os.path.join(pythonRootPath, "bin", "ccdc_search_cli")
            hitListPath = os.path.join(resultPath, procName, "hitList.list")
            logPath = os.path.join(resultPath, procName, "execlog.log")

            logger.debug("cmdPath %r", cmdPath)
            ok = exU.runShell(
                "%s --mol_list_path %s --result_path %s --search_type %s --csdhome %s --hit_list_path %s"
                % (cmdPath, queryListFilePath, resultPath, searchType, csdHome,
                   hitListPath),
                outPath=logPath,
                outAppend=True,
                timeOut=timeOut,
                suppressStderr=False,
            )
            #
            if ok and mU.exists(hitListPath):
                resultList = mU.doImport(hitListPath, fmt="list")
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        endTime = time.time()
        logger.info("%s (result length %d) completed at %s (%.2f seconds)",
                    procName, len(resultList),
                    time.strftime("%Y %m %d %H:%M:%S",
                                  time.localtime()), endTime - startTime)
        return resultList, resultList, []
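
# Sketch (added for illustration): a worker method like search() above is
# typically driven through the MultiProcUtil pattern used in
# __calculateNeighbors() later in this document; the worker instance name
# here is hypothetical.
#
#   mpu = MultiProcUtil(verbose=True)
#   mpu.setOptions({"resultPath": resultPath, "searchType": "similarity",
#                   "pythonRootPath": pythonRootPath, "csdHome": csdHome,
#                   "timeOut": 120})
#   mpu.set(workerObj=ccdcSearchWorker, workerMethod="search")
#   ok, failList, resultList, _ = mpu.runMulti(dataList=mol2PathList,
#                                              numProc=4, numResults=1,
#                                              chunkSize=10)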
# Example #12
class NeighborInteractionProvider(object):
    """Generators and accessors for non-polymer instance target interactions."""
    def __init__(self, cfgOb, configName, cachePath, **kwargs):
        #
        self.__version = __version__
        self.__cfgOb = cfgOb
        self.__configName = configName
        self.__cachePath = cachePath
        self.__fileLimit = kwargs.get("fileLimit", None)
        self.__dirPath = os.path.join(cachePath, "neighbor-interactions")
        self.__numProc = kwargs.get("numProc", 2)
        self.__chunkSize = kwargs.get("chunkSize", 10)
        useCache = kwargs.get("useCache", True)
        #
        #  - Configuration for stash services -
        #    Local target directory name to be stashed.  (subdir of dirPath)
        #
        self.__stashDir = "ligand-target-neighbors"
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        self.__neighborD = self.__reload(fmt="pickle", useCache=useCache)
        #

    def testCache(self, minCount=0):
        try:
            if minCount == 0:
                return True
            if self.__neighborD and minCount and len(
                    self.__neighborD["entries"]) >= minCount:
                logger.info(
                    "Target neighbor data for (%d) entries created %r version %r",
                    len(self.__neighborD["entries"]),
                    self.__neighborD["created"], self.__neighborD["version"])
                return True
        except Exception:
            pass
        return False

    def getLigandNeighborIndex(self, entryId):
        """Return the target neighbors for the non-polymer instances for the input entry.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {ligandAsymId: {(targetAsymId, targetAuthSeqId): nnIndex1, ...}, ...}
        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["ligandNeighborIndexD"]
        except Exception:
            pass
        return {}

    def getTargetNeighborIndex(self, entryId):
        """Return the ligand neighbors for the polymer or branched entity instances in the input entry.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {(targetAsymId, targetAuthSeqId): {ligandAsymId: nnIndex1, ...}, ...}

        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["targetNeighborIndexD"]
        except Exception:
            pass
        return {}

    def getNearestNeighborList(self, entryId):
        """Return the list of neares neighbors for the entry.

        Args:
            entryId (str): entry identifier

        Returns:
            list: [LigandTargetInstance(), ...]

        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["nearestNeighbors"]
        except Exception:
            pass
        return []

    def getLigandNeighborBoundState(self, entryId):
        """Return the dicitonary of ligand instances with isBound boolean status.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {ligandAsymId: True if isBound,  ...  }
        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["ligandIsBoundD"]
        except Exception:
            pass
        return {}

    def getAtomCounts(self, entryId):
        """Return the non-polymer instance atom counts for the input entry (all reported atoms).

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {asymId: {'FL': count, 'altA': count, 'altB': count, ... }}
        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["ligandAtomCountD"]
        except Exception:
            pass
        return {}

    def getHydrogenAtomCounts(self, entryId):
        """Return the non-polymer instance hydrogen atom counts for the input entry.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {asymId: {'FL': count, 'altA': count, 'altB': count, ... }}
        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["ligandHydrogenAtomCountD"]
        except Exception:
            pass
        return {}

    def hasEntry(self, entryId):
        """Return if the input entry is stored in the cache of non-polymer instance target interactions.

        Args:
            entryId (str): entry identifier

        Returns:
            (bool): True if entry is in the cache or False otherwise
        """
        try:
            return entryId in self.__neighborD["entries"]
        except Exception:
            pass
        return False

    def getEntries(self):
        """Return a list of entry identifier for which non-polymer instance target interactions are stored.

        Returns:
            (list): [entryId, entryId, ... ]
        """
        try:
            return list(self.__neighborD["entries"].keys())
        except Exception:
            pass
        return []
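
    # Illustration (added note): typical read-only use of the accessors above,
    # assuming a previously generated cache:
    #   niP = NeighborInteractionProvider(cfgOb, configName, cachePath)
    #   for entryId in niP.getEntries():
    #       boundD = niP.getLigandNeighborBoundState(entryId)
    #       nnL = niP.getNearestNeighborList(entryId)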

    def generate(self,
                 distLimit=5.0,
                 updateOnly=False,
                 fmt="pickle",
                 indent=0):
        """Generate and export non-polymer target interactions for all of the structures in the repository.

        Args:
            distLimit (float, optional): interaction distance. Defaults to 5.0.
            updateOnly (bool):  only calculate interactions for new entries.  Defaults to False.
            fmt (str, optional): export file format. Defaults to "pickle".
            indent (int, optional): json format indent. Defaults to 0.

        Returns:
            bool: True for success or False otherwise
        """
        ok = False
        try:
            tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
            tD = self.__calculateNeighbors(distLimit=distLimit,
                                           numProc=self.__numProc,
                                           chunkSize=self.__chunkSize,
                                           updateOnly=updateOnly)
            self.__neighborD = {
                "version": self.__version,
                "created": tS,
                "entries": tD
            }
            kwargs = {"indent": indent} if fmt == "json" else {"pickleProtocol": 4}
            targetFilePath = self.__getTargetFilePath(fmt=fmt)
            ok = self.__mU.doExport(targetFilePath,
                                    self.__neighborD,
                                    fmt=fmt,
                                    **kwargs)
            logger.info("Wrote %r status %r", targetFilePath, ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok
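
    # Illustration (added note): a full build followed by an incremental
    # update that only computes interactions for entries not yet cached:
    #   niP.generate(distLimit=5.0, updateOnly=False, fmt="pickle")
    #   niP.generate(distLimit=5.0, updateOnly=True, fmt="pickle")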

    def reload(self, fmt="pickle"):
        self.__neighborD = self.__reload(fmt=fmt, useCache=True)
        return self.__neighborD is not None

    def __reload(self, fmt="pickle", useCache=True):
        """Reload from the current cache file."""
        # Build the default payload first so a well-formed dictionary is
        # returned even if the cache import below fails.
        targetFilePath = self.__getTargetFilePath(fmt=fmt)
        tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        neighborD = {"version": self.__version, "created": tS, "entries": {}}
        try:
            logger.debug("useCache %r targetFilePath %r", useCache,
                         targetFilePath)
            #
            if useCache and self.__mU.exists(targetFilePath):
                neighborD = self.__mU.doImport(targetFilePath, fmt=fmt)
                if fmt != "pickle":
                    for _, nD in neighborD["entries"].items():
                        nD["nearestNeighbors"] = [
                            LigandTargetInstance(*neighbor)
                            for neighbor in nD["nearestNeighbors"]
                        ]
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return neighborD

    def __getTargetFilePath(self, fmt="pickle"):
        ext = "pic" if fmt == "pickle" else "json"
        pth = os.path.join(self.__dirPath, "ligand-target-neighbors",
                           "neighbor-data." + ext)
        return pth

    def __calculateNeighbors(self,
                             distLimit=5.0,
                             numProc=2,
                             chunkSize=10,
                             updateOnly=False):
        """Calculate non-polymer target interactions for all repository structure files.

        Args:
            distLimit (float, optional): interaction distance limit. Defaults to 5.0.
            numProc (int, optional): number of processes to use. Defaults to 2.
            chunkSize (int, optional): incremental chunk size used to distribute work among processes. Defaults to 10.
            updateOnly (bool, optional): only calculate interactions for entries not already cached. Defaults to False.

        Returns:
            (dict): {entryId: {asymId: [TargetLigandInteraction()], ...}, ...}
        """
        contentType = "pdbx"
        mergeContent = None
        rD = {}
        exD = {}
        #
        # updateOnly - will reuse any existing data loaded when this is instantiated
        #              otherwise the cache context is cleared before the calculation.
        if updateOnly:
            exD = {k: True for k in self.getEntries()}
            rD = self.__neighborD[
                "entries"] if "entries" in self.__neighborD else {}
        #
        locatorObjList = self.__rpP.getLocatorObjList(
            contentType=contentType,
            mergeContentTypes=mergeContent,
            excludeIds=exD)
        logger.info("Starting with %d numProc %d updateOnly (%r)",
                    len(locatorObjList), self.__numProc, updateOnly)
        #
        rWorker = TargetInteractionWorker(self.__rpP)
        mpu = MultiProcUtil(verbose=True)
        optD = {"distLimit": distLimit}
        mpu.setOptions(optD)
        mpu.set(workerObj=rWorker, workerMethod="build")
        ok, failList, resultList, _ = mpu.runMulti(dataList=locatorObjList,
                                                   numProc=numProc,
                                                   numResults=1,
                                                   chunkSize=chunkSize)
        if failList:
            logger.info("Target interaction build failures (%d): %r",
                        len(failList), failList)
        #
        for (entryId, nD) in resultList[0]:
            rD[entryId] = nD
        #
        logger.info(
            "Completed with multi-proc status %r failures %r total entries with data (%d)",
            ok, len(failList), len(rD))
        return rD

    def toStash(self):
        ok = False
        try:
            userName = self.__cfgOb.get("_STASH_AUTH_USERNAME",
                                        sectionName=self.__configName)
            password = self.__cfgOb.get("_STASH_AUTH_PASSWORD",
                                        sectionName=self.__configName)
            basePath = self.__cfgOb.get("_STASH_SERVER_BASE_PATH",
                                        sectionName=self.__configName)
            url = self.__cfgOb.get("STASH_SERVER_URL",
                                   sectionName=self.__configName)
            urlFallBack = self.__cfgOb.get("STASH_SERVER_FALLBACK_URL",
                                           sectionName=self.__configName)
            ok = self.__toStash(url,
                                basePath,
                                userName=userName,
                                password=password)
            ok = self.__toStash(urlFallBack,
                                basePath,
                                userName=userName,
                                password=password)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def __toStash(self,
                  url,
                  stashRemoteDirPath,
                  userName=None,
                  password=None,
                  remoteStashPrefix=None):
        """Copy tar and gzipped bundled cache data to remote server/location.

        Args:
            url (str): server URL (e.g. sftp://hostname.domain) None for local host
            stashRemoteDirPath (str): path to target directory on remote server
            userName (str, optional): server username. Defaults to None.
            password (str, optional): server password. Defaults to None.
            remoteStashPrefix (str, optional): channel prefix. Defaults to None.

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            stU = StashUtil(os.path.join(self.__dirPath, "stash"),
                            "ligand-target-neighbors")
            ok = stU.makeBundle(self.__dirPath, [self.__stashDir])
            if ok:
                ok = stU.storeBundle(url,
                                     stashRemoteDirPath,
                                     remoteStashPrefix=remoteStashPrefix,
                                     userName=userName,
                                     password=password)
        except Exception as e:
            logger.error("Failing with url %r stashDirPath %r: %s", url,
                         stashRemoteDirPath, str(e))
        return ok

    def fromStash(self):
        try:
            minCount = 10
            userName = self.__cfgOb.get("_STASH_AUTH_USERNAME",
                                        sectionName=self.__configName)
            password = self.__cfgOb.get("_STASH_AUTH_PASSWORD",
                                        sectionName=self.__configName)
            basePath = self.__cfgOb.get("_STASH_SERVER_BASE_PATH",
                                        sectionName=self.__configName)
            url = self.__cfgOb.get("STASH_SERVER_URL",
                                   sectionName=self.__configName)
            #
            ok = self.__fromStash(url,
                                  basePath,
                                  userName=userName,
                                  password=password)
            ok = self.reload()
            ok = self.testCache(minCount=minCount)
            if not ok:
                urlFallBack = self.__cfgOb.get("STASH_SERVER_FALLBACK_URL",
                                               sectionName=self.__configName)
                ok = self.__fromStash(urlFallBack,
                                      basePath,
                                      userName=userName,
                                      password=password)
                ok = self.reload()
                ok = self.testCache(minCount=minCount)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return ok

    def __fromStash(self,
                    url,
                    stashRemoteDirPath,
                    userName=None,
                    password=None,
                    remoteStashPrefix=None):
        """Restore local cache from a tar and gzipped bundle to fetched from a remote server/location.

        Args:
            url (str): server URL (e.g. sftp://hostname.domain) None for local host
            stashRemoteDirPath (str): path to target directory on remote server
            userName (str, optional): server username. Defaults to None.
            password (str, optional): server password. Defaults to None.
            remoteStashPrefix (str, optional): channel prefix. Defaults to None.

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            stU = StashUtil(os.path.join(self.__dirPath, "stash"),
                            "ligand-target-neighbors")
            ok = stU.fetchBundle(self.__dirPath,
                                 url,
                                 stashRemoteDirPath,
                                 remoteStashPrefix=remoteStashPrefix,
                                 userName=userName,
                                 password=password)
        except Exception as e:
            logger.error("Failing with url %r stashDirPath %r: %s", url,
                         stashRemoteDirPath, str(e))
        return ok

    def convert(self, fmt1="json", fmt2="pickle"):
        #
        targetFilePath = self.__getTargetFilePath(fmt=fmt1)
        self.__neighborD = self.__mU.doImport(targetFilePath, fmt=fmt1)
        #
        targetFilePath = self.__getTargetFilePath(fmt=fmt2)
        ok = self.__mU.doExport(targetFilePath,
                                self.__neighborD,
                                fmt=fmt2,
                                pickleProtocol=4)
        return ok
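
# Sketch (added for illustration): round-tripping the cached neighbor data
# with the helpers above, e.g. to inspect a pickle cache as JSON and to
# mirror it through the configured stash server:
#   niP.convert(fmt1="pickle", fmt2="json")
#   niP.toStash()    # bundle and push the cache
#   niP.fromStash()  # fetch, restore, and reload it elsewhere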
# Example #13
class EntityInstanceExtractor(object):
    """Selected utilities to extract data from entity instance collections.

    >>> from itertools import groupby
    >>> from operator import itemgetter
    >>>
    >>> seq2 = [1, 2, 4, 5, 6, 8, 9, 10]
    >>> groups = []
    >>> for k, g in groupby(enumerate(seq2), lambda ix: ix[0] - ix[1]):
    ...     groups.append(list(map(itemgetter(1), g)))
    ...
    >>> print(groups)
    [[1, 2], [4, 5, 6], [8, 9, 10]]
    Or as a list comprehension:

    >>> [list(map(itemgetter(1), g)) for k, g in groupby(enumerate(seq2), lambda ix: ix[0] - ix[1])]
    [[1, 2], [4, 5, 6], [8, 9, 10]]


    ##
    ##

    import numpy as np

    def main():
        # Generate some random data.
        x = np.cumsum(np.random.random(1000) - 0.5)
        condition = np.abs(x) < 1

        # Print the start and stop indices of each region where the absolute
        # values of x are below 1, and the min and max of each such region.
        # contiguous_regions() stands in for the private helper
        # __contiguousRegions() used by analEntity() below.
        for start, stop in contiguous_regions(condition):
            segment = x[start:stop]
            print(start, stop)
            print(segment.min(), segment.max())

    import numpy as np

    # dtype=object is required for ragged (unequal-length) rows.
    Samples = np.array([[1, 2, 3],
                        [1, 2]], dtype=object)
    c = np.hstack(Samples)  # gives [1, 2, 3, 1, 2]
    mean, std = np.mean(c), np.std(c)
    newSamples = np.asarray([(np.array(xi) - mean) / std for xi in Samples], dtype=object)
    print(newSamples)

    """
    def __init__(self, cfgOb):
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"
        #
        self.__seqCache = {}
        self.__mU = MarshalUtil()
        #

    def getEntryInfo(self, **kwargs):
        """Return a dictionary of PDB entries satifying the input conditions (e.g. method, resolution limit)"""

        resLimit = kwargs.get("resLimit", 3.5)
        expMethod = kwargs.get("expMethod", "X-ray")
        #
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        #
        entryD = {}
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d",
                                dbName, collectionName,
                                mg.count(dbName, collectionName))
                    qD = {
                        "rcsb_entry_info.experimental_method": expMethod,
                        "refine.0.ls_d_res_high": {
                            "$lte": resLimit
                        }
                    }
                    selectL = [
                        "rcsb_entry_container_identifiers", "rcsb_entry_info",
                        "refine"
                    ]
                    dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                    logger.info("Selection %r fetch result count %d", selectL,
                                len(dL))
                    #
                    for dV in dL:
                        if "rcsb_entry_container_identifiers" not in dV:
                            continue
                        entryId = dV["rcsb_entry_container_identifiers"][
                            "entry_id"]
                        entryD[entryId] = {}
                        if "rcsb_entry_info" in dV and "polymer_composition" in dV[
                                "rcsb_entry_info"]:
                            entryD[entryId] = {
                                "polymer_composition":
                                dV["rcsb_entry_info"]["polymer_composition"],
                                "experimental_method":
                                dV["rcsb_entry_info"]["experimental_method"],
                            }
                        if "refine" in dV and dV[
                                "refine"] and "ls_d_res_high" in dV["refine"][
                                    0]:
                            entryD[entryId]["ls_d_res_high"] = dV["refine"][0][
                                "ls_d_res_high"]
                            logger.debug("Got res %r",
                                         dV["refine"][0]["ls_d_res_high"])

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD
        #
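    # Illustration (added note): the selection in getEntryInfo() above restated
    # as a raw pymongo query, assuming a reachable MongoDB instance; the
    # connection details are placeholders.
    #   from pymongo import MongoClient
    #   client = MongoClient("mongodb://localhost:27017")
    #   qD = {"rcsb_entry_info.experimental_method": "X-ray",
    #         "refine.0.ls_d_res_high": {"$lte": 3.5}}
    #   docs = client["pdbx_core"]["pdbx_core_entry"].find(qD, ["rcsb_entry_info"])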

    def getEntityIds(self, entryIdList):
        """ """
        dbName = "pdbx_core"
        collectionName = "pdbx_core_polymer_entity"
        docD = {}
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d",
                                dbName, collectionName,
                                mg.count(dbName, collectionName))
                    for entryId in entryIdList:
                        qD = {
                            "rcsb_polymer_entity_container_identifiers.entry_id":
                            entryId
                        }
                        selectL = ["rcsb_polymer_entity_container_identifiers"]
                        tL = mg.fetch(dbName,
                                      collectionName,
                                      selectL,
                                      queryD=qD)
                        #
                        logger.debug("Selection %r fetch result count %d",
                                     selectL, len(tL))
                        docD[entryId] = [
                            vv["rcsb_polymer_entity_container_identifiers"]
                            for vv in tL
                        ]
            logger.debug("docD is %r", docD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return docD

    def getPolymerEntities(self, entryD, **kwargs):
        """Add 'selected_polymer_entities' satisfying the input contiditions and add this to the input entry dictionary."""
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName",
                                    "pdbx_core_polymer_entity")
        resultKey = kwargs.get("resultKey", "selected_polymer_entities")
        savePath = kwargs.get("savePath", "entry-data.pic")
        entryLimit = kwargs.get("entryLimit", None)
        saveKwargs = kwargs.get("saveKwargs", {"fmt": "pickle"})
        #
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d",
                                dbName, collectionName,
                                mg.count(dbName, collectionName))
                    selectL = [
                        "rcsb_polymer_entity_container_identifiers",
                        "entity_poly.type",
                        "entity_poly.pdbx_seq_one_letter_code_can",
                        "rcsb_entity_source_organism.ncbi_taxonomy_id",
                        "rcsb_entity_source_organism.ncbi_scientific_name",
                        "struct_ref.pdbx_seq_one_letter_code",
                        "struct_ref.pdbx_db_accession",
                        "struct_ref.db_name",
                        "struct_ref.entity_id",
                    ]
                    iCount = 0
                    for entryId in entryD:
                        #
                        if resultKey in entryD[entryId]:
                            continue
                        #
                        qD = {
                            "rcsb_polymer_entity_container_identifiers.entry_id":
                            entryId,
                            "entity_poly.rcsb_entity_polymer_type": "Protein",
                            "entity.rcsb_multiple_source_flag": "N",
                        }
                        #
                        dL = mg.fetch(dbName,
                                      collectionName,
                                      selectL,
                                      queryD=qD)
                        logger.debug("%s query %r fetch result count %d",
                                     entryId, qD, len(dL))
                        eD = {}
                        for ii, dV in enumerate(dL, 1):
                            rD = {}
                            logger.debug("%s (%4d) d is %r", entryId, ii, dV)
                            if "rcsb_polymer_entity_container_identifiers" in dV and "asym_ids" in dV[
                                    "rcsb_polymer_entity_container_identifiers"]:
                                rD["asym_ids"] = dV[
                                    "rcsb_polymer_entity_container_identifiers"][
                                        "asym_ids"]
                                rD["entity_id"] = dV[
                                    "rcsb_polymer_entity_container_identifiers"][
                                        "entity_id"]
                            if "entity_poly" in dV and "type" in dV[
                                    "entity_poly"]:
                                rD["type"] = dV["entity_poly"]["type"]
                                rD["seq_one_letter_code_can"] = dV[
                                    "entity_poly"][
                                        "pdbx_seq_one_letter_code_can"]

                            if "rcsb_entity_source_organism" in dV:
                                rD["ncbi_taxonomy_id"] = dV[
                                    "rcsb_entity_source_organism"][0][
                                        "ncbi_taxonomy_id"] if "ncbi_taxonomy_id" in dV[
                                            "rcsb_entity_source_organism"][
                                                0] else None
                                rD["ncbi_scientific_name"] = (
                                    dV["rcsb_entity_source_organism"][0]
                                    ["ncbi_scientific_name"]
                                    if "ncbi_scientific_name"
                                    in dV["rcsb_entity_source_organism"][0]
                                    else None)

                            if "struct_ref" in dV and len(
                                    dV["struct_ref"]) == 1:
                                rD["seq_one_letter_code_ref"] = dV["struct_ref"][
                                    0]["pdbx_seq_one_letter_code"] if "pdbx_seq_one_letter_code" in dV[
                                        "struct_ref"][0] else None
                                rD["db_accession"] = dV["struct_ref"][0][
                                    "pdbx_db_accession"] if "pdbx_db_accession" in dV[
                                        "struct_ref"][0] else None
                                rD["db_name"] = dV["struct_ref"][0][
                                    "db_name"] if "db_name" in dV[
                                        "struct_ref"][0] else None
                                #
                                refDbName = rD["db_name"]
                                dbAccession = rD["db_accession"]
                                dbRefSeq = self.__seqCache[
                                    dbAccession] if dbAccession in self.__seqCache else None

                                if refDbName in ["UNP"] and not dbRefSeq:
                                    dbRefSeq = self.__fetchUniprot(dbAccession)
                                    self.__seqCache[dbAccession] = dbRefSeq
                                    logger.debug("Fetch uniprot %r", dbRefSeq)
                                rD["ref_db_seq"] = dbRefSeq
                            else:
                                rD["seq_one_letter_code_ref"] = rD[
                                    "db_accession"] = rD["db_name"] = None
                            #
                            if "entity_id" in rD:
                                eD[rD["entity_id"]] = copy.copy(rD)

                        entryD[entryId][resultKey] = copy.copy(eD)

                        iCount += 1
                        if iCount % 10 == 0:
                            logger.info(
                                "Completed polymer entities fetch %d/%d entries",
                                iCount, len(entryD))
                        if iCount % 2000 == 0:
                            ok = self.__mU.doExport(savePath, entryD,
                                                    **saveKwargs)
                            logger.info(
                                "Saved polymer entity results (%d) status %r in %s",
                                iCount, ok, savePath)
                        if entryLimit and iCount >= entryLimit:
                            logger.info("Quitting after %d", iCount)
                            break
            #
            # for entryId in entryD:
            #    logger.debug(">>  %s docD  %r" % (entryId, entryD[entryId]))
            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
            logger.info(
                "Saved polymer entity results (%d) entries %d status %r in %s",
                iCount, len(entryD), ok, savePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD

    def getEntityInstances(self, entryD, **kwargs):
        """Get the selected validation data for the instances in the input entry dictionary.

        entryD[entryId]['selected_polymer_entities'][entityId]['validation'] = {}

        Add keys: 'pdbx_vrpt_instance_results'  and  'pdbx_unobs_or_zero_occ_residues' to the validation dictionary above.

        Args:
            entryD (dict): entry dictionary keyed by entry identifier
            **kwargs: dbName, collectionName, savePath, saveKwargs, and entryLimit options

        Returns:
            entryD (dict): updated entry dictionary
        """
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName",
                                    "pdbx_core_polymer_entity_instance")
        savePath = kwargs.get("savePath", "entry-data.pic")
        saveKwargs = kwargs.get("saveKwargs", {"fmt": "pickle"})
        entryLimit = kwargs.get("entryLimit", None)
        #
        try:
            optF = False
            iCount = 0
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s total document count is %d",
                                dbName, collectionName,
                                mg.count(dbName, collectionName))
                    #
                    for entryId, dV in entryD.items():
                        for entityId, peD in dV[
                                "selected_polymer_entities"].items():
                            # if 'anal_instances' in peD:
                            #    continue
                            vD = {}
                            for asymId in peD["asym_ids"]:
                                qD = {
                                    "rcsb_polymer_entity_instance_container_identifiers.entry_id":
                                    entryId,
                                    "rcsb_polymer_entity_instance_container_identifiers.asym_id":
                                    asymId,
                                }
                                # qD = {'rcsb_entity_instance_container_validation_identifiers.entity_type': 'polymer'}
                                # selectL = ['pdbx_vrpt_instance_results', 'pdbx_unobs_or_zero_occ_residues']
                                selectL = ["pdbx_vrpt_instance_results"]
                                tL = mg.fetch(dbName,
                                              collectionName,
                                              selectL,
                                              queryD=qD)
                                dV = {}  # note: rebinds dV, shadowing the outer entry document
                                if not tL:
                                    logger.info(
                                        "No validation data for %s %s %s(%s)",
                                        dbName, collectionName, entryId,
                                        asymId)
                                    continue
                                #
                                logger.debug(
                                    ">>> %s %s (%s) dict key length %d ",
                                    collectionName, entryId, asymId,
                                    len(tL[0]))

                                #
                                if optF:
                                    dV["pdbx_vrpt_instance_results"] = tL[0][
                                        "pdbx_vrpt_instance_results"] if "pdbx_vrpt_instance_results" in tL[
                                            0] else []
                                    dV["pdbx_unobs_or_zero_occ_residues"] = tL[0][
                                        "pdbx_unobs_or_zero_occ_residues"] if "pdbx_unobs_or_zero_occ_residues" in tL[
                                            0] else []
                                #
                                if optF:
                                    urdL = tL[0][
                                        "pdbx_unobs_or_zero_occ_residues"] if "pdbx_unobs_or_zero_occ_residues" in tL[
                                            0] else []
                                    oL = [{
                                        "label_seq_id": urd["label_seq_id"],
                                        "label_comp_id": urd["label_comp_id"]
                                    } for urd in urdL]
                                    dV["pdbx_unobs_or_zero_occ_residues"] = oL
                                #
                                try:
                                    irdL = tL[0][
                                        "pdbx_vrpt_instance_results"] if "pdbx_vrpt_instance_results" in tL[
                                            0] else []
                                    oL = [{
                                        "label_seq_id": ird["label_seq_id"],
                                        "label_comp_id": ird["label_comp_id"]
                                    } for ird in irdL]
                                    dV["pdbx_vrpt_instance_results_seq"] = oL
                                except Exception as e:
                                    logger.error(
                                        "Failing with entryId %s entityId %s asymId %s bad validation data %s",
                                        entryId, entityId, asymId, str(e))

                                #
                                try:
                                    irdL = tL[0][
                                        "pdbx_vrpt_instance_results"] if "pdbx_vrpt_instance_results" in tL[
                                            0] else []
                                    oL = [{
                                        "OWAB": ird["OWAB"],
                                        "label_seq_id": ird["label_seq_id"],
                                        "label_comp_id": ird["label_comp_id"]
                                    } for ird in irdL]
                                    dV["pdbx_vrpt_instance_results_occ"] = oL
                                except Exception as e:
                                    logger.debug(
                                        "Failing with entryId %s entityId %s asymId %s bad validation data %s",
                                        entryId, entityId, asymId, str(e))

                                vD[asymId] = copy.copy(dV)
                                #
                            analD = self.analEntity(entryId, peD, vD)
                            entryD[entryId]["selected_polymer_entities"][
                                entityId]["anal_instances"] = copy.copy(analD)
                        iCount += 1
                        if iCount % 500 == 0:
                            logger.info("Completed %d/%d entries", iCount,
                                        len(entryD))
                        if iCount % 2000 == 0:
                            ok = self.__mU.doExport(savePath, entryD,
                                                    **saveKwargs)
                            logger.info(
                                "Saved polymer entity instance results (%d) status %r in %s",
                                iCount, ok, savePath)
                        if entryLimit and iCount >= entryLimit:
                            break
            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
            logger.info(
                "Saved polymer instance results (%d) entries %d status %r in %s",
                iCount, len(entryD), ok, savePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD

    def analEntity(self, entryId, entityD, vD, **kwargs):
        """

        {'polymer_composition': 'protein/NA', 'experimental_method': 'X-ray',
        'selected_polymer_entities': {'1': {'asym_ids': ['D', 'C', 'E', 'A', 'B', 'F'],
                   'entity_id': '1', 'type': 'polypeptide(L)',
                   'seq_one_letter_code_can': 'MAKGQSLQDPFLNALRRERVPVSIYLVNGIKLQGQIESFDQFVILLKNTVSQMVYKHAISTVVPS',
                   'ncbi_taxonomy_id': 511693,
                    'ncbi_scientific_name': 'Escherichia coli BL21',
                    'seq_one_letter_code_ref': 'MAKGQSLQDPFLNALRRERVPVSIYLVNGIKLQGQIESFDQFVILLKNTVSQMVYKHAISTVVPS',
                    'db_accession': 'C5W5L7',
                    'db_name': 'UNP',
                    'validation': {'D': {'pdbx_vrpt_instance_results': [{'OWAB': 29.45, 'label_seq_id': 5, 'label_comp_id': 'GLN'},
                                                                            {'OWAB': 26.12, 'label_seq_id': 6, 'label_comp_id': 'SER'},
                                                                            {'OWAB': 22.72, 'label_seq_id': 7, 'label_comp_id': 'LEU'},
                                                                            {'OWAB': 14.56, 'label_seq_id': 8, 'label_comp_id': 'GLN'},
                                                                            {'OWAB': 19.18, 'label_seq_id': 9, 'label_comp_id': 'ASP'},
                                                                            {'OWAB': 16.56, 'label_seq_id': 10, 'label_comp_id': 'PRO'},
                                                                            {'OWAB': 14.78, 'label_seq_id': 11, 'label_comp_id': 'PHE'},
                                                                            {'OWAB': 11.2, 'label_seq_id': 12, 'label_comp_id': 'LEU'}, ...],

                                        'pdbx_unobs_or_zero_occ_residues': [{'label_seq_id': 1, 'label_comp_id': 'MET'},
                                               {'label_seq_id': 2, 'label_comp_id': 'ALA'},
                                                {'label_seq_id': 3, 'label_comp_id': 'LYS'},
                                                 {'label_seq_id': 4, 'label_comp_id': 'GLY'}]}

        """
        _ = kwargs
        analD = {}
        try:
            entityId = entityD["entity_id"]
            asymIdL = entityD["asym_ids"]

            refSeq = entityD.get("seq_one_letter_code_ref")
            entitySeq = entityD.get("seq_one_letter_code_can")
            # -------
            # Get UniProt reference sequence details
            #
            dbName = entityD.get("db_name")
            dbAccession = entityD.get("db_accession")
            dbRefSeq = entityD.get("ref_db_seq")
            # --
            if dbRefSeq:
                logger.debug("%s (%s) ref db %4d:  %r", dbAccession, dbName,
                             len(dbRefSeq), dbRefSeq)
            if refSeq:
                logger.debug("%s (%s) seq ref pdb %4d:  %r", dbAccession,
                             dbName, len(refSeq), refSeq)
            if entitySeq:
                logger.debug("%s (%s) entity sample %4d:  %r", dbAccession,
                             dbName, len(entitySeq), entitySeq)
            #
            lenRefDbSeq = len(dbRefSeq) if dbRefSeq else None
            lenEntitySeq = len(entitySeq)
            # sampleSeqCov = 1.0 - float(lenRefDbSeq - lenEntitySeq) / float(lenRefDbSeq) if lenRefDbSeq else None
            #

            # -
            for asymId in asymIdL:
                if asymId not in vD:
                    logger.error("Missing validation data for %s %s %s",
                                 entryId, entityId, asymId)
                    continue
                #
                irDL = vD[asymId].get("pdbx_vrpt_instance_results_seq", [])
                # Sort the unique modeled positions to preserve residue order
                # for the segment and gap calculations below
                lsL = sorted({dV["label_seq_id"] for dV in irDL})
                lenInstanceSeq = len(lsL)

                instRefDbSeqCov = 1.0 - float(
                    lenRefDbSeq - lenInstanceSeq) / float(
                        lenRefDbSeq) if lenRefDbSeq else None
                instSampleSeqCov = 1.0 - float(
                    lenEntitySeq - lenInstanceSeq) / float(lenEntitySeq)
                #
                occDL = vD[asymId].get("pdbx_vrpt_instance_results_occ", [])
                # Average the OWAB values for each residue position
                owabRegD = {}
                if occDL:
                    owabD = {}
                    for dV in occDL:
                        owabD.setdefault(dV["label_seq_id"],
                                         []).append(dV["OWAB"])
                    #
                    # logger.info("owabD %r" % owabD)
                    meanOwabD = {k: mean(v) for k, v in owabD.items()}
                    meanOwab = mean(meanOwabD.values())
                    stdevOwab = stdev(meanOwabD.values())
                    #
                    logger.debug(
                        ">> Length of B values list %d mean %.3f stdev %.3f",
                        len(meanOwabD), meanOwab, stdevOwab)
                    #
                    meanOwabA = np.array(list(meanOwabD.values()))
                    #
                    # Flag residues whose mean OWAB exceeds twice the chain mean
                    condition = meanOwabA > (2.0 * meanOwab)
                    regL = self.__contiguousRegions(condition)
                    for ii, (start, stop) in enumerate(regL, 1):
                        segment = meanOwabA[start:stop]
                        logger.debug(
                            "B value range =  start %d stop %d min %.3f max %.3f",
                            start, stop, segment.min(), segment.max())
                        owabRegD[ii] = {
                            # half-open interval [start, stop)
                            "length": stop - start,
                            "occ_min": segment.min(),
                            "occ_max": segment.max()
                        }


                #
                # Group the modeled residue positions into contiguous segments
                segL = [
                    list(map(itemgetter(1), g))
                    for _, g in groupby(enumerate(lsL), lambda x: x[0] - x[1])
                ]
                logger.debug("Modeled sequence length %d segments %d",
                             len(lsL), len(segL))
                #
                gapD = {}
                for ii in range(1, len(segL)):
                    bG = segL[ii - 1][-1]
                    eG = segL[ii][0]
                    gapD[ii] = eG - bG - 1
                    logger.debug("Gap %d length %d", ii, gapD[ii])
                #
                #
                if instRefDbSeqCov:
                    logger.debug(
                        "Summary %s %s %s refCov %.2f sampleCov %.2f - gaps (%d) %r owab segments (%d) %r",
                        entryId,
                        entityId,
                        asymId,
                        instRefDbSeqCov,
                        instSampleSeqCov,
                        len(gapD),
                        list(gapD.values()),
                        len(owabRegD),
                        list(owabRegD.values()),
                    )
                else:
                    logger.debug(
                        "Summary %s %s %s sampleCov %.2f - gaps (%d) %r owab segments (%d) %r",
                        entryId,
                        entityId,
                        asymId,
                        instSampleSeqCov,
                        len(gapD),
                        list(gapD.values()),
                        len(owabRegD),
                        list(owabRegD.values()),
                    )
                #
                analD[asymId] = {
                    "coverage_inst_refdb": instRefDbSeqCov,
                    "coverage_inst_entity": instSampleSeqCov,
                    "gapD": copy.copy(gapD),
                    "owabRegiond": copy.copy(owabRegD)
                }
                logger.debug("entry %s entity %s analD %r", entryId, entityId,
                             analD)
        except Exception as e:
            logger.exception("%s failing with %s", entryId, str(e))
        #
        return analD
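
    # Gap computation sketch (illustrative values, not from the original source):
    # for modeled label_seq_ids [2, 3, 4, 8, 9, 15] the contiguous segments are
    # [2-4], [8-9], and [15], giving inter-segment gaps of 3 (positions 5-7)
    # and 5 (positions 10-14), i.e. gapD = {1: 3, 2: 5}.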

    def __getSegments(self, values):
        """Log the start/stop indices and extrema of regions where the absolute
        value of the input series is below 1 (diagnostic helper)."""
        xV = np.asarray(values)
        condition = np.abs(xV) < 1
        for start, stop in self.__contiguousRegions(condition):
            segment = xV[start:stop]
            logger.debug("start %d stop %d min %.3f max %.3f", start, stop,
                         segment.min(), segment.max())

    def __contiguousRegions(self, condition):
        """Find contiguous True regions of the boolean array "condition".

        Returns a 2D array where the first column is the start index of the region and the
        second column is the end index (half-open).

        """

        # Find the indices of changes in "condition"; cast to int first because
        # modern numpy does not support subtracting boolean arrays
        dV = np.diff(condition.astype(np.int8))
        (idx, ) = dV.nonzero()

        # We need to start things after the change in "condition". Therefore,
        # we'll shift the index by 1 to the right.
        idx += 1

        if condition[0]:
            # If the start of condition is True prepend a 0
            idx = np.r_[0, idx]

        if condition[-1]:
            # If the end of condition is True, append the length of the array
            idx = np.r_[idx, condition.size]

        # Reshape the result into two columns
        idx.shape = (-1, 2)
        return idx

    def __window(self, seq, num=2):
        """Returns a sliding window (of width n) over data from the iterable
        s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...
        """
        it = iter(seq)
        result = tuple(islice(it, num))
        if len(result) == num:
            yield result
        for elem in it:
            result = result[1:] + (elem, )
            yield result

    def missingElements(self, lV):
        missing = chain.from_iterable(
            range(x + 1, y) for x, y in self.__window(lV) if (y - x) > 1)
        return list(missing)
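
    # A doctest-style sketch of the two helpers above (illustrative, not executed):
    #   list(self.__window([3, 4, 7, 8, 11]))   -> [(3, 4), (4, 7), (7, 8), (8, 11)]
    #   self.missingElements([3, 4, 7, 8, 11])  -> [5, 6, 9, 10]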

    def __fetchUniprot(self, uniProtId):
        baseUrl = "http://www.uniprot.org"
        wsEndPoint = "/uniprot/"
        fS = ""
        try:
            fullUrl = baseUrl + wsEndPoint + uniProtId + ".fasta"
            result = requests.get(fullUrl)
            if result.ok:
                fL = result.text.split("\n")
                fS = "".join(fL[1:])
            else:
                logger.error("UniProt Fasta request for %s returns status %r",
                             uniProtId, result.status_code)
        except Exception as e:
            logger.error("Failing request for %s with %s", uniProtId, str(e))
        return fS
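
# A self-contained sketch of the contiguous-region technique used by
# __contiguousRegions above; the function name here is illustrative and only
# numpy is assumed.
import numpy as np

def contiguous_regions(condition):
    """Return half-open (start, stop) index pairs for runs of True in condition."""
    dV = np.diff(condition.astype(np.int8))  # int cast avoids boolean subtraction
    (idx,) = dV.nonzero()
    idx += 1  # shift to the element after each change
    if condition[0]:
        idx = np.r_[0, idx]  # run starts at the first element
    if condition[-1]:
        idx = np.r_[idx, condition.size]  # run extends through the last element
    return idx.reshape(-1, 2)

# Runs of True in [F, T, T, F, T] -> [[1, 3], [4, 5]]
print(contiguous_regions(np.array([False, True, True, False, True])))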
Example #14
class ChEMBLTargetMechanismProvider(StashableBase):
    """Accessors for ChEMBL target mechanism data."""
    def __init__(self, cachePath, useCache):
        #
        self.__cachePath = cachePath
        self.__dirName = "ChEMBL-target-mechanism"
        super(ChEMBLTargetMechanismProvider,
              self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        baseVersion = 28
        self.__version = baseVersion
        logger.info("ChEMBL API MAX_LIMIT %r", Settings.Instance().MAX_LIMIT)  # pylint: disable=no-member
        self.__aD = self.__reload(self.__dirPath, useCache)

    def testCache(self, minCount=0):
        if minCount == 0:
            return True
        if self.__aD and (len(self.__aD) > minCount):
            logger.info("Mechanism data for (%d) targets", len(self.__aD))
            return True
        return False

    def getAssignmentVersion(self):
        return self.__version

    def getTargetMechanismDataPath(self):
        return os.path.join(self.__dirPath,
                            "chembl-target-mechanism-data.json")

    def __reload(self, dirPath, useCache):
        startTime = time.time()
        aD = {}
        fU = FileUtil()
        fU.mkdir(dirPath)
        targetMechanismFilePath = self.getTargetMechanismDataPath()
        #
        if useCache and fU.exists(targetMechanismFilePath):
            logger.info("useCache %r using %r", useCache,
                        targetMechanismFilePath)
            qD = self.__mU.doImport(targetMechanismFilePath, fmt="json")
            aD = qD["mechanism"] if "mechanism" in qD else {}
        #
        logger.info("Completed reload of (%d) at %s (%.4f seconds)", len(aD),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    time.time() - startTime)
        #
        return aD

    def getTargetMechanisms(self, targetChEMBLId):
        try:
            return self.__aD.get(targetChEMBLId, [])
        except Exception:
            return []

    def hasTargetMechanism(self, targetChEMBLId):
        try:
            return targetChEMBLId in self.__aD
        except Exception:
            return False

    def fetchTargetMechanismData(self,
                                 targetChEMBLIdList,
                                 skipExisting=True,
                                 chunkSize=50):
        """Get cofactor mechanism data for the input ChEMBL target list.

        Args:
            targetChEMBLIdList (list): list of ChEMBL target identifiers
            skipExisting (bool, optional): reuse any existing cached data (default: True)
            chunkSize(int, optional): ChEMBL API batch size for fetches (default: 50)

        Returns:
          bool:  True for success or False otherwise

        """
        atL = [
            "action_type",
            "molecule_chembl_id",
            "mechanism_of_action",
            "max_phase",
            "target_chembl_id",
        ]
        targetD = self.__aD if self.__aD else {}
        if skipExisting:
            idList = [tId for tId in targetChEMBLIdList if tId not in self.__aD]
        else:
            idList = targetChEMBLIdList

        numToProcess = len(idList)
        logger.info("Fetching mechanism data for (%d/%d)", numToProcess,
                    len(targetChEMBLIdList))
        ok = False
        try:
            for ii in range(0, len(idList), chunkSize):
                logger.info("Begin chunk at ii %d/%d", ii, numToProcess)
                mch = new_client.mechanism  # pylint: disable=no-member
                mch.set_format("json")
                mDL = mch.filter(
                    target_chembl_id__in=idList[ii:ii + chunkSize]).only(atL)

                logger.info("Results (%d)", len(mDL))
                if mDL:
                    for mD in mDL:
                        targetD.setdefault(mD["target_chembl_id"], []).append(
                            self.__mechanismSelect(atL, mD))
                #
                logger.info("Completed chunk starting at (%d)", ii)
                tS = datetime.datetime.now().isoformat()
                vS = datetime.datetime.now().strftime("%Y-%m-%d")
                ok = self.__mU.doExport(self.getTargetMechanismDataPath(), {
                    "version": vS,
                    "created": tS,
                    "mechanism": targetD
                },
                                        fmt="json",
                                        indent=3)
                logger.info("Wrote completed chunk starting at (%d) (%r)", ii,
                            ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def __mechanismSelect(self, atL, aD):
        return {at: aD.get(at) for at in atL}
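
# A hedged usage sketch for the provider above; the cache path and ChEMBL
# target identifiers are illustrative, and live ChEMBL API access is assumed.
if __name__ == "__main__":
    ctmP = ChEMBLTargetMechanismProvider(cachePath=".", useCache=True)
    if ctmP.fetchTargetMechanismData(["CHEMBL1987", "CHEMBL3120"]):
        for mD in ctmP.getTargetMechanisms("CHEMBL1987"):
            print(mD["mechanism_of_action"], mD["max_phase"])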
Example #15
class ValidationReportSchemaUtilsTests(unittest.TestCase):
    def setUp(self):
        self.__dirPath = os.path.join(os.path.dirname(TOPDIR), "rcsb",
                                      "mock-data")
        self.__xsdPath = os.path.join(HERE, "test-data",
                                      "wwpdb_validation_v004.xsd")
        self.__dictPath = os.path.join(HERE, "test-output",
                                       "vrpt_mmcif_ext_v4.dic")
        self.__dictStaticPath = os.path.join(HERE, "test-data",
                                             "em_validation_ext_v4.dic")
        #
        # This schema mapping file is used by the XML report data file reader.
        self.__dictionaryMapPath = os.path.join(HERE, "test-output",
                                                "vrpt_dictmap_v4.json")
        self.__dictionaryMapCsvPath = os.path.join(HERE, "test-output",
                                                   "vrpt_dictmap_v4.csv")
        self.__mU = MarshalUtil()

    def tearDown(self):
        pass

    def testProcessXsdSchema(self):
        vrsu = ValidationReportSchemaUtils()
        sObj = vrsu.readSchema(self.__xsdPath, verbose=False)
        logger.debug("Returns type %r", type(sObj))
        logger.debug("Schema category length %d", len(sObj))
        ok = self.__mU.doExport(os.path.join(HERE, "test-output",
                                             "schema-object.json"),
                                sObj,
                                fmt="json",
                                indent=3)
        self.assertTrue(ok)

        # import static definitions -
        scL = self.__mU.doImport(self.__dictStaticPath, fmt="mmcif-dict")
        logger.info("Static definition count %d", len(scL))
        #
        cL = vrsu.buildDictionary(sObj)
        logger.info("Generated definition count %d", len(cL))
        #
        cL.extend(scL)
        ok = self.__mU.doExport(self.__dictPath, cL, fmt="mmcif-dict")
        self.assertTrue(ok)
        #
        dictionaryMap = vrsu.getDictionaryMap(sObj)
        ok = self.__mU.doExport(self.__dictionaryMapPath,
                                dictionaryMap,
                                fmt="json")
        self.assertTrue(ok)
        #
        self.assertTrue("attributes" in dictionaryMap)
        self.assertTrue(len(dictionaryMap["attributes"]) > 420)

    def testExportMapping(self):
        """Export schema correspondences as CSV."""
        vrsu = ValidationReportSchemaUtils()
        sObj = vrsu.readSchema(self.__xsdPath)
        dictionaryMap = vrsu.getDictionaryMap(sObj)
        logger.info("Attribute count %d", len(dictionaryMap["attributes"]))
        rL = []
        for ky, dD in dictionaryMap["attributes"].items():
            kyL = ky.split("|")
            catN = kyL[0]
            atN = kyL[1]
            row = {
                "xml_el": catN,
                "xml_at": atN,
                "mmcif_cat": dD["cat"],
                "mmcif_at": dD["at"]
            }
            rL.append(row)
        #
        ok = self.__mU.doExport(self.__dictionaryMapCsvPath, rL, fmt="csv")
        self.assertTrue(ok)
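
# A minimal sketch of the "category|attribute" key convention unpacked in
# testExportMapping above (the key value is illustrative):
ky = "bond_outliers|atom0"
catN, atN = ky.split("|")[:2]
print(catN, atN)  # -> bond_outliers atom0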
Example #16
    def buildSearchFiles(self, **kwargs):
        """Build cif, sdf (optional), and mol2 files for components in the chemical component search index.
           Exclude ions or other extraneous molecules lacking bonds.

        Args:
            ccUrlTarget (str): locator for source chemical component dictionary (default: full public dictionary)
            birdUrlTarget (str): locator for source BIRD dictionary (default: full public dictionary)
            limitPerceptions (bool): restrict automatic perceptions in OE molecular build operations (default: False)
            numProc (int): number of processors
            useCache (bool): use existing resource file where possible (default: True)
            molLimit (str):  limit the number to ingested chemical compont (default: None)
            quietFlag (bool): suppress output in OE library operations (default: True)

        Returns:
            (int): number molfiles generated
        """
        cachePath = self.__cachePath
        ccUrlTarget = kwargs.get("ccUrlTarget", None)
        birdUrlTarget = kwargs.get("birdUrlTarget", None)
        molLimit = kwargs.get("molLimit", None)
        quietFlag = kwargs.get("quietFlag", True)
        fpTypeList = kwargs.get("fpTypeList", [])
        screenTypeList = kwargs.get("screenTypeList", [])
        ccFileNamePrefix = "cc-%s" % self.__prefix if self.__prefix else "cc-full"
        oeFileNamePrefix = "oe-%s" % self.__prefix if self.__prefix else "oe-cc-full"
        numProc = kwargs.get("numProc", 2)
        minCount = kwargs.get("minCount", 0)
        useCache = kwargs.get("useCache", True)
        useSdf = kwargs.get("useSdf", True)
        useMol2 = kwargs.get("useMol2", False)
        limitPerceptions = kwargs.get("limitPerceptions", False)
        logSizes = False
        #
        startTime = time.time()
        ccmP = ChemCompMoleculeProvider(cachePath=cachePath,
                                        useCache=useCache,
                                        ccFileNamePrefix=ccFileNamePrefix,
                                        ccUrlTarget=ccUrlTarget,
                                        birdUrlTarget=birdUrlTarget,
                                        molLimit=molLimit)
        ok = ccmP.testCache(minCount=minCount, logSizes=logSizes)
        logger.info(
            "Completed chemical component provider load %r (%.4f seconds)", ok,
            time.time() - startTime)
        #
        startTime = time.time()
        oesmp = OeSearchMoleculeProvider(
            ccUrlTarget=ccUrlTarget,
            birdUrlTarget=birdUrlTarget,
            cachePath=cachePath,
            ccFileNamePrefix=ccFileNamePrefix,
            oeFileNamePrefix=oeFileNamePrefix,
            useCache=useCache,
            quietFlag=quietFlag,
            fpTypeList=fpTypeList,
            screenTypeList=screenTypeList,
            numProc=numProc,
            molLimit=molLimit,
            limitPerceptions=limitPerceptions,
        )
        ok = oesmp.testCache()
        logger.info("Completed OE molecule provider load %r (%.4f seconds)",
                    ok,
                    time.time() - startTime)
        #
        startTime = time.time()
        ccSIdxP = ChemCompSearchIndexProvider(
            cachePath=cachePath,
            useCache=useCache,
            ccFileNamePrefix=ccFileNamePrefix,
            limitPerceptions=limitPerceptions,
            numProc=numProc)
        ok = ccSIdxP.testCache()
        logger.info(
            "Completed chemical component search index load %r (%.4f seconds)",
            ok,
            time.time() - startTime)
        #
        ccSIdx = ccSIdxP.getIndex() if ccSIdxP and ok else {}
        logger.info("Search index status %r index length %d", ok, len(ccSIdx))
        #
        ccIdD = {}
        mU = MarshalUtil()
        oeU = OeIoUtils(dirPath=cachePath)
        numMols = 0
        searchFileDirPath = self.getSearchDirFilePath()
        pathTupList = []
        for sId in ccSIdx:
            ccId = sId.split("|")[0]
            # standard CIF definition
            if ccId not in ccIdD:
                cifPath = os.path.join(searchFileDirPath, ccId[0], ccId,
                                       ccId + ".cif")
                if not (useCache and mU.exists(cifPath)):
                    ccMol = ccmP.getMol(ccId)
                    if not self.__checkCif(ccMol):
                        continue
                    mU.doExport(cifPath, [ccMol], fmt="mmcif")
            #
            #
            # Sanity check the generated OE molecule
            #
            oeMol = oesmp.getMol(sId)
            if not self.__checkOeMol(oeMol):
                continue
            cifPath = os.path.join(searchFileDirPath, ccId[0], ccId,
                                   sId + ".cif")
            if sId != ccId and not (useCache and mU.exists(cifPath)):
                oeccU = OeChemCompUtils()
                ok = oeccU.addOeMol(sId,
                                    oeMol,
                                    missingModelXyz=True,
                                    writeIdealXyz=False)
                if ok:
                    oeccU.write(cifPath)

            if useSdf:
                molFilePath = os.path.join(searchFileDirPath, ccId[0], ccId,
                                           sId + ".sdf")
                if not (useCache and mU.exists(molFilePath)):
                    ok = oeU.write(molFilePath,
                                   oeMol,
                                   constantMol=False,
                                   addSdTags=True)
                    if ok:
                        pathTupList.append((sId, molFilePath, "sdf"))
            #
            if useMol2:
                mol2FilePath = os.path.join(searchFileDirPath, ccId[0], ccId,
                                            sId + ".mol2")
                if not (useCache and mU.exists(mol2FilePath)):
                    # Capture the write status so the path is only recorded on success
                    ok = oeU.write(mol2FilePath,
                                   oeMol,
                                   constantMol=False,
                                   addSdTags=True)
                    if ok:
                        pathTupList.append((sId, mol2FilePath, "mol2"))
            numMols += 1
        #
        self.__storePathList(pathTupList)
        return numMols
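
# A hedged invocation sketch for buildSearchFiles(); the enclosing provider
# class is not shown in this excerpt, so "provider" stands in for an instance:
#
#   numMols = provider.buildSearchFiles(numProc=4, useCache=True, molLimit=50)
#   print("Generated %d search molfiles" % numMols)
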
class ChemAxonDescriptorProvider(StashableBase):
    """Utilities to deliver ChemAxon rendered chemical descriptors for chemical component definitions."""
    def __init__(self, **kwargs):
        #
        dirName = "chemaxon"
        # Resolve the cache paths unconditionally so the base class always
        # receives a valid directory
        self.__cachePath = os.path.abspath(kwargs.get("cachePath", "."))
        self.__dirPath = os.path.join(self.__cachePath, dirName)
        super(ChemAxonDescriptorProvider,
              self).__init__(self.__cachePath, [dirName])
        #
        self.__molLimit = kwargs.get("molLimit", 0)
        self.__ccUrlTarget = kwargs.get("ccUrlTarget", None)
        self.__birdUrlTarget = kwargs.get("birdUrlTarget", None)
        useCache = kwargs.get("useCache", True)
        self.__chunkSize = kwargs.get("chunkSize", 100)
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc-full")
        self.__version = None
        self.__descrD = self.__reload(useCache)

    def testCache(self, minCount=None):
        if minCount:
            ok = bool(self.__descrD) and len(self.__descrD) >= minCount
        else:
            ok = self.__descrD is not None
        logger.info(
            "Loaded ChemAxon descriptors for (%d) components (success %r)",
            len(self.__descrD) if self.__descrD else 0, ok)
        return ok

    def getDescriptorIndex(self):
        return self.__descrD

    def getIndexFilePath(self):
        return os.path.join(
            self.__dirPath,
            "%s-chemaxon-descriptors.json" % self.__ccFileNamePrefix)

    def getVersion(self):
        return self.__version

    def __reload(self, useCache):
        """Reload or created Chemaxon descriptor mapping index.

        Args:
            cachePath (str): path to the directory containing cache files
            chunkSize (int, optional): number of SMILES per request. Defaults to 100.

         Returns:
            (dict): chemical component data containers for each indexed chemical component
        """
        #
        descrD = {}
        descrFilePath = self.getIndexFilePath()
        #
        if not (useCache and self.__mU.exists(descrFilePath)):
            url = "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/CHEMAXON/cc-full-chemaxon-descriptors.json"
            _ = self.__fetchUrl(url, self.__dirPath)
        #
        _, fExt = os.path.splitext(descrFilePath)
        descrFormat = "json" if fExt == ".json" else "pickle"
        if self.__mU.exists(descrFilePath):
            dD = self.__mU.doImport(descrFilePath, fmt=descrFormat)
            descrD = dD["smiles"]
            self.__version = dD["version"]
        #
        return descrD

    def __fetchUrl(self, urlTarget, dirPath, useCache=False):
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        filePath = os.path.join(dirPath, fn)
        if not (useCache and fU.exists(filePath)):
            startTime = time.time()
            ok2 = fU.get(urlTarget, filePath)
            endTime = time.time()
            if ok2:
                logger.info(
                    "Fetched %s for resource file %s (status = %r) (%.4f seconds)",
                    urlTarget, filePath, ok2, endTime - startTime)
            else:
                logger.error(
                    "Failing fetch for %s for resource file %s (status = %r) (%.4f seconds)",
                    urlTarget, filePath, ok2, endTime - startTime)
        #
        return filePath

    def buildDescriptors(self):
        descrFilePath = self.getIndexFilePath()
        ccidxP = ChemCompIndexProvider(
            ccUrlTarget=self.__ccUrlTarget,
            birdUrlTarget=self.__birdUrlTarget,
            cachePath=self.__cachePath,
            useCache=True,
            molLimit=self.__molLimit,
            ccFileNamePrefix=self.__ccFileNamePrefix,
        )
        ok = ccidxP.testCache()
        if ok:
            ccIdList = ccidxP.getIdList()
            self.__descrD = self.__fetchDescriptors(ccIdList,
                                                    ccidxP,
                                                    chunkSize=self.__chunkSize)
            tS = datetime.datetime.now().isoformat()
            vS = datetime.datetime.now().strftime("%Y-%m-%d")
            self.__version = vS
            dD = {"created": tS, "version": vS, "smiles": self.__descrD}
            ok = self.__mU.doExport(descrFilePath, dD, fmt="json", indent=3)
            logger.info("Stored %s descriptors for %d components (status=%r) ",
                        descrFilePath, len(self.__descrD), ok)

    def updateDescriptors(self, useCache=True):
        ccidxP = ChemCompIndexProvider(
            ccUrlTarget=self.__ccUrlTarget,
            birdUrlTarget=self.__birdUrlTarget,
            cachePath=self.__cachePath,
            useCache=useCache,
            molLimit=None,
            ccFileNamePrefix=self.__ccFileNamePrefix,
        )
        ok = ccidxP.testCache()
        if ok:
            ccIdList = ccidxP.getIdList()
            curIdList = list(self.__descrD.keys())
            updIdList = list(set(ccIdList) - set(curIdList))
            if updIdList:
                logger.info(
                    "Updating Chemaxon descriptors for (%d) components",
                    len(updIdList))
                uD = self.__fetchDescriptors(updIdList,
                                             ccidxP,
                                             chunkSize=self.__chunkSize)
                self.__descrD.update(uD)
                descrFilePath = self.getIndexFilePath()
                tS = datetime.datetime.now().isoformat()
                vS = datetime.datetime.now().strftime("%Y-%m-%d")
                self.__version = vS
                dD = {"created": tS, "version": vS, "smiles": self.__descrD}
                ok = self.__mU.doExport(descrFilePath,
                                        dD,
                                        fmt="json",
                                        indent=3)
        #
        return ok

    def __fetchDescriptors(self, ccIdList, ccidxP, chunkSize=100):
        """Fetch transformed SMILES descriptors from the ChemAxon webservice.

            Args:
                ccIdList (list, str): chemical component identifier list
                ccidxP (object): instance of the ChemCompIndexProvider()
                chunksize (int, optional): number of SMILES per request. Defaults to 100.

            Returns:
                (dict): dictionary {<ccId>: [<transformed SMILES>, ...], ...}

        Example API parameter data:
                            {
                            "errorHandlingMode": "FAIL_ON_ERROR",
                            "inputParams": "smiles",
                            "outputParams": "smiles",
                            "structures": [
                                "CC(C)[C@H](N)C=O",
                                "CC[C@H](C)[C@H](N)C=O",
                                "CC(C)C[C@H](N)C=O"
                            ]
                            }

        Example query:
        curl -X POST "https://jchem-microservices.chemaxon.com/jwsio/rest-v1/molconvert/batch" -H "accept: */*"
               -H "Content-Type: application/json" -d "{ \"errorHandlingMode\": \"FAIL_ON_ERROR\", \"inputParams\": \"smiles\",
               \"outputParams\": \"mrv\", \"structures\": [ \"CC(C)[C@H](N)C=O\", \"CC[C@H](C)[C@H](N)C=O\", \"CC(C)C[C@H](N)C=O\" ]}"
        """
        descrD = {}
        smilesCcIdD = {}
        smilesD = {}
        for ccId in ccIdList:
            smiL = list(
                set(
                    ccidxP.getSMILES(ccId,
                                     smiTypeList=[
                                         "oe-iso-smiles", "oe-smiles",
                                         "cactvs-iso-smiles", "cactvs-smiles"
                                     ])))
            smilesCcIdD.setdefault(ccId, []).extend(smiL)
            for smi in smiL:
                smilesD.setdefault(smi, []).append(ccId)
        #
        logger.info("Translating (%d) SMILES for components (%d)",
                    len(smilesD), len(smilesCcIdD))
        # ----
        smiKeyL = list(smilesD.keys())
        smiLL = [
            smiKeyL[i:i + chunkSize]
            for i in range(0, len(smiKeyL), chunkSize)
        ]
        # ---
        baseUrl = "https://jchem-microservices.chemaxon.com"
        endPoint = "jwsio/rest-v1/molconvert/batch"
        # hL = [("Accept", "application/json"), ("Content-Type", "application/json")]
        hD = {"Accept": "application/json", "Content-Type": "application/json"}
        try:
            pD = {
                "errorHandlingMode": "SKIP_ERROR",
                "inputParams": "smiles",
                "outputParams": "smiles"
            }
            #
            iCount = 0
            for smiL in smiLL:
                iCount += 1
                ureq = UrlRequestUtil()
                pD["structures"] = smiL
                logger.debug("pD %r", pD)
                rDL, retCode = ureq.postUnWrapped(
                    baseUrl,
                    endPoint,
                    pD,
                    headers=hD,
                    sendContentType="application/json",
                    returnContentType="application/json")
                logger.debug("API result (%r) %r", retCode, rDL)
                if rDL and len(rDL) == len(smiL):
                    for ii, rD in enumerate(rDL):
                        if "structure" in rD and rD.get("successful"):
                            if smiL[ii] == rD["structure"]:
                                continue
                            for ccId in smilesD[smiL[ii]]:
                                if ccId in descrD and rD["structure"] in descrD[ccId]:
                                    continue
                                if rD["structure"] in smilesCcIdD[ccId]:
                                    continue
                                descrD.setdefault(ccId, []).append(rD["structure"])
                else:
                    logger.info("Chunk %d failed (%d)", iCount,
                                len(rDL) if rDL else 0)
                if iCount % 10 == 0:
                    logger.info("Completed processing chunk (%d/%d)", iCount,
                                len(smiLL))

            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return descrD
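
# The chunking idiom used by __fetchDescriptors generalizes; a minimal
# standard-library sketch (the function name is illustrative):
def chunked(items, chunkSize):
    """Yield successive chunkSize-length slices of a list."""
    for i in range(0, len(items), chunkSize):
        yield items[i:i + chunkSize]

print(list(chunked(list("abcdefg"), 3)))  # -> [['a', 'b', 'c'], ['d', 'e', 'f'], ['g']]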
class ChEMBLTargetProviderTests(unittest.TestCase):
    skipFull = True

    def setUp(self):
        self.__cachePath = os.path.join(HERE, "test-output", "CACHE")
        self.__fastaPath = os.path.join(HERE, "test-output",
                                        "chembl-targets.fa")
        self.__taxonPath = os.path.join(HERE, "test-output",
                                        "chembl-targets-taxon.tdd")
        self.__dataPath = os.path.join(HERE, "test-data")
        self.__mU = MarshalUtil(workPath=self.__cachePath)

    def tearDown(self):
        pass

    def testFetchChEMBLTargets(self):
        try:
            ctP = ChEMBLTargetProvider(cachePath=self.__cachePath,
                                       useCache=False)
            ok = ctP.testCache()
            self.assertTrue(ok)
            ok = ctP.exportFasta(self.__fastaPath,
                                 self.__taxonPath,
                                 addTaxonomy=False)
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testFetchActivityData(self):
        try:
            logger.info("MAX_LIMIT %r", Settings.Instance().MAX_LIMIT)  # pylint: disable=no-member
            ctP = ChEMBLTargetProvider(cachePath=self.__cachePath,
                                       useCache=True)
            ok = ctP.testCache()
            self.assertTrue(ok)
            # P43088|CHEMBL1987|9606
            # P08243|uniprotId|CHEMBL3120|chemblId|9606|taxId
            tL = ["CHEMBL1987", "CHEMBL3120"]
            targetD = ctP.getActivityData(tL)
            ok = self.__mU.doExport(os.path.join(
                self.__cachePath, "ChEMBL-targets",
                "chembl-target-activity.json"),
                                    targetD,
                                    fmt="json",
                                    indent=3)
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testFetchMechanismData(self):
        oD = {}
        try:
            logger.info("MAX_LIMIT %r", Settings.Instance().MAX_LIMIT)  # pylint: disable=no-member
            ctP = ChEMBLTargetProvider(cachePath=self.__cachePath,
                                       useCache=True)
            ok = ctP.testCache()
            self.assertTrue(ok)
            # P43088|CHEMBL1987|9606
            # P08243|uniprotId|CHEMBL3120|chemblId|9606|taxId
            tL = ["CHEMBL1987", "CHEMBL3120"]
            oD.update(ctP.getMechanismData(tL))
            #
            ok = self.__mU.doExport(os.path.join(
                self.__cachePath, "ChEMBL-targets",
                "chembl-target-mechanism.json"),
                                    oD,
                                    fmt="json",
                                    indent=3)
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    #
    @unittest.skipIf(skipFull, "Very long test")
    def testFetchChEMBLTargetsWithTax(self):
        try:
            ctP = ChEMBLTargetProvider(cachePath=self.__cachePath,
                                       useCache=True)
            ok = ctP.testCache()
            self.assertTrue(ok)
            ok = ctP.exportFasta(self.__fastaPath,
                                 self.__taxonPath,
                                 addTaxonomy=True)
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
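
# A minimal runner sketch for the tests above (standard unittest machinery;
# the suite selection is illustrative):
def suiteFetchChEMBLTargets():
    suiteSelect = unittest.TestSuite()
    suiteSelect.addTest(ChEMBLTargetProviderTests("testFetchChEMBLTargets"))
    suiteSelect.addTest(ChEMBLTargetProviderTests("testFetchActivityData"))
    return suiteSelect

if __name__ == "__main__":
    unittest.TextTestRunner(verbosity=2).run(suiteFetchChEMBLTargets())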
Example #19
    def __writeModel(self, targetId, targetPath, fitFD, fitXyzMapD, fitAtomUnMappedL, matchObj, modelId, modelPath):
        """Write the chemical component model for the input chemical component Id and associated atom mapping and
        feature details --

            ComponentAtomDetails = namedtuple("ComponentAtomDetails", "index atNo name aType x y z fCharge")
            AlignAtomMap = namedtuple("AlignAtomMap", "refId refAtIdx refAtNo refAtName fitId fitAtIdx fitAtNo fitAtName")
            AlignAtomUnMapped = namedtuple("AlignAtomUnMapped", "fitId fitAtIdx fitAtNo fitAtType fitAtName fitAtFormalCharge x y z fitNeighbors")
        """
        try:
            unMappedTypeD = defaultdict(int)
            hAtomPrefix = "HEX"
            variantType = self.__getBuildVariant(targetId)
            #
            if not self.__testUnMappedProtonation(fitAtomUnMappedL):
                logger.info("Unmapped non-hydrogen atoms target %r model %r unMapped count (%d)", targetId, modelId, len(fitAtomUnMappedL))
                return False, variantType
            # Get atom partners for the unmapped atoms
            fitAtMapD = {}
            for refAtName, fAtTup in fitXyzMapD.items():
                fitAtMapD[fAtTup.atName] = refAtName
            if fitAtomUnMappedL:
                #  Check if neighbors are all mapped
                ok = True
                for fitUnTup in fitAtomUnMappedL:
                    for nAtName in fitUnTup.fitNeighbors:
                        if nAtName not in fitAtMapD:
                            ok = False
                            logger.info("Missing mapped neighbor for %r target %r model %r", nAtName, targetId, modelId)
                            break
                if not ok:
                    return False, variantType
                else:
                    logger.debug("%s match has unmapped protonation", modelId)
                    variantType = "tautomer_protomer"
            #
            #
            kList = ["xyz", "SMILES", "SMILES_STEREO", "InChI", "InChIKey"]
            for k in kList:
                if k not in fitFD:
                    logger.error("Fit feature dictionary for %s missing key %s", targetId, k)
                    return False, variantType
            # ------------
            dataContainer = DataContainer(modelId)
            #
            mU = MarshalUtil(workPath=self.__cachePath)
            myContainerList = mU.doImport(targetPath, fmt="mmcif")
            myContainer = myContainerList[0]
            dbName = myContainer.getName()
            if dbName.upper() != targetId.upper():
                logger.info("mismatch datablock (%r) and targetId (%r)", dbName, targetId)
            cObj = None
            if myContainer.exists("chem_comp"):
                cObj = myContainer.getObj("chem_comp")
            #
            #
            catName = "pdbx_chem_comp_model"
            if not dataContainer.exists(catName):
                dataContainer.append(DataCategory(catName, attributeNameList=["id", "comp_id"]))
            #
            parentId = targetId.split("|")[0]
            wObj = dataContainer.getObj(catName)
            wObj.setValue(modelId, "id", 0)
            wObj.setValue(parentId, "comp_id", 0)
            #
            # --------  ---------
            catName = "pdbx_chem_comp_model_atom"
            if not dataContainer.exists(catName):
                dataContainer.append(
                    DataCategory(catName, attributeNameList=["model_id", "atom_id", "type_symbol", "charge", "model_Cartn_x", "model_Cartn_y", "model_Cartn_z", "ordinal_id"])
                )
            wObj = dataContainer.getObj(catName)
            #
            if myContainer.exists("chem_comp_atom"):
                cObj = myContainer.getObj("chem_comp_atom")
            #
            #  Only write the mapped atoms in case we are missing hydrogens in the mapping
            #
            jj = 0
            for ii in range(cObj.getRowCount()):
                atName = cObj.getValue("atom_id", ii)
                atType = cObj.getValue("type_symbol", ii)
                if atName not in fitXyzMapD:
                    unMappedTypeD[atType] += 1
                    continue
                fitXyz = fitXyzMapD[atName]
                #
                # fCharge = cObj.getValue("charge", ii)
                #
                wObj.setValue(modelId, "model_id", jj)
                wObj.setValue(atName, "atom_id", jj)
                wObj.setValue(atType, "type_symbol", jj)
                #
                wObj.setValue(fitXyz.atFormalCharge, "charge", jj)
                wObj.setValue("%.4f" % fitXyz.x, "model_Cartn_x", jj)
                wObj.setValue("%.4f" % fitXyz.y, "model_Cartn_y", jj)
                wObj.setValue("%.4f" % fitXyz.z, "model_Cartn_z", jj)
                wObj.setValue(jj + 1, "ordinal_id", jj)
                jj += 1
            #
            # Add the unmapped atoms ...
            # AlignAtomUnMapped = namedtuple("AlignAtomUnMapped", "fitId fitAtIdx fitAtNo fitAtType fitAtName fitNeighbors")
            ii = wObj.getRowCount()
            for jj, uTup in enumerate(fitAtomUnMappedL):
                refAtomName = hAtomPrefix + str(jj)
                wObj.setValue(modelId, "model_id", ii)
                wObj.setValue(refAtomName, "atom_id", ii)
                wObj.setValue(uTup.fitAtType, "type_symbol", ii)
                wObj.setValue(uTup.fitAtFormalCharge, "charge", ii)
                wObj.setValue("%.4f" % uTup.x, "model_Cartn_x", ii)
                wObj.setValue("%.4f" % uTup.y, "model_Cartn_y", ii)
                wObj.setValue("%.4f" % uTup.z, "model_Cartn_z", ii)
                wObj.setValue(ii + 1, "ordinal_id", ii)
                # Advance the row index so each unmapped atom gets its own row
                ii += 1
            # --------  ---------
            catName = "pdbx_chem_comp_model_bond"
            if not dataContainer.exists(catName):
                dataContainer.append(DataCategory(catName, attributeNameList=["model_id", "atom_id_1", "atom_id_2", "value_order", "ordinal_id"]))
            wObj = dataContainer.getObj(catName)
            #
            if myContainer.exists("chem_comp_bond"):
                cObj = myContainer.getObj("chem_comp_bond")
            #
            jj = 0
            for ii in range(cObj.getRowCount()):
                at1 = cObj.getValue("atom_id_1", ii)
                if at1 not in fitXyzMapD:
                    continue
                at2 = cObj.getValue("atom_id_2", ii)
                if at2 not in fitXyzMapD:
                    continue
                bType = cObj.getValue("value_order", ii)
                #
                wObj.setValue(modelId, "model_id", jj)
                wObj.setValue(at1, "atom_id_1", jj)
                wObj.setValue(at2, "atom_id_2", jj)
                wObj.setValue(bType, "value_order", jj)
                wObj.setValue(jj + 1, "ordinal_id", jj)
                jj += 1
            #
            ii = wObj.getRowCount()
            for jj, uTup in enumerate(fitAtomUnMappedL):
                at1 = hAtomPrefix + str(jj)
                for nAt in uTup.fitNeighbors:
                    at2 = fitAtMapD[nAt]
                    wObj.setValue(modelId, "model_id", ii)
                    wObj.setValue(at1, "atom_id_1", ii)
                    wObj.setValue(at2, "atom_id_2", ii)
                    wObj.setValue("SING", "value_order", ii)
                    wObj.setValue(ii + 1, "ordinal_id", ii)
                    # Advance the row index for each added bond
                    ii += 1

            # --------  ---------
            catName = "pdbx_chem_comp_model_descriptor"
            if not dataContainer.exists(catName):
                dataContainer.append(DataCategory(catName, attributeNameList=["model_id", "type", "descriptor"]))
            wObj = dataContainer.getObj(catName)
            #
            ii = 0
            wObj.setValue(modelId, "model_id", ii)
            wObj.setValue("SMILES", "type", ii)
            wObj.setValue(fitFD["SMILES"], "descriptor", ii)
            ii += 1
            wObj.setValue(modelId, "model_id", ii)
            wObj.setValue("SMILES_CANONICAL", "type", ii)
            wObj.setValue(fitFD["SMILES_STEREO"], "descriptor", ii)
            ii += 1
            wObj.setValue(modelId, "model_id", ii)
            wObj.setValue("InChI", "type", ii)
            wObj.setValue(fitFD["InChI"], "descriptor", ii)
            ii += 1
            wObj.setValue(modelId, "model_id", ii)
            wObj.setValue("InChIKey", "type", ii)
            wObj.setValue(fitFD["InChIKey"], "descriptor", ii)
            #
            # --------  ---------
            if matchObj.getIdentifier() is not None:
                catName = "pdbx_chem_comp_model_reference"
                if not dataContainer.exists(catName):
                    dataContainer.append(DataCategory(catName, attributeNameList=["model_id", "db_name", "db_code"]))
                wObj = dataContainer.getObj(catName)
                ii = 0
                wObj.setValue(modelId, "model_id", ii)
                wObj.setValue("CSD", "db_name", ii)
                wObj.setValue(matchObj.getIdentifier(), "db_code", ii)
            #
            featureD = {}
            v = matchObj.getRFactor()
            vS = str(v)
            if v is not None and len(vS) > 0:
                featureD["r_factor"] = "%.3f" % float(v)
            #
            v = matchObj.getTemperature()
            vS = str(v)
            # remove string artifacts from temperature string ...
            if v is not None and len(vS) > 0:
                tV = vS.upper()
                try:
                    if tV.endswith("DEG.C"):
                        tV = tV.replace("AT", "")
                        tV = tV.replace("DEG.C", "")
                        tV = float(tV.strip())
                        tV = tV + 273.15
                    else:
                        tV = tV.replace("AT", "")
                        tV = tV.replace("K", "")
                        tV = float(tV.strip())
                    featureD["experiment_temperature"] = tV
                except Exception as e:
                    logger.exception("Temperature conversion fails for %s (%r): %s", modelId, vS, str(e))
            #
            v = matchObj.getCitationDOI()
            vS = str(v)
            if v is not None and len(vS) > 0:
                featureD["publication_doi"] = v
            #
            v = matchObj.getCsdVersion()
            vS = str(v)
            if v is not None and len(vS) > 0:
                featureD["csd_version"] = v
            #
            if matchObj.getRadiationSource() in ["Neutron"]:
                featureD["neutron_radiation_experiment"] = True
            if matchObj.getHasDisorder() in ["Y"]:
                featureD["has_disorder"] = True
            #
            if len(unMappedTypeD) == 1 and "H" in unMappedTypeD:
                logger.info("model %r heavy_atoms_only", modelId)
                featureD["heavy_atoms_only"] = True
            else:
                featureD["all_atoms_have_sites"] = True
            # --------  ---------
            catName = "pdbx_chem_comp_model_feature"
            if not dataContainer.exists(catName):
                dataContainer.append(DataCategory(catName, attributeNameList=["model_id", "feature_name", "feature_value"]))
            wObj = dataContainer.getObj(catName)
            #
            fKeyList = ["experiment_temperature", "publication_doi", "r_factor", "csd_version"]
            ii = 0
            for fKey in fKeyList:
                if fKey in featureD:
                    wObj.setValue(modelId, "model_id", ii)
                    wObj.setValue(fKey, "feature_name", ii)
                    wObj.setValue(str(featureD[fKey]), "feature_value", ii)
                    ii += 1

            #
            boolKeyList = ["has_disorder", "neutron_radiation_experiment", "heavy_atoms_only", "all_atoms_have_sites"]
            for fKey in boolKeyList:
                if fKey in featureD:
                    if featureD[fKey]:
                        wObj.setValue(modelId, "model_id", ii)
                        wObj.setValue(fKey, "feature_name", ii)
                        wObj.setValue("Y", "feature_value", ii)
                        ii += 1
            #

            if variantType:
                wObj.setValue(modelId, "model_id", ii)
                wObj.setValue(variantType + "_match", "feature_name", ii)
                wObj.setValue("Y", "feature_value", ii)
                ii += 1

            # --------  ---------
            catName = "pdbx_chem_comp_model_audit"
            if not dataContainer.exists(catName):
                dataContainer.append(DataCategory(catName, attributeNameList=["model_id", "action_type", "date"]))
            wObj = dataContainer.getObj(catName)
            #
            ii = 0
            wObj.setValue(modelId, "model_id", ii)
            wObj.setValue("Initial release", "action_type", ii)
            wObj.setValue(self.__getToday(), "date", ii)
            # wObj.setValue('RCSB', 'processing_site',  ii)
            # wObj.setValue('JDW', 'annotator', ii)
            # wObj.setValue('?', 'details', ii)
            #
            ok = mU.doExport(modelPath, [dataContainer], fmt="mmcif")
            return ok, variantType
        except Exception as e:
            logger.exception("Failing for %r %r with %s", targetId, targetPath, str(e))
        return False, ""
Example #20
class PfamProvider(StashableBase):
    """Manage an index of Pfam identifier to description mappings."""
    def __init__(self, **kwargs):
        urlTargetPfam = kwargs.get(
            "urlTargetPfam",
            "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz"
        )
        urlTargetPfamFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/Pfam/Pfam-A.clans.tsv.gz"
        self.__version = "34.0"
        dirName = "pfam"
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, dirName)
        super(PfamProvider, self).__init__(cachePath, [dirName])
        useCache = kwargs.get("useCache", True)
        #
        self.__mU = MarshalUtil(workPath=dirPath)
        self.__pfamD = self.__rebuildCache(urlTargetPfam, urlTargetPfamFB,
                                           dirPath, useCache)

        urlTargetMapPfam = kwargs.get(
            "urlTargetMapPfam",
            "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/database_files/pdb_pfamA_reg.txt.gz"
        )
        urlTargetMapPfamFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/Pfam/pdb_pfamA_reg.txt.gz"
        self.__pfamMapD = self.__rebuildMappingCache(urlTargetMapPfam,
                                                     urlTargetMapPfamFB,
                                                     dirPath, useCache)

    def getVersion(self):
        return self.__version

    def getDescription(self, pfamId):
        """Return the description for the input Pfam identifier

        Args:
            pfamId (str): Pfam identifier

        Returns:
            str: text description of the Pfam domain
        """
        descr = None
        try:
            descr = self.__pfamD[pfamId]
        except Exception:
            pass
        return descr

    def getMapping(self, pdbId):
        """Return the list of Pfam domain assignments for the input PDB identifer along with
        residue level mapping information

        Args:
            pdbId (str): PDB identifier

        Returns:
            list: [{'pfamId': , 'authAsymId":  , 'authSeqBeg': , 'authSeqEnd': 'insertBeg': , 'insertEnd': }, {}, ]
        """
        mapL = []
        try:
            mapL = self.__pfamMapD[pdbId.upper()]
        except Exception:
            pass
        return mapL

    def testCache(self):
        # Check length ...
        logger.info("Length PfamD %d", len(self.__pfamD))
        return (len(self.__pfamD) > 19000) and (len(self.__pfamMapD) > 150000)

    #
    def __rebuildCache(self, urlTargetPfam, urlTargetPfamFB, dirPath,
                       useCache):
        pfamD = {}
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        pfamDataPath = os.path.join(dirPath, "pfam-data.%s" % ext)
        #
        logger.debug("Using cache data path %s", dirPath)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(pfamDataPath):
            pfamD = self.__mU.doImport(pfamDataPath, fmt=fmt)
            logger.debug("Pfam data length %d", len(pfamD))
        else:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetPfam,
                        dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetPfam))
            ok = fU.get(urlTargetPfam, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetPfamFB))
                ok = fU.get(urlTargetPfamFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            pfamD = self.__getPfamIndex(fp)
            ok = self.__mU.doExport(pfamDataPath, pfamD, fmt=fmt)
            logger.info("Caching %d in %s status %r", len(pfamD), pfamDataPath,
                        ok)
            # ------
        #
        return pfamD

    def __getPfamIndex(self, filePath):
        """Parse Pfam annotation classifications into an identifier-to-description index."""
        pfamD = {}
        encodingD = {"encoding": "ascii"} if sys.version_info[0] < 3 else {}
        rowL = self.__mU.doImport(filePath,
                                  fmt="tdd",
                                  rowFormat="list",
                                  **encodingD)
        for row in rowL:
            try:
                pfamId = row[0].strip().upper()
                idCode = row[3].strip()
                descr = row[4].strip()
                pfamD[pfamId] = descr + " (" + idCode + ")"
            except Exception:
                pass
        #
        return pfamD

    def __rebuildMappingCache(self, urlTargetPfam, urlTargetPfamFB, dirPath,
                              useCache):
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        pfamDataPath = os.path.join(dirPath, "pfam-mapping-data.%s" % ext)
        #
        logger.debug("Using cache data path %s", dirPath)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(pfamDataPath):
            pfamD = self.__mU.doImport(pfamDataPath, fmt=fmt)
            logger.debug("Pfam mapping data length %d", len(pfamD))
        else:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetPfam,
                        dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetPfam))
            ok = fU.get(urlTargetPfam, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetPfamFB))
                ok = fU.get(urlTargetPfamFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            pfamD = self.__getPfamMapping(fp)
            ok = self.__mU.doExport(pfamDataPath, pfamD, fmt=fmt)
            logger.info("Caching %d in %s status %r", len(pfamD), pfamDataPath,
                        ok)
            # ------
        #
        return pfamD

    def __getPfamMapping(self, filePath):
        """Parse mapping data"""
        pFamMapD = {}
        encodingD = {"encoding": "ascii"} if sys.version_info[0] < 3 else {}
        rowL = self.__mU.doImport(filePath,
                                  fmt="tdd",
                                  rowFormat="list",
                                  **encodingD)
        for row in rowL:
            try:
                pdbId = row[2].strip().upper()
                pfamId = row[3].strip().upper()
                authAsymId = row[5].strip()
                authSeqBeg = int(row[6].strip())
                insertBeg = row[7].strip() if row[7].strip() != "NULL" else None
                authSeqEnd = int(row[8].strip())
                insertEnd = row[9].strip() if row[9].strip() != "NULL" else None
                pFamMapD.setdefault(pdbId, []).append({
                    "pfamId": pfamId,
                    "authAsymId": authAsymId,
                    "authSeqBeg": authSeqBeg,
                    "authSeqEnd": authSeqEnd,
                    "insertBeg": insertBeg,
                    "insertEnd": insertEnd,
                })
            except Exception as e:
                logger.exception("Failing with %r %s", row, str(e))
        #
        logger.info("Pfam mapping data for (%d) entries", len(pFamMapD))
        return pFamMapD
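
    # Example (hypothetical): a parsed mapping record for one PDB entry -
    #
    #   pFamMapD["4HHB"] == [{"pfamId": "PF00042", "authAsymId": "A",
    #                         "authSeqBeg": 6, "authSeqEnd": 112,
    #                         "insertBeg": None, "insertEnd": None}, ...]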


class CARDTargetProvider:
    """Accessors for CARD target assignments."""
    def __init__(self, **kwargs):
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__dirPath = os.path.join(self.__cachePath, "CARD-targets")
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__oD, self.__version = self.__reload(self.__dirPath, **kwargs)
        #

    def testCache(self, minCount=3000):
        if self.__oD and len(self.__oD) > minCount:
            return True
        else:
            return False

    def hasFeature(self, modelId):
        return modelId in self.__oD

    def getFeature(self, modelId, featureKey):
        try:
            return self.__oD[modelId][featureKey]
        except Exception:
            return None

    def getAssignmentVersion(self):
        return self.__version

    def getTargetDataPath(self):
        return os.path.join(self.__dirPath, "card-target-data.json")

    def getCofactorDataPath(self):
        return None

    def __reload(self, dirPath, **kwargs):
        oD = None
        version = None
        startTime = time.time()
        useCache = kwargs.get("useCache", True)
        #
        # CARDDumpUrl = kwargs.get("CARDDumpUrl", "https://card.mcmaster.ca/latest/data/broadstreet-v3.1.0.tar.bz2")
        cardDumpUrl = kwargs.get("CARDDumpUrl",
                                 "https://card.mcmaster.ca/latest/data")
        ok = False
        fU = FileUtil()
        cardDumpFileName = "card-data.tar.bz2"
        cardDumpPath = os.path.join(dirPath, cardDumpFileName)
        cardDumpDirPath = os.path.join(dirPath, "dump")
        #
        fU.mkdir(dirPath)
        cardDataPath = os.path.join(dirPath, "card-select-data.json")
        #
        logger.info("useCache %r CARDDumpPath %r", useCache, cardDumpPath)
        if useCache and self.__mU.exists(cardDataPath):
            qD = self.__mU.doImport(cardDataPath, fmt="json")
            version = qD["version"]
            oD = qD["data"]
        else:
            logger.info("Fetching url %s path %s", cardDumpUrl, cardDumpPath)
            ok = fU.get(cardDumpUrl, cardDumpPath)
            fU.mkdir(cardDumpDirPath)
            fU.uncompress(cardDumpPath, outputDir=cardDumpDirPath)
            fU.unbundleTarfile(os.path.join(cardDumpDirPath,
                                            cardDumpFileName[:-4]),
                               dirPath=cardDumpDirPath)
            logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok,
                        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                        time.time() - startTime)
            oD, version = self.__parseCardData(
                os.path.join(cardDumpDirPath, "card.json"))
            tS = datetime.datetime.now().isoformat()
            qD = {"version": version, "created": tS, "data": oD}
            oD = qD["data"]
            ok = self.__mU.doExport(cardDataPath, qD, fmt="json", indent=3)
            logger.info("Export CARD data (%d) status %r", len(oD), ok)
        # ---
        return oD, version

    def exportCardFasta(self, fastaPath, taxonPath):
        ok = self.__exportCardFasta(fastaPath, taxonPath, self.__oD)
        return ok

    def __exportCardFasta(self, fastaPath, taxonPath, cardD):
        """Export a CARD sequence target fasta file

        Args:
            fastaPath (str): fasta output file path
            taxonPath (str): taxon mapping (identifier/taxId) output file path
            cardD (dict): card selected data dictionary

        Returns:
            (bool): True for success or False otherwise
        """
        sD = {}
        taxonL = []
        ok = False
        try:
            for modelId, tD in cardD.items():
                modelBitScore = None
                # aroAcc = tD["accession"]
                aroId = tD["id"]
                if "sequences" not in tD:
                    continue
                modelBitScore = tD["modelBitScore"] if "modelBitScore" in tD else None
                for qD in tD["sequences"]:
                    sId = qD["seqId"]
                    seq = qD["sequence"]
                    taxId = qD["taxId"]
                    cD = {
                        "sequence": seq,
                        "modelId": modelId,
                        "aroId": aroId,
                        "seqId": sId,
                        "taxId": taxId
                    }
                    cD["bitScore"] = modelBitScore if modelBitScore else "-1.0"
                    #
                    cId = ""
                    cL = []
                    for k, v in cD.items():
                        if k in ["sequence"]:
                            continue
                        cL.append(str(v))
                        cL.append(str(k))
                    cId = "|".join(cL)
                    sD[cId] = cD
                    taxonL.append("%s\t%s" % (cId, taxId))

            ok = self.__mU.doExport(fastaPath,
                                    sD,
                                    fmt="fasta",
                                    makeComment=True)
            logger.info("Export CARD fasta (%d) status %r", len(sD), ok)
            ok = self.__mU.doExport(taxonPath, taxonL, fmt="list")
            logger.info("Export Taxon (%d) status %r", len(taxonL), ok)
        except Exception as e:
            logger.exception("Failing for model %r tD %r with %s", modelId, tD,
                             str(e))
        return ok
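
    # Example (hypothetical): the fasta comment identifier (cId) is a "|"-joined
    # run of value|key pairs in dictionary insertion order, resembling -
    #
    #   "37|modelId|36005|aroId|1767|seqId|1280|taxId|300|bitScore"
    #
    # A matching __decodeComment() in downstream providers inverts this encoding.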

    def __parseCardData(self, filePath):
        """Parse CARD target data

        Args:
            filePath (str): card json data file

        Returns:
            (dict, string): card selected data dictionary, card version string
        """
        try:
            oD = {}
            version = None
            cD = self.__mU.doImport(filePath, fmt="json")
            logger.info("CARD model count (%d)", len(cD))
            for modelId, mD in cD.items():
                if modelId.startswith("_"):
                    if modelId == "_version":
                        version = mD
                    continue
                oD[modelId] = {}
                for kTup in [
                    ("ARO_accession", "accession"),
                    ("ARO_id", "id"),
                    ("ARO_name", "name"),
                    ("ARO_description", "descr"),
                    ("model_name", "modelName"),
                    ("model_type", "modelType"),
                ]:
                    if kTup[0] in mD:
                        oD[modelId][kTup[1]] = mD[kTup[0]]

                try:
                    if "model_sequences" in mD:
                        for seqId, tD in mD["model_sequences"][
                                "sequence"].items():
                            oD[modelId].setdefault("sequences", []).append({
                                "seqId":
                                seqId,
                                "sequence":
                                tD["protein_sequence"]["sequence"],
                                "taxId":
                                tD["NCBI_taxonomy"]["NCBI_taxonomy_id"]
                            })
                except Exception as e:
                    logger.exception("Failing with %s", str(e))

                try:
                    if "model_param" in mD and "blastp_bit_score" in mD[
                            "model_param"] and "param_value" in mD[
                                "model_param"]["blastp_bit_score"]:
                        oD[modelId]["modelBitScore"] = mD["model_param"][
                            "blastp_bit_score"]["param_value"]

                except Exception as e:
                    logger.exception("Failing with %s", str(e))

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return oD, version
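
    # Example (hypothetical): minimal shape of the card.json input parsed above -
    #
    #   {"_version": "3.1.0",
    #    "37": {"ARO_accession": "3000000", "ARO_id": "36005", "ARO_name": "...",
    #           "model_sequences": {"sequence": {"1767": {
    #               "protein_sequence": {"sequence": "MKK..."},
    #               "NCBI_taxonomy": {"NCBI_taxonomy_id": "1280"}}}},
    #           "model_param": {"blastp_bit_score": {"param_value": "300"}}}}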


def main():
    parser = argparse.ArgumentParser()
    #
    parser.add_argument("--mol_list_path",
                        default=None,
                        help="Molecule file list path")
    parser.add_argument("--result_path",
                        default=None,
                        help="Molecule file list path")
    parser.add_argument("--search_type",
                        default=None,
                        help="Search type (similarity|substructure)")
    parser.add_argument("--start_record", default=None, help="Starting record")
    parser.add_argument("--end_record", default=None, help="End record")
    parser.add_argument("--csdhome",
                        default=None,
                        help="Path to the CSD release (path to CSD_202x)")
    parser.add_argument("--python_lib_path",
                        default=None,
                        help="Path to Python library")
    parser.add_argument("--python_version",
                        default=None,
                        help="Python library version (default: 3.7)")
    parser.add_argument(
        "--hit_list_path",
        default=None,
        help="Path to list of molecule identifers with search results")
    #
    args = parser.parse_args()
    #
    try:
        pyLib = args.python_lib_path if args.python_lib_path else os.path.join(
            os.environ["PYENV_ROOT"], "versions", "3.7.9", "lib")
        pyVer = args.python_version if args.python_version else "3.7"

        csdHome = args.csdhome
        molFilePath = args.mol_list_path
        resultPath = args.result_path
        searchType = args.search_type
        startRecord = args.start_record
        endRecord = args.end_record
        hitListPath = args.hit_list_path
    except Exception as e:
        logger.exception("Argument processing problem %s", str(e))
        parser.print_help(sys.stderr)
        sys.exit(1)
    #
    try:
        os.environ["CSDHOME"] = csdHome
        os.environ[
            "LD_LIBRARY_PATH"] = "%s:%s/python%s/site-packages/ccdc/_lib:$LD_LIBRARY_PATH" % (
                pyLib, pyLib, pyVer)
        os.environ[
            "DYLD_LIBRARY_PATH"] = "%s/python%s/site-packages/ccdc/_lib" % (
                pyLib, pyVer)
        os.environ[
            "DYLD_FRAMEWORK_PATH"] = "%s/python%s/site-packages/ccdc/_lib" % (
                pyLib, pyVer)

        logger.info("Using CSDHOME %s", os.environ["CSDHOME"])
        logger.info("Using DYLD_LIBRARY_PATH %s",
                    os.environ["DYLD_LIBRARY_PATH"])
        logger.info("Using DYLD_FRAMEWORK_PATH %s",
                    os.environ["DYLD_FRAMEWORK_PATH"])

        from rcsb.utils.ccdc.CcdcSearch import CcdcSearch  # pylint: disable=import-outside-toplevel

        ccdcS = CcdcSearch(verbose=True)
        pL = ccdcS.getList(molFilePath,
                           startRecord=startRecord,
                           endRecord=endRecord)
        logger.info("Search file %s record length %r", molFilePath,
                    len(pL) if pL else [])
        #
        hitL = []
        for ii, queryTargetPath in enumerate(pL, 1):
            _, fn = os.path.split(queryTargetPath)
            queryTargetId, _ = os.path.splitext(fn)
            #
            logger.info("(%d/%d) Start search for %r %r", ii, len(pL),
                        queryTargetId, queryTargetPath)
            numHits = ccdcS.search(queryTargetId,
                                   queryTargetPath,
                                   resultPath,
                                   searchType=searchType)
            if numHits:
                hitL.append(queryTargetId)
        logger.info("%d searches completed - matched %d", len(pL), len(hitL))
        if hitListPath:
            mU = MarshalUtil()
            ok = mU.doExport(hitListPath, hitL, fmt="list")
            logger.info("Wrote hit list (%r) to %s", ok, hitListPath)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
class DrugBankTargetCofactorProvider(StashableBase):
    """Accessors for DrugBank target cofactors."""
    def __init__(self, **kwargs):
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__useCache = kwargs.get("useCache", True)
        self.__fmt = kwargs.get("fmt", "pickle")
        self.__dirName = "DrugBank-cofactors"
        super(DrugBankTargetCofactorProvider,
              self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__fD = self.__reload(self.__dirPath, self.__useCache, self.__fmt)
        #

    def testCache(self, minCount=590):
        logger.info("DrugBank feature count %d", len(self.__fD["cofactors"]) if "cofactors" in self.__fD else 0)
        if self.__fD and "cofactors" in self.__fD and len(self.__fD["cofactors"]) > minCount:
            return True
        else:
            return False

    def hasTarget(self, rcsbEntityId):
        return rcsbEntityId.upper() in self.__fD["cofactors"]

    def getTargets(self, rcsbEntityId):
        try:
            return self.__fD["cofactors"][rcsbEntityId.upper()]
        except Exception:
            return []

    def __getCofactorDataPath(self, fmt="json"):
        fExt = "json" if fmt == "json" else "pic"
        return os.path.join(self.__dirPath, "drugbank-cofactor-data.%s" % fExt)

    def reload(self):
        self.__fD = self.__reload(self.__dirPath,
                                  useCache=True,
                                  fmt=self.__fmt)
        return True

    def __reload(self, dirPath, useCache, fmt):
        startTime = time.time()
        fD = {}

        ok = False
        cofactorPath = self.__getCofactorDataPath(fmt=fmt)
        #
        logger.info("useCache %r featurePath %r", useCache, cofactorPath)
        if useCache and self.__mU.exists(cofactorPath):
            fD = self.__mU.doImport(cofactorPath, fmt=fmt)
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        logger.info("Completed reload (%r) at %s (%.4f seconds)", ok,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    time.time() - startTime)
        return fD

    def buildCofactorList(self,
                          sequenceMatchFilePath,
                          crmpObj=None,
                          lnmpObj=None):
        """Build target cofactor list for the matching entities in the input sequence match file.

        Args:
            sequenceMatchFilePath (str): sequence match output file path
            crmpObj (obj, optional): instance of ChemRefMappingProviderObj(). Defaults to None
            lnmpObj (obj, optional): instance of LigandNeighborMappingProviderObj(). Defaults to None.

        Returns:
            bool: True for success or False otherwise
        """
        rDL = []
        dbP = DrugBankTargetProvider(cachePath=self.__cachePath, useCache=True)
        mD = self.__mU.doImport(sequenceMatchFilePath, fmt="json")
        #
        provenanceSource = "DrugBank"
        refScheme = "PDB entity"
        assignVersion = str(dbP.getAssignmentVersion())
        for queryId, matchDL in mD.items():
            qCmtD = self.__decodeComment(queryId)
            unpId = qCmtD["uniprotId"]
            queryTaxId = qCmtD["taxId"] if "taxId" in qCmtD else None
            if not dbP.hasCofactor(unpId) or queryTaxId == "-1":
                logger.info("Skipping target %r", unpId)
                continue
            #
            # --
            chemCompNeighborsD = {}
            if lnmpObj:
                for matchD in matchDL:
                    tCmtD = self.__decodeComment(matchD["target"])
                    entryId = tCmtD["entityId"].split("_")[0]
                    entityId = tCmtD["entityId"].split("_")[1]
                    rcsbEntityId = entryId + "_" + entityId
                    chemCompIdList = lnmpObj.getLigandNeighbors(rcsbEntityId)
                    chemCompNeighborsD.update({k: True for k in chemCompIdList})
            # --
            #
            for matchD in matchDL:
                tCmtD = self.__decodeComment(matchD["target"])
                entryId = tCmtD["entityId"].split("_")[0]
                entityId = tCmtD["entityId"].split("_")[1]
                # --
                dbDL = dbP.getCofactors(unpId)
                # --
                cfDL = []
                for dbD in dbDL:
                    cfD = {}
                    cfD["cofactor_id"] = dbD["drugbank_id"]
                    cfD["molecule_name"] = dbD["name"]
                    cfD["target_name"] = dbD["target_name"]
                    # cfD["description"] = dbD["description"]
                    cfD["moa"] = dbD["moa"]
                    # cfD["pharmacology"] = dbD["pharmacology"]
                    cfD["inchi_key"] = dbD["inchi_key"]
                    cfD["smiles"] = dbD["smiles"]
                    cfD["pubmed_ids"] = dbD["pubmed_ids"]
                    cfD = self.__addLocalIds(cfD, crmpObj)
                    #
                    if "chem_comp_id" in cfD and cfD[
                            "chem_comp_id"] in chemCompNeighborsD:
                        cfD["neighbor_in_pdb"] = "Y"
                    else:
                        cfD["neighbor_in_pdb"] = "N"
                    #
                    cfDL.append(cfD)
                # ---
                queryName = cfDL[0]["target_name"] if cfDL and "target_name" in cfDL[0] else None
                # ---
                # aligned_target.entity_beg_seq_id (current target is PDB entity in json)
                # aligned_target.target_beg_seq_id (current query is target seq in json)
                # aligned_target.length
                fpL = []
                if "alignedRegions" in matchD:
                    fpL = [{
                        "entity_beg_seq_id": arD["targetBegin"],
                        "target_beg_seq_id": arD["queryBegin"],
                        "length": arD["targetEnd"] - arD["targetBegin"],
                    } for arD in matchD["alignedRegions"]]
                else:
                    fpL = [{
                        "entity_beg_seq_id": matchD["targetBegin"],
                        "target_beg_seq_id": matchD["queryBegin"],
                        "length": matchD["alignLen"],
                    }]
                # ---
                rD = {
                    "entry_id": entryId,
                    "entity_id": entityId,
                    "query_uniprot_id": unpId,
                    "query_id": unpId,
                    "query_id_type": "DrugBank",
                    "query_name": queryName,
                    "provenance_source": provenanceSource,
                    "reference_scheme": refScheme,
                    "assignment_version": assignVersion,
                    "query_taxonomy_id": int(queryTaxId) if queryTaxId else None,
                    "target_taxonomy_id": int(matchD["targetTaxId"]) if "targetTaxId" in matchD else None,
                    "aligned_target": fpL,
                    "taxonomy_match_status": matchD["taxonomyMatchStatus"] if "taxonomyMatchStatus" in matchD else None,
                    "lca_taxonomy_id": matchD["lcaTaxId"] if "lcaTaxId" in matchD else None,
                    "lca_taxonomy_name": matchD["lcaTaxName"] if "lcaTaxName" in matchD else None,
                    "lca_taxonomy_rank": matchD["lcaRank"] if "lcaRank" in matchD else None,
                    "cofactors": cfDL,
                }
                rDL.append(rD)
        #
        qD = {}
        for rD in rDL:
            eId = rD["entry_id"] + "_" + rD["entity_id"]
            qD.setdefault(eId, []).append(rD)
        fp = self.__getCofactorDataPath(fmt=self.__fmt)
        tS = datetime.datetime.now().isoformat()
        # vS = datetime.datetime.now().strftime("%Y-%m-%d")
        vS = assignVersion
        ok = self.__mU.doExport(fp, {"version": vS, "created": tS, "cofactors": qD}, fmt=self.__fmt, indent=3)
        return ok

    def __addLocalIds(self, cfD, crmpOb=None):
        #
        if crmpOb:
            localIdL = crmpOb.getLocalIds("DRUGBANK", cfD["cofactor_id"])
            if localIdL:
                localId = localIdL[0]
                if localId.startswith("PRD_"):
                    cfD["prd_id"] = localId
                else:
                    cfD["chem_comp_id"] = localId
        return cfD

    def __decodeComment(self, comment, separator="|"):
        dD = {}
        try:
            ti = iter(comment.split(separator))
            dD = {tup[1]: tup[0] for tup in zip(ti, ti)}
        except Exception:
            pass
        return dD
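
    # Example (hypothetical): comments are encoded as value|key pairs, so
    #
    #   __decodeComment("P12345|uniprotId|9606|taxId")
    #
    # returns {"uniprotId": "P12345", "taxId": "9606"}.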
class EntityPolymerExtractorTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(EntityPolymerExtractorTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config",
                                  "dbload-setup-example.yml")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__workPath = os.path.join(HERE, "test-output")
        self.__taxonomyDataPath = os.path.join(
            self.__cachePath,
            self.__cfgOb.get("NCBI_TAXONOMY_CACHE_DIR",
                             sectionName=configName))
        #
        self.__cacheKwargs = {"fmt": "json", "indent": 3}
        self.__exdbCacheDirPath = os.path.join(
            self.__cachePath,
            self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
        #
        self.__mU = MarshalUtil()
        self.__entryLimitTest = 18
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def testExtractEntityPolymers(self):
        """Test case - extract entity polymer info"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb,
                                         exdbDirPath=self.__exdbCacheDirPath,
                                         useCache=False,
                                         cacheKwargs=self.__cacheKwargs,
                                         entryLimit=self.__entryLimitTest)
            eCount = epe.getEntryCount()
            self.assertGreaterEqual(eCount, self.__entryLimitTest)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testAccessEntityPolymerFeatures(self):
        """Test case - access cached entity polymer info from test cache"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb,
                                         exdbDirPath=self.__exdbCacheDirPath,
                                         useCache=False,
                                         cacheKwargs=self.__cacheKwargs)
            eCount = epe.getEntryCount()
            logger.info("Entry count %d", eCount)
            self.assertGreaterEqual(eCount, self.__entryLimitTest)
            #
            unpL = epe.getRefSeqAccessions("UNP")
            logger.info("Ref seq count %d", len(unpL))
            self.assertGreaterEqual(len(unpL), 1)
            #
            for entryId in ["3RER"]:
                for entityId in ["1"]:
                    uL = epe.getEntityRefSeqAccessions("UNP", entryId,
                                                       entityId)
                    logger.info("UNP for %s %s %r", entryId, entityId, uL)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testTaxonomyReadCache(self):
        """Test case - access cached entity polymer info from test cache"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb,
                                         exdbDirPath=self.__exdbCacheDirPath,
                                         useCache=False,
                                         cacheKwargs=self.__cacheKwargs)
            logger.info("Cache entry count %d", epe.getEntryCount())
            #
            obsL = []
            tD = epe.getOrigTaxons()
            logger.info("Taxons %d", len(tD))

            tU = TaxonomyProvider(taxDirPath=self.__taxonomyDataPath,
                                  useCache=True)
            #
            for entryId, taxIdL in tD.items():
                for entityId, iTaxId in taxIdL:
                    # logger.info("entryId %r entityId %r taxId %r" % (entryId, entityId, taxId))
                    mTaxId = tU.getMergedTaxId(iTaxId)
                    if iTaxId != mTaxId:
                        obsL.append({
                            "entryId": entryId,
                            "entityId": entityId,
                            "taxId": iTaxId,
                            "replaceTaxId": mTaxId
                        })
            logger.info("Obsolete list length %d", len(obsL))
            self.__mU.doExport(os.path.join(self.__workPath,
                                            "obsolete-taxons.json"),
                               obsL,
                               fmt="json",
                               indent=3)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testAccessEntityPolymerReadCache(self):
        """Test case - access cached entity polymer info from test cache"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb,
                                         exdbDirPath=self.__exdbCacheDirPath,
                                         useCache=False,
                                         cacheKwargs=self.__cacheKwargs)
            logger.info("Cache entry count %d", epe.getEntryCount())
            cD = epe.countRefSeqAccessions("UNP")
            self.assertGreaterEqual(len(cD), 2)
            logger.info("UNP reference sequences per entity %r",
                        dict(sorted(cD.items())))
            logger.info("Reference sequences per entity %r",
                        dict(sorted(epe.countRefSeqAccessionAny().items())))
            logger.info("Reference sequences per ref db %r",
                        dict(sorted(epe.countRefSeqAccessionDbType().items())))
            #
            ok = epe.checkRefSeqAlignRange("UNP")
            self.assertTrue(ok)
            unpL = epe.getRefSeqAccessions("UNP")
            logger.info("Unique UNP reference sequences %d", len(unpL))
            self.assertTrue(ok)
            tD = epe.getUniqueTaxons()
            logger.info("Unique taxons %d", len(tD))
            tD = epe.countRefSeqAccessionByTaxon("UNP")
            logger.info("Unique taxons %d", len(tD))
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()


class ScanRepoUtil(object):
    """Tools for for scanning repositories and collecting coverage and type data information."""
    def __init__(self,
                 cfgOb,
                 attributeDataTypeD=None,
                 numProc=4,
                 chunkSize=15,
                 fileLimit=None,
                 maxStepLength=2000,
                 workPath=None):
        """
        Args:
            cfgOb (object): Configuration object (rcsb.utils.config.ConfigUtil)

            attributeDataTypeD
            dictPath (str): Path to supporting data dictionary

            numProc (int, optional): Number of parallel worker processes used.
            chunkSize (int, optional): Size of files processed in a single multi-proc process
            fileLimit (int, optional): maximum file scanned or None for no limit
            mockTopPath (str, optional): Path to directory containing mock repositories or None
            maxStepLength (int, optional): maximum number of multi-proc runs to perform
        """
        #
        self.__attributeDataTypeD = attributeDataTypeD if attributeDataTypeD else {}
        # Limit the load length of each file type for testing  -  Set to None to remove -
        self.__fileLimit = fileLimit
        self.__maxStepLength = maxStepLength
        #
        # Controls for multiprocessing execution -
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        #
        self.__cfgOb = cfgOb
        #
        self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s"

        self.__workPath = workPath
        self.__mU = MarshalUtil(workPath=self.__workPath)
        self.__rpP = RepositoryProvider(self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__workPath)

    def scanContentType(self,
                        contentType,
                        mergeContentTypes=None,
                        scanType="full",
                        inputPathList=None,
                        scanDataFilePath=None,
                        failedFilePath=None,
                        saveInputFileListPath=None):
        """Driver method for repository scan operation

        Args:
            contentType (str): one of 'bird', 'bird_family', 'bird_chem_comp', 'chem_comp', 'pdbx'
            scanType (str, optional): 'full' [or 'incr' to be supported]
            inputPathList (list, optional):  list of input file paths to scan
            scanDataFilePath (str, optional): file path for serialized scan data (Pickle format)
            failedFilePath (str, optional): file path for list of files that fail scanning operation
            saveInputFileListPath (str, optional): Path to store the file path list that is scanned

        Returns:
            bool: True for success or False otherwise

        """
        try:
            startTime = self.__begin(message="scanning operation")
            #
            locatorObjList = self.__rpP.getLocatorObjList(
                contentType=contentType,
                inputPathList=inputPathList,
                mergeContentTypes=mergeContentTypes)
            #
            if saveInputFileListPath:
                self.__mU.doExport(saveInputFileListPath,
                                   self.__rpP.getLocatorPaths(locatorObjList),
                                   fmt="list")
                logger.debug("Saving %d paths in %s", len(locatorObjList),
                             saveInputFileListPath)
            #
            optD = {}
            optD["contentType"] = contentType
            optD["logSize"] = True
            optD["scanType"] = scanType
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            #
            numProc = self.__numProc
            chunkSize = self.__chunkSize if locatorObjList and self.__chunkSize < len(locatorObjList) else 0
            #
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            numPaths = len(locatorObjList)
            logger.debug("Processing %d total paths", numPaths)
            numProc = min(numProc, numPaths)
            maxStepLength = self.__maxStepLength
            if numPaths > maxStepLength:
                # Use ceiling division so every sublist stays within maxStepLength
                numLists = (numPaths + maxStepLength - 1) // maxStepLength
                subLists = [locatorObjList[i::numLists] for i in range(numLists)]
            else:
                subLists = [locatorObjList]
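            # Illustrative arithmetic: with numPaths=5000 and maxStepLength=2000,
            # numLists=3 and the stride slices yield sublists of ~1667 paths each,
            # keeping every outer run under the cap.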
            #
            if subLists:
                logger.debug(
                    "Starting with numProc %d outer subtask count %d subtask length ~ %d",
                    numProc, len(subLists), len(subLists[0]))
            #
            numResults = 1
            failList = []
            retLists = [[] for ii in range(numResults)]
            diagList = []
            for ii, subList in enumerate(subLists):
                logger.info("Running outer subtask %d of %d length %d", ii + 1,
                            len(subLists), len(subList))
                #
                mpu = MultiProcUtil(verbose=True)
                mpu.setOptions(optionsD=optD)
                mpu.set(workerObj=self, workerMethod="scanWorker")
                ok, failListT, retListsT, diagListT = mpu.runMulti(
                    dataList=subList,
                    numProc=numProc,
                    numResults=numResults,
                    chunkSize=chunkSize)
                failList.extend(failListT)
                # retLists is a list of lists -
                logger.debug("status %r fail len %r ret len %r", ok,
                             len(failListT), len(retListsT))
                for jj in range(numResults):
                    retLists[jj].extend(retListsT[jj])
                diagList.extend(diagListT)
            logger.debug("Scan failed path list %r", failList)
            logger.debug(
                "Scan path list success length %d load list failed length %d",
                len(locatorObjList), len(failList))
            logger.debug("Returned metadata length %r", len(retLists[0]))
            #
            if failedFilePath and failList:
                wOk = self.__mU.doExport(failedFilePath,
                                         self.__rpP.getLocatorPaths(failList),
                                         fmt="list")
                logger.debug("Writing scan failure path list to %s status %r",
                             failedFilePath, wOk)
            #
            if scanType == "incr":
                scanDataD = self.__mU.doImport(scanDataFilePath,
                                               fmt="pickle",
                                               default=None)
                logger.debug("Imported scan data with keys %r",
                             list(scanDataD.keys()))
            else:
                scanDataD = {}
            #
            if scanDataFilePath and retLists[0]:
                for ssTup in retLists[0]:
                    cId = ssTup.containerId
                    if scanType == "full" and cId in scanDataD:
                        logger.error("Duplicate container id %s in %r and %r",
                                     cId, ssTup.fromPath,
                                     scanDataD[cId].fromPath)
                    #
                    scanDataD[cId] = ssTup

                ok = self.__mU.doExport(scanDataFilePath,
                                        scanDataD,
                                        fmt="pickle")
                tscanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
                ok = tscanDataD == scanDataD

            self.__end(startTime, "scanning operation with status " + str(ok))

            #
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return False

    def evalScan(self,
                 scanDataFilePath,
                 evalJsonFilePath,
                 evalType="data_type"):

        scanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
        if evalType in ["data_type"]:
            rD = self.__evalScanDataType(scanDataD)
        elif evalType in ["data_coverage"]:
            rD, _ = self.__evalScanDataCoverage(scanDataD)
        else:
            logger.debug("Unknown evalType %r", evalType)
        ok = self.__mU.doExport(evalJsonFilePath, rD, fmt="json")

        return ok

    def evalScanItem(self, scanDataFilePath, evalFilePath):
        scanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
        _, cL = self.__evalScanDataCoverage(scanDataD)
        ok = self.__mU.doExport(evalFilePath, cL, fmt="list")
        return ok

    def __evalScanDataType(self, scanDataD):
        """
        ScanValue = collections.namedtuple('ScanValue', 'containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec')
        ScanSummary = collections.namedtuple('ScanSummary', 'containerId, fromPath, scanDate, scanCategoryDict')

        """
        # for populated sD[category] -> d[atName]->{minWidth: , maxWidth:, minPrec:, maxPrec: , count}
        sD = {}
        for cId in scanDataD:
            ssTup = scanDataD[cId]
            dD = ssTup.scanCategoryDict
            for catName in dD:
                if catName not in sD:
                    sD[catName] = {}
                for svTup in dD[catName]:
                    if svTup.atName not in sD[catName]:
                        sD[catName][svTup.atName] = {
                            "minWidth": svTup.minWidth,
                            "maxWidth": svTup.maxWidth,
                            "minPrec": svTup.minPrec,
                            "maxPrec": svTup.maxPrec,
                            "count": 1
                        }
                        continue
                    sD[catName][svTup.atName]["minWidth"] = min(
                        sD[catName][svTup.atName]["minWidth"], svTup.minWidth)
                    sD[catName][svTup.atName]["maxWidth"] = max(
                        sD[catName][svTup.atName]["maxWidth"], svTup.maxWidth)
                    sD[catName][svTup.atName]["minPrec"] = min(
                        sD[catName][svTup.atName]["minPrec"], svTup.minPrec)
                    sD[catName][svTup.atName]["maxPrec"] = max(
                        sD[catName][svTup.atName]["maxPrec"], svTup.maxPrec)
                    sD[catName][svTup.atName]["count"] += 1
        return sD

    def __evalScanDataCoverage(self, scanDataD):
        """
        ScanValue = collections.namedtuple('ScanValue', 'containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec')
        ScanSummary = collections.namedtuple('ScanSummary', 'containerId, fromPath, scanDate, scanCategoryDict')

        """

        # for populated sD[category] -> d[atName]->{count: #, instances: [id,id,id]}
        sD = {}
        for cId in scanDataD:
            ssTup = scanDataD[cId]
            dD = ssTup.scanCategoryDict
            for catName in dD:
                if catName not in sD:
                    sD[catName] = {}
                for svTup in dD[catName]:
                    if svTup.atName not in sD[catName]:
                        sD[catName][svTup.atName] = {
                            "count": 0,
                            "instances": []
                        }
                    sD[catName][svTup.atName]["instances"].append(
                        svTup.containerId)
                    sD[catName][svTup.atName]["count"] += 1
        cL = []
        for catName, aD in sD.items():
            for atName, tD in aD.items():
                cL.append("%s\t%s" %
                          ("_" + catName + "." + atName, tD["count"]))
        return sD, cL
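
    # Example (hypothetical): shapes of the coverage results returned above -
    #
    #   sD["cell"]["length_a"] == {"count": 2, "instances": ["1ABC", "2XYZ"]}
    #   cL contains lines such as "_cell.length_a\t2"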

    def scanWorker(self, dataList, procName, optionsD, workingDir):
        """Multi-proc worker method for scanning repository data files-"""
        try:
            _ = workingDir
            startTime = self.__begin(message=procName)
            # Recover common options

            scanType = optionsD["scanType"]
            contentType = optionsD["contentType"]
            #
            successList = []
            retList = []

            containerList = self.__getContainerList(dataList)
            for container in containerList:
                ret = self.__scanContainer(container)
                successList.append(ret.fromPath)
                retList.append(ret)
            #

            logger.debug(
                "%s scanType %s contentType %s pathlist length %d containerList length %d",
                procName, scanType, contentType, len(dataList),
                len(containerList))

            ok = len(successList) == len(dataList)
            #
            self.__end(startTime, procName + " with status " + str(ok))
            return successList, retList, []

        except Exception as e:
            logger.error("Failing with dataList %r", dataList)
            logger.exception("Failing with %s", str(e))

        return [], [], []

    def __getContainerList(self, locatorObjList):
        """Build containers for the input locator objects and append a load status category to each."""
        utcnow = datetime.datetime.utcnow()
        ts = utcnow.strftime("%Y-%m-%d:%H:%M:%S")
        cL = []
        for loc in locatorObjList:
            myContainerList = self.__rpP.getContainerList([loc])
            lPathL = self.__rpP.getLocatorPaths([loc])
            for cA in myContainerList:
                dc = DataCategory("rcsb_load_status",
                                  ["name", "load_date", "locator"],
                                  [[cA.getName(), ts, lPathL[0]]])
                logger.debug("data category %r", dc)
                cA.append(dc)
                cL.append(cA)
        return cL

    def __scanContainer(self, container):
        """Scan the input container and summarize the width and precision of
        populated values for each category and attribute.
        """
        cName = container.getName()
        loadStatusObj = container.getObj("rcsb_load_status")
        lName = loadStatusObj.getValue(attributeName="name", rowIndex=0)
        lFilePath = loadStatusObj.getValue(attributeName="locator", rowIndex=0)
        lDate = loadStatusObj.getValue(attributeName="load_date", rowIndex=0)
        #
        oD = {}
        for objName in container.getObjNameList():
            if objName == "rcsb_load_status":
                continue
            obj = container.getObj(objName)
            afD = self.__attributeDataTypeD[objName] if objName in self.__attributeDataTypeD else {}
            atNameList = obj.getAttributeList()
            wMin = {atName: 100000 for atName in atNameList}
            wMax = {atName: -1 for atName in atNameList}
            pMin = {atName: 100000 for atName in atNameList}
            pMax = {atName: -1 for atName in atNameList}
            for row in obj.getRowList():
                for ii, val in enumerate(row):
                    valLen = len(val)
                    if (valLen == 0) or (val == "?") or (val == "."):
                        continue
                    atName = atNameList[ii]
                    wMin[atName] = min(wMin[atName], valLen)
                    wMax[atName] = max(wMax[atName], valLen)
                    if atName in afD and afD[atName] == "float":
                        vPrec = 0
                        try:
                            fields = val.split(".")
                            vPrec = len(fields[1])
                            pMin[atName] = min(pMin[atName], vPrec)
                            pMax[atName] = max(pMax[atName], vPrec)
                        except Exception as e:
                            logger.debug("Failed to process float %s %r %r %s",
                                         atName, val, vPrec, str(e))
                            pMin[atName] = 0
                            pMax[atName] = 0
                        logger.debug("Got float for %s %r %r", atName, val,
                                     vPrec)
                    else:
                        pMin[atName] = 0
                        pMax[atName] = 0

            # ScanValue - containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec
            oD[objName] = [
                ScanValue(cName, objName, atN, wMin[atN], wMax[atN], pMin[atN],
                          pMax[atN]) for atN in wMax if wMax[atN] != -1
            ]
        # ScanSummary containerId, fromPath, scanCategoryDict
        #
        ret = ScanSummary(lName, lFilePath, lDate, oD)
        #
        return ret
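
    # Illustrative example: for a float attribute value "1.234", valLen is 5, so
    # the ScanValue records width 5 and precision 3 (the length of the
    # fractional field "234").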

    def __begin(self, message=""):
        startTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", message, ts)
        return startTime

    def __end(self, startTime, message=""):
        endTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        delta = endTime - startTime
        logger.debug("Completed %s at %s (%.4f seconds)", message, ts, delta)