Example #1
def __getChemCompPathList(self, topRepoPath, numProc=8):
     """Get the path list for the chemical component definition repository
     """
     ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
     logger.debug("Starting at %s", ts)
     startTime = time.time()
     pathList = []
     try:
         dataS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
         dataList = list(dataS)
         optD = {}
         optD["topRepoPath"] = topRepoPath
         mpu = MultiProcUtil(verbose=self.__verbose)
         mpu.setOptions(optionsD=optD)
         mpu.set(workerObj=self, workerMethod="_chemCompPathWorker")
         _, _, retLists, _ = mpu.runMulti(dataList=dataList,
                                          numProc=numProc,
                                          numResults=1)
         pathList = retLists[0]
         endTime0 = time.time()
         logger.debug("Path list length %d  in %.4f seconds", len(pathList),
                      endTime0 - startTime)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return self.__applyFileLimit(pathList)
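All of these examples register a worker callback through mpu.set(workerObj=..., workerMethod=...). Judging from the single-process fallback in Example #5 below (rWorker.updateList(idList, "SingleProc", optD, None)), the worker receives its chunk of the data list, a process name, the options dictionary set via setOptions(), and a working directory, and returns a success list, one return list per requested result (numResults), and a diagnostics list. A minimal sketch of a hypothetical worker matching the call above follows; the method name comes from the example, while the body and the os/glob usage are illustrative assumptions:

def _chemCompPathWorker(self, dataList, procName, optionsD, workingDir):
    # dataList: the chunk of one-character prefixes assigned to this process
    # optionsD: values supplied via mpu.setOptions() (here, "topRepoPath")
    topRepoPath = optionsD["topRepoPath"]
    successList = []
    retList = []
    diagList = []
    for prefix in dataList:
        # hypothetical: gather definition file paths filed under this prefix
        retList.extend(glob.glob(os.path.join(topRepoPath, prefix, "*.cif")))
        successList.append(prefix)
    # success list, one return list (numResults=1), diagnostics list
    return successList, retList, diagList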
Example #2
    def fetchMatchedDataMp(self, numProc=6, chunkSize=5, useCache=True):
        rcD = {}
        cD = self.__getSearchResults()
        idList = list(cD.keys())
        # ---
        mpu = MultiProcUtil(verbose=True)
        mpu.setWorkingDir(self.__cachePath)
        mpu.setOptions(optionsD={
            "resultPath": self.__cachePath,
            "cD": cD,
            "useCache": useCache
        })
        mpu.set(workerObj=self, workerMethod="fetchDataWorker")

        ok, failList, resultList, _ = mpu.runMulti(dataList=idList,
                                                   numProc=numProc,
                                                   numResults=1,
                                                   chunkSize=chunkSize)
        logger.info("Run ended with status %r success count %d failures %r",
                    ok, len(resultList[0]), len(failList))
        for rTup in resultList[0]:
            rcD[rTup[0]] = rTup[1]
        # ---
        ok = self.storeResultIndex(rcD)
        logger.info("Final match result (w/sdf and metadata) (%d/%d)",
                    len(rcD), len(cD))
        return ok
Example #3
    def __descriptorQueryMulti(self, queryPairList, queryType, matchOpts, numProc=2, chunkSize=50):
        """Internal method to invoke descriptor query in multiprocess mode

        Args:
            queryPairList (list): [(identifier, descriptor), ... ]
            queryType (str): SMILES|InChI
            matchOpts (str): match criteria (e.g., graph-strict)
            numProc (int, optional): number of multiprocess cores. Defaults to 2.
            chunkSize (int, optional): multiprocess batch size. Defaults to 50.

        Returns:
            (list): [BatchResults(), ...]
        """
        logger.info("Input %r query length %d using %s numProc %d", queryType, len(queryPairList), matchOpts, numProc)
        rWorker = BatchChemSearchWorker(self.__ccsw)
        mpu = MultiProcUtil(verbose=True)
        optD = {"matchOpts": matchOpts, "queryType": queryType}
        mpu.setOptions(optD)
        mpu.set(workerObj=rWorker, workerMethod="searchDescriptorList")
        ok, failList, resultList, _ = mpu.runMulti(dataList=queryPairList, numProc=numProc, numResults=1, chunkSize=chunkSize)
        if failList:
            logger.debug("Search completed with failures (%d)", len(failList))
        logger.info("Multi-proc status %r failures %r result length %r", ok, len(failList), len(resultList[0]))
        #
        rL = []
        for tup in resultList[0]:
            for mr in tup[3]:
                rL.append(BatchResults(queryId=tup[0], query=tup[1], queryType=tup[2], matchOpts=matchOpts, ccId=mr.ccId, fpScore=mr.fpScore))
        #
        logger.info("Multi-proc status %r failures %r result length %r", ok, len(failList), len(rL))
        #
        return rL
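A hypothetical invocation sketch from inside the owning class, showing the expected (identifier, descriptor) input pairs and the BatchResults fields consumed above (the identifiers and SMILES are illustrative):

    queryPairList = [
        ("BNZ", "c1ccccc1"),  # benzene, as an illustrative SMILES descriptor
        ("MOH", "CO"),        # methanol
    ]
    brL = self.__descriptorQueryMulti(queryPairList, queryType="SMILES", matchOpts="graph-strict", numProc=2, chunkSize=50)
    for br in brL:
        logger.info("query %s matched %s score %r", br.queryId, br.ccId, br.fpScore)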
Example #4
    def runSearch(self,
                  molFilePathList,
                  resultPath,
                  searchType="similarity",
                  numProc=4,
                  chunkSize=10):
        """"""
        logger.info("Starting with molfile path list length %d",
                    len(molFilePathList))
        try:
            pU = CcdcSearchWorker(verbose=self.__verbose)
            mpu = MultiProcUtil(verbose=True)
            # mpu.setWorkingDir(resultPath)
            mpu.setOptions(optionsD={
                "resultPath": resultPath,
                "searchType": searchType
            })
            #
            mpu.set(workerObj=pU, workerMethod="search")

            ok, failList, resultList, _ = mpu.runMulti(
                dataList=molFilePathList,
                numProc=numProc,
                numResults=1,
                chunkSize=chunkSize)
            logger.info("run ended status %r success count %d failures %r", ok,
                        len(resultList[0]), len(failList))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
Example #5
    def load(self, idList, exportPath=None, numProc=1, chunkSize=5):
        """Fetch and load reference data for the input list of PubChem compound codes.

        Args:
            idList (list): PubChem ID codes
            exportPath (str, optional): store raw fetched data in this path. Defaults to None.
            numProc (int, optional): number of processes to use. Defaults to 1.
            chunkSize (int, optional): chunk size between data store updates. Defaults to 5.

        Returns:
            (bool, list): status flag, list of failed identifiers

        """
        logger.info("Length starting list is %d", len(idList))
        optD = {"chunkSize": chunkSize, "exportPath": exportPath}
        rWorker = PubChemDataUpdateWorker(self.__cfgOb)
        if numProc > 1:
            mpu = MultiProcUtil(verbose=True)
            mpu.setOptions(optD)
            mpu.set(workerObj=rWorker, workerMethod="updateList")
            ok, failList, resultList, _ = mpu.runMulti(dataList=idList,
                                                       numProc=numProc,
                                                       numResults=2,
                                                       chunkSize=chunkSize)
            logger.info("Multi-proc %r failures %r result lengths %r %r", ok,
                        len(failList), len(resultList[0]), len(resultList[1]))
        else:
            successList, _, _, _ = rWorker.updateList(idList, "SingleProc",
                                                      optD, None)
            failList = list(set(idList) - set(successList))
            ok = len(failList) == 0
            logger.info("Single-proc status %r failures %r", ok, len(failList))
        #
        return ok, failList
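A hypothetical invocation sketch (loader stands in for an instance of the owning class; the PubChem CIDs are illustrative):

    idList = ["2244", "962"]  # illustrative PubChem CIDs
    ok, failList = loader.load(idList, exportPath="./pubchem-raw", numProc=2, chunkSize=5)
    if not ok:
        logger.warning("Failed PubChem ids: %r", failList)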
Example #6
 def __getEntryPathList(self, topRepoPath, numProc=8):
     """Get the path list for structure entries in the input repository
     """
     ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
     logger.debug("Starting at %s", ts)
     startTime = time.time()
     pathList = []
     try:
         dataList = []
         anL = "abcdefghijklmnopqrstuvwxyz0123456789"
         for a1 in anL:
             for a2 in anL:
                 hc = a1 + a2
                 dataList.append(hc)
                 hc = a2 + a1
                 dataList.append(hc)
         dataList = list(set(dataList))
         #
         optD = {}
         optD["topRepoPath"] = topRepoPath
         mpu = MultiProcUtil(verbose=self.__verbose)
         mpu.setOptions(optionsD=optD)
         mpu.set(workerObj=self, workerMethod="_entryPathWorker")
         _, _, retLists, _ = mpu.runMulti(dataList=dataList,
                                          numProc=numProc,
                                          numResults=1)
         pathList = retLists[0]
         endTime0 = time.time()
         logger.debug("Path list length %d  in %.4f seconds", len(pathList),
                      endTime0 - startTime)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return self.__applyFileLimit(pathList)
Example #7
    def __testLoadFilesMulti(self, contentType):
        """Test case - create load w/insert-many all chemical component definition data files - (multiproc test)"""
        numProc = self.__numProc
        chunkSize = self.__chunkSize
        try:
            #
            sd, _, _, _ = self.__schP.getSchemaInfo(contentType)
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)

            optD = {}
            optD["sd"] = sd
            if contentType == "pdbx":
                optD["skip"] = self.__tableIdSkipD
            else:
                optD["skip"] = {}

            #
            pathList = self.__getPathList(fType=contentType)
            logger.debug("Input path list %r", pathList)
            mpu = MultiProcUtil(verbose=True)
            mpu.setOptions(optionsD=optD)
            mpu.set(workerObj=self, workerMethod="loadInsertMany")
            ok, _, _, _ = mpu.runMulti(dataList=pathList,
                                       numProc=numProc,
                                       numResults=1,
                                       chunkSize=chunkSize)
            self.assertEqual(ok, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Example #8
    def testLogStringStreamMultiProc(self):
        """Test case -   context manager - to custom string stream

        Problems with this test during coverage tests-
        """
        try:

            #
            myLen = self.__logRecordMax
            dataList = list(range(1, myLen + 1))
            logger.debug("dataList %d:  %r", len(dataList), dataList)
            #
            slogger = logging.getLogger()
            slogger.propagate = False
            for handler in slogger.handlers:
                slogger.removeHandler(handler)
            #
            stream = StringIO()
            sh = logging.StreamHandler(stream=stream)
            sh.setLevel(logging.DEBUG)
            fmt = logging.Formatter("STRING-%(processName)s: %(message)s")
            sh.setFormatter(fmt)
            slogger.addHandler(sh)
            #
            logger.debug("Starting string stream logging(root)")
            #
            with MultiProcLogging(logger=slogger,
                                  fmt=self.__mpFormat,
                                  level=logging.DEBUG):
                numProc = 2
                chunkSize = 0
                optD = {}
                mpu = MultiProcUtil(verbose=True)
                mpu.setOptions(optionsD=optD)
                mpu.set(workerObj=self, workerMethod="workerOne")
                ok, failList, _, _ = mpu.runMulti(dataList=dataList,
                                                  numProc=numProc,
                                                  numResults=1,
                                                  chunkSize=chunkSize)
                self.assertEqual(len(failList), 0)
                self.assertTrue(ok)
                #

            #
            sh.flush()
            slogger.removeHandler(sh)
            #
            stream.seek(0)
            logLines = stream.readlines()
            logger.debug(">> dataList %d:  %r", len(logLines), logLines)
            # self.assertGreaterEqual(len(logLines), myLen)
            # Temporary tweak
            self.assertGreaterEqual(len(logLines), int(myLen / 2))
            # for line in logLines:
            #    self.assertIn("context logging record", line)
        except Exception as e:
            logger.exception("context logging record %s", str(e))
            self.fail()
Example #9
    def __calculateNeighbors(self,
                             distLimit=5.0,
                             numProc=2,
                             chunkSize=10,
                             updateOnly=False):
        """Calculate non-polymer target interactions for all repository structure files.

        Args:
            distLimit (float, optional): interaction distance limit. Defaults to 5.0.
            numProc (int, optional): number of processes to use. Defaults to 2.
            chunkSize (int, optional): incremental chunk size used to distribute work to processes. Defaults to 10.
            updateOnly (bool, optional): reuse existing cached results and only process new entries. Defaults to False.

        Returns:
            (dict): {entryId: {asymId: [TargetLigandInteraction()], ...}, ...}
        """
        contentType = "pdbx"
        mergeContent = None
        rD = {}
        exD = {}
        #
        # updateOnly - will reuse any existing data loaded when this is instantiated
        #              otherwise the cache context is cleared before the calculation.
        if updateOnly:
            exD = {k: True for k in self.getEntries()}
            rD = self.__neighborD.get("entries", {})
        #
        locatorObjList = self.__rpP.getLocatorObjList(
            contentType=contentType,
            mergeContentTypes=mergeContent,
            excludeIds=exD)
        logger.info("Starting with %d numProc %d updateOnly (%r)",
                    len(locatorObjList), self.__numProc, updateOnly)
        #
        rWorker = TargetInteractionWorker(self.__rpP)
        mpu = MultiProcUtil(verbose=True)
        optD = {"distLimit": distLimit}
        mpu.setOptions(optD)
        mpu.set(workerObj=rWorker, workerMethod="build")
        ok, failList, resultList, _ = mpu.runMulti(dataList=locatorObjList,
                                                   numProc=numProc,
                                                   numResults=1,
                                                   chunkSize=chunkSize)
        if failList:
            logger.info("Target interaction build failures (%d): %r",
                        len(failList), failList)
        #
        for (entryId, nD) in resultList[0]:
            rD[entryId] = nD
        #
        logger.info(
            "Completed with multi-proc status %r failures %r total entries with data (%d)",
            ok, len(failList), len(rD))
        return rD
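A sketch of how the returned mapping could be consumed from inside the owning class, following the documented {entryId: {asymId: [TargetLigandInteraction(), ...]}} shape:

    rD = self.__calculateNeighbors(distLimit=5.0, numProc=2, chunkSize=10, updateOnly=False)
    for entryId, asymD in rD.items():
        for asymId, interactionL in asymD.items():
            logger.debug("%s %s interaction count %d", entryId, asymId, len(interactionL))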
Example #10
 def __updateReferenceData(self, idList):
     numProc = self.__numProc
     chunkSize = self.__maxChunkSize
     logger.info("Length starting list is %d", len(idList))
     optD = {"maxChunkSize": chunkSize}
     rWorker = ReferenceUpdateWorker(self.__cfgOb)
     mpu = MultiProcUtil(verbose=True)
     mpu.setOptions(optD)
     mpu.set(workerObj=rWorker, workerMethod="updateList")
     ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=2, chunkSize=chunkSize)
     logger.info("Multi-proc %r failures %r result lengths %r %r", ok, len(failList), len(resultList[0]), len(resultList[1]))
     return ok, failList
Example #11
    def testLogFileHandlerMultiProc(self):
        """Test case -  context manager - to string stream and custom file stream"""
        try:

            #
            myLen = self.__logRecordMax
            dataList = list(range(1, myLen + 1))
            logger.debug("dataList %d:  %r", len(dataList), dataList)
            #
            # For multiprocessing start with the root logger ...
            flogger = logging.getLogger()
            for handler in flogger.handlers:
                flogger.removeHandler(handler)
            fh = logging.FileHandler(self.__testLogPath,
                                     mode="w",
                                     encoding="utf-8")
            fh.setLevel(logging.DEBUG)
            fmt = logging.Formatter("FILE-%(processName)s: %(message)s")
            fh.setFormatter(fmt)
            flogger.addHandler(fh)
            #
            #
            with MultiProcLogging(logger=flogger,
                                  fmt=self.__mpFormat,
                                  level=logging.DEBUG):
                numProc = 2
                chunkSize = 0
                optD = {}
                mpu = MultiProcUtil(verbose=True)
                mpu.setOptions(optionsD=optD)
                mpu.set(workerObj=self, workerMethod="workerOne")
                ok, failList, _, _ = mpu.runMulti(dataList=dataList,
                                                  numProc=numProc,
                                                  numResults=1,
                                                  chunkSize=chunkSize)
                self.assertEqual(len(failList), 0)
                self.assertTrue(ok)
                #
            fh.close()
            flogger.removeHandler(fh)

            #
            logLines = []
            with open(self.__testLogPath, "r", encoding="utf-8") as ifh:
                for line in ifh:
                    logLines.append(line)
            self.assertGreaterEqual(len(logLines), myLen)
            # for line in logLines:
            #    self.assertIn("context logging record", line)
        except Exception as e:
            logger.exception("context logging record %s", str(e))
            self.fail()
Example #12
    def build(self, alignType="relaxed-stereo", numProc=4, chunkSize=10, verbose=False):
        """Run the model build step in the chemical component model workflow.

        Args:
          alignType (str, optional): "relaxed"|"strict"|"relaxed-stereo". Defaults to "relaxed-stereo".
          numProc (int, optional): number of processes to invoke. Defaults to 4.
          chunkSize (int, optional): work chunksize. Defaults to 10.
          verbose (bool, optional): verbose logging.  Defaults to False.

        Returns:
            (dict): {searchId: [{"targetId": , "modelId": , "modelPath": , "matchId": , "parentId": , "rFactor": }, ...], ...}

        """
        retD = {}
        try:
            ccms = ChemCompModelSearch(self.__cachePath, None, None, prefix=self.__prefix)
            modelDirPath = self.getModelDirFilePath()
            imageDirPath = self.getModelImageDirFilePath()
            #
            idxPathD = ccms.getResultIndex()
            idxPathL = list(idxPathD.values())
            pD = {}
            for sId in idxPathD:
                parentId = sId.split("|")[0]
                pD.setdefault(parentId, []).append(sId)
            logger.info("Using search result index length ridxD (%d) parent coverage (%d)", len(idxPathD), len(pD))
            #
            pU = ChemCompModelBuildWorker(self.__cachePath, verbose=verbose)
            mpu = MultiProcUtil(verbose=True)
            mpu.setWorkingDir(modelDirPath)
            mpu.setOptions(optionsD={"modelDirPath": modelDirPath, "imageDirPath": imageDirPath, "alignType": alignType, "ccSIdxP": self.__ccSIdxP})
            #
            mpu.set(workerObj=pU, workerMethod="build")

            ok, failList, resultList, _ = mpu.runMulti(dataList=idxPathL, numProc=numProc, numResults=1, chunkSize=chunkSize)
            logger.info("Run ended with status %r success count %d failures %r", ok, len(resultList[0]), len(failList))
            successList = copy.copy(resultList[0])
            for tD in successList:
                retD.setdefault(tD["parentId"], []).append(tD)
            #
            if retD:
                logger.info("Completed build with models for %d parent chemical definitions", len(retD))
            else:
                logger.info("No models built")
            ok = self.storeModelIndex(retD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return retD
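A hypothetical usage sketch (mb stands in for an instance of the model-build class), iterating the documented return shape:

    retD = mb.build(alignType="relaxed-stereo", numProc=4, chunkSize=10)
    for parentId, modelDL in retD.items():
        for tD in modelDL:
            logger.info("%s model %s rFactor %r", parentId, tD["modelId"], tD["rFactor"])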
Example #13
    def runSearch(self,
                  molFilePathList,
                  resultPath,
                  searchType="similarity",
                  numProc=4,
                  chunkSize=10,
                  timeOut=120):
        """Run CCDC search in multiprocess mode.

        Args:
            molFilePathList (list): input mol2/sdf path list to search
            resultPath (str): directory path to store results
            searchType (str, optional): search type (substructure|similarity). Defaults to "similarity".
            numProc (int, optional): number of processes to invoke. Defaults to 4.
            chunkSize (int, optional): work chunksize. Defaults to 10.
            timeOut (int, optional): search timeout in seconds. Defaults to 120.
        """
        logger.info("Starting with molfile path list length %d",
                    len(molFilePathList))
        successList = []
        try:
            pU = CcdcSearchExecWorker(verbose=self.__verbose)
            mpu = MultiProcUtil(verbose=True)
            mpu.setWorkingDir(resultPath)
            mpu.setOptions(
                optionsD={
                    "resultPath": resultPath,
                    "searchType": searchType,
                    "pythonRootPath": self.__pythonRootPath,
                    "csdHome": self.__csdHome,
                    "timeOut": timeOut
                })
            #
            mpu.set(workerObj=pU, workerMethod="search")

            ok, failList, resultList, _ = mpu.runMulti(
                dataList=molFilePathList,
                numProc=numProc,
                numResults=1,
                chunkSize=chunkSize)
            logger.info(
                "Run ended with status %r success count %d failures %r", ok,
                len(resultList[0]), len(failList))
            successList = copy.copy(resultList[0])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return successList
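A hypothetical invocation sketch (searcher stands in for an instance of the owning class; the input paths are illustrative):

    molFilePathList = ["./queries/lig-1.sdf", "./queries/lig-2.mol2"]
    hitL = searcher.runSearch(molFilePathList, "./ccdc-results", searchType="substructure", numProc=4, chunkSize=10, timeOut=120)
    logger.info("CCDC search succeeded for %d molfiles", len(hitL))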
Example #14
 def __getActivityMulti(self, idList, atL, maxActivity=None, numProc=2, chunkSize=5):
     """ """
     rD = {}
     ctaW = ChEMBLTargetActivityWorker()
     mpu = MultiProcUtil(verbose=True)
     optD = {"attributeList": atL, "chunkSize": chunkSize, "maxActivity": maxActivity}
     mpu.setOptions(optD)
     mpu.set(workerObj=ctaW, workerMethod="fetchActivity")
     ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=1, chunkSize=chunkSize)
     if failList:
         logger.info("Target Id activity failures (%d): %r", len(failList), failList)
     #
     for (targetId, actD) in resultList[0]:
         rD.setdefault(targetId, []).append(actD)
     #
     logger.info("Completed with multi-proc status %r failures %r total targets with data (%d)", ok, len(failList), len(rD))
     return rD
Example #15
 def __buildChemCompSearchIndexMulti(self, ccObjD, descrD, limitPerceptions=False, molLimit=None, numProc=2, maxChunkSize=20, quietFlag=False):
     #
     ccIdList = sorted(ccObjD.keys())[:molLimit] if molLimit else sorted(ccObjD.keys())
     logger.info("Input definition length %d numProc %d limitPerceptions %r", len(ccIdList), numProc, limitPerceptions)
     #
     rWorker = ChemCompSearchIndexWorker(ccObjD)
     # mpu = MultiProcPoolUtil(verbose=True)
     mpu = MultiProcUtil(verbose=True)
     optD = {"maxChunkSize": maxChunkSize, "limitPerceptions": limitPerceptions, "quietFlag": quietFlag, "descrD": descrD}
     mpu.setOptions(optD)
     mpu.set(workerObj=rWorker, workerMethod="buildRelatedList")
     ok, failList, resultList, _ = mpu.runMulti(dataList=ccIdList, numProc=numProc, numResults=1, chunkSize=maxChunkSize)
     if failList:
         logger.info("Index definitions with failures (%d): %r", len(failList), failList)
     logger.info("Multi-proc status %r failures %r result length %r", ok, len(failList), len(resultList[0]))
     # JDW
     rD = {vD["name"]: vD for vD in resultList[0]}
     return rD
Example #16
    def __searchSubStructureMulti(self,
                                  oeQueryMol,
                                  idxList,
                                  matchOpts="graph-relaxed",
                                  numProc=2,
                                  maxChunkSize=10):
        #
        hL = []
        startTime = time.time()
        try:
            searchType = "exhaustive-substructure"
            if idxList:
                searchType = "prefilterd-substructure"
            idxList = idxList if idxList else list(
                range(self.__oeMolDb.GetMaxMolIdx()))
            #

            rWorker = OeSubStructSearchWorker(oeQueryMol,
                                              self.__oeMolDb,
                                              matchOpts=matchOpts)
            mpu = MultiProcUtil(verbose=True)
            optD = {"maxChunkSize": maxChunkSize}
            mpu.setOptions(optD)
            mpu.set(workerObj=rWorker, workerMethod="subStructureSearch")
            _, _, resultList, _ = mpu.runMulti(dataList=idxList,
                                               numProc=numProc,
                                               numResults=2,
                                               chunkSize=maxChunkSize)
            logger.debug("Multi-proc result length %d/%d", len(resultList[0]),
                         len(resultList[1]))
            for idx, score in resultList[1]:
                ccId = self.__oeMolDb.GetTitle(idx)
                hL.append(
                    MatchResults(ccId=ccId,
                                 searchType=searchType,
                                 matchOpts=matchOpts,
                                 fpScore=score))
            retStatus = True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            retStatus = False
        logger.info("Substructure search returns %d (%.4f seconds)", len(hL),
                    time.time() - startTime)
        return retStatus, hL
Example #17
    def __updateReferenceData(self, idList, searchIdxD, **kwargs):
        """Launch worker methods to update chemical reference data correspondences.

        Args:
            idList (list): list of local chemical identifiers (ChemIdentifier())

        Returns:
            (bool, list): status flag, list of unmatched identifiers
        """
        numProc = 1
        chunkSize = 50
        exportPath = kwargs.get("exportPath", None)
        logger.info("Length starting list is %d", len(idList))
        optD = {
            "chunkSize": chunkSize,
            "exportPath": exportPath,
            "matchIdOnly": True
        }
        rWorker = PubChemUpdateWorker(self.__cfgOb, searchIdxD)
        if numProc > 1:
            mpu = MultiProcUtil(verbose=True)
            mpu.setOptions(optD)
            mpu.set(workerObj=rWorker, workerMethod="updateList")
            ok, failList, resultList, _ = mpu.runMulti(dataList=idList,
                                                       numProc=numProc,
                                                       numResults=2,
                                                       chunkSize=chunkSize)
            logger.info("Multi-proc %r failures %r result lengths %r %r", ok,
                        len(failList), len(resultList[0]), len(resultList[1]))
        else:
            successList, _, _, _ = rWorker.updateList(idList, "SingleProc",
                                                      optD, self.__dirPath)
            failList = list(set(idList) - set(successList))
            ok = len(failList) == 0
            logger.info("Single-proc status %r failures %r", ok, len(failList))
        #
        return ok, failList
Example #18
    def scanContentType(self,
                        contentType,
                        mergeContentTypes=None,
                        scanType="full",
                        inputPathList=None,
                        scanDataFilePath=None,
                        failedFilePath=None,
                        saveInputFileListPath=None):
        """Driver method for repository scan operation

        Args:
            contentType (str):  one of 'bird', 'bird_family', 'bird_chem_comp', 'chem_comp', 'pdbx'
            scanType (str, optional): 'full' [or 'incr' to be supported]
            inputPathList (list, optional):  list of input file paths to scan
            scanDataFilePath (str, optional): file path for serialized scan data (Pickle format)
            failedFilePath (str, optional): file path for list of files that fail scanning operation
            saveInputFileListPath (str, optional): path to store the file path list that is scanned

        Returns:
            bool: True for success or False otherwise

        """
        try:
            startTime = self.__begin(message="scanning operation")
            #
            locatorObjList = self.__rpP.getLocatorObjList(
                contentType=contentType,
                inputPathList=inputPathList,
                mergeContentTypes=mergeContentTypes)
            #
            if saveInputFileListPath:
                self.__mU.doExport(saveInputFileListPath,
                                   self.__rpP.getLocatorPaths(locatorObjList),
                                   fmt="list")
                logger.debug("Saving %d paths in %s", len(locatorObjList),
                             saveInputFileListPath)
            #
            optD = {}
            optD["contentType"] = contentType
            optD["logSize"] = True
            optD["scanType"] = scanType
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            #
            numProc = self.__numProc
            chunkSize = self.__chunkSize if locatorObjList and self.__chunkSize < len(locatorObjList) else 0
            #
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            numPaths = len(locatorObjList)
            logger.debug("Processing %d total paths", numPaths)
            numProc = min(numProc, numPaths)
            maxStepLength = self.__maxStepLength
            if numPaths > maxStepLength:
                numLists = int(numPaths / maxStepLength)
                subLists = [
                    locatorObjList[i::numLists] for i in range(numLists)
                ]
            else:
                subLists = [locatorObjList]
            #
            if subLists:
                logger.debug(
                    "Starting with numProc %d outer subtask count %d subtask length ~ %d",
                    numProc, len(subLists), len(subLists[0]))
            #
            numResults = 1
            failList = []
            retLists = [[] for ii in range(numResults)]
            diagList = []
            for ii, subList in enumerate(subLists):
                logger.debug("Running outer subtask %d or %d length %d",
                             ii + 1, len(subLists), len(subList))
                #
                mpu = MultiProcUtil(verbose=True)
                mpu.setOptions(optionsD=optD)
                mpu.set(workerObj=self, workerMethod="scanWorker")
                ok, failListT, retListsT, diagListT = mpu.runMulti(
                    dataList=subList,
                    numProc=numProc,
                    numResults=numResults,
                    chunkSize=chunkSize)
                failList.extend(failListT)
                # retLists is a list of lists -
                for jj in range(numResults):
                    retLists[jj].extend(retListsT[jj])
                diagList.extend(diagListT)
            logger.debug("Scan failed path list %r", failList)
            logger.debug(
                "Scan path list success length %d load list failed length %d",
                len(locatorObjList), len(failList))
            logger.debug("Returned metadata length %r", len(retLists[0]))
            #
            if failedFilePath and failList:
                wOk = self.__mU.doExport(failedFilePath,
                                         self.__rpP.getLocatorPaths(failList),
                                         fmt="list")
                logger.debug("Writing scan failure path list to %s status %r",
                             failedFilePath, wOk)
            #
            if scanType == "incr":
                scanDataD = self.__mU.doImport(scanDataFilePath,
                                               fmt="pickle",
                                               default=None)
                logger.debug("Imported scan data with keys %r",
                             list(scanDataD.keys()))
            else:
                scanDataD = {}
            #
            if scanDataFilePath and retLists[0]:
                for ssTup in retLists[0]:
                    cId = ssTup.containerId
                    if scanType == "full" and cId in scanDataD:
                        logger.error("Duplicate container id %s in %r and %r",
                                     cId, ssTup.fromPath,
                                     scanDataD[cId].fromPath)
                    #
                    scanDataD[cId] = ssTup

                ok = self.__mU.doExport(scanDataFilePath,
                                        scanDataD,
                                        fmt="pickle")
                tscanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
                ok = tscanDataD == scanDataD

            self.__end(startTime, "scanning operation with status " + str(ok))

            #
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return False
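A hypothetical driver call sketch (scanner stands in for an instance of the owning class; the file paths are illustrative):

    ok = scanner.scanContentType(
        "chem_comp",
        scanType="full",
        scanDataFilePath="./scan-data.pic",
        failedFilePath="./scan-failures.list",
        saveInputFileListPath="./scanned-paths.list",
    )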
Example #19
    def load(self,
             databaseName,
             collectionName,
             loadType="full",
             documentList=None,
             indexAttributeList=None,
             keyNames=None,
             schemaLevel="full",
             addValues=None):
        """Driver method for loading MongoDb content -


        loadType:     "full" or "replace"

        """
        try:
            startTime = self.__begin(message="loading operation")
            #

            #
            optionsD = {}
            optionsD["collectionName"] = collectionName
            optionsD["databaseName"] = databaseName
            optionsD["readBackCheck"] = self.__readBackCheck
            optionsD["loadType"] = loadType
            optionsD["keyNames"] = keyNames
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            #
            docList = documentList[:self.__documentLimit] if self.__documentLimit else documentList
            logger.debug("Full document list length %d limit %r",
                         len(documentList), self.__documentLimit)
            numProc = self.__numProc
            chunkSize = self.__chunkSize if docList and self.__chunkSize < len(docList) else 0
            #
            if addValues:
                try:
                    for doc in docList:
                        for k, v in addValues.items():
                            doc[k] = v
                except Exception as e:
                    logger.error("Add values %r fails with %s", addValues,
                                 str(e))

            #
            indAtList = indexAttributeList if indexAttributeList else []
            bsonSchema = None
            if schemaLevel and schemaLevel in ["min", "full"]:
                bsonSchema = self.__schP.getJsonSchema(databaseName,
                                                       collectionName,
                                                       encodingType="BSON",
                                                       level=schemaLevel)
                logger.debug("Using schema validation for %r %r %r",
                             databaseName, collectionName, schemaLevel)

            if loadType == "full":
                self.__removeCollection(databaseName, collectionName)
                ok = self.__createCollection(databaseName,
                                             collectionName,
                                             indAtList,
                                             bsonSchema=bsonSchema)
                logger.info("Collection %s create status %r", collectionName,
                            ok)
            elif loadType == "append":
                # create only if object does not exist -
                ok = self.__createCollection(databaseName,
                                             collectionName,
                                             indexAttributeNames=indAtList,
                                             checkExists=True,
                                             bsonSchema=bsonSchema)
                logger.debug("Collection %s create status %r", collectionName,
                             ok)
                # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            numDocs = len(docList)
            logger.debug("Processing %d total documents", numDocs)
            numProc = min(numProc, numDocs)
            maxStepLength = self.__maxStepLength
            if numDocs > maxStepLength:
                numLists = int(numDocs / maxStepLength)
                subLists = [docList[i::numLists] for i in range(numLists)]
            else:
                subLists = [docList]
            #
            if subLists:
                logger.debug(
                    "Starting with numProc %d outer subtask count %d subtask length ~ %d",
                    numProc, len(subLists), len(subLists[0]))
            #
            failList = []
            for ii, subList in enumerate(subLists):
                logger.debug("Running outer subtask %d of %d length %d",
                             ii + 1, len(subLists), len(subList))
                #
                mpu = MultiProcUtil(verbose=True)
                mpu.setOptions(optionsD=optionsD)
                mpu.set(workerObj=self, workerMethod="loadWorker")
                ok, failListT, _, _ = mpu.runMulti(dataList=subList,
                                                   numProc=numProc,
                                                   numResults=1,
                                                   chunkSize=chunkSize)
                failList.extend(failListT)
            logger.debug("Completed load with failing document list %r",
                         failList)
            logger.debug("Document list length %d failed load list length %d",
                         len(docList), len(failList))
            #

            self.__end(startTime, "loading operation with status " + str(ok))

            #
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return False
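A hypothetical invocation sketch (loader stands in for an instance of the owning class; database, collection, and index names are illustrative):

    ok = loader.load(
        "chem_comp_core",           # illustrative database name
        "chem_comp_core_entity",    # illustrative collection name
        loadType="full",
        documentList=docList,
        indexAttributeList=["rcsb_id"],
        keyNames=None,
        schemaLevel="full",
    )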
Example #20
    def build(self,
              alignType="relaxed-stereo",
              numProc=4,
              chunkSize=10,
              verbose=False,
              doFigures=True):
        """Run the model build step in the chemical component model workflow.

        Args:
          alignType (str, optional): "relaxed"|"strict"|"relaxed-stereo". Defaults to "relaxed-stereo".
          numProc (int, optional): number of processes to invoke. Defaults to 4.
          chunkSize (int, optional): work chunksize. Defaults to 10.
          verbose (bool, optional): verbose logging.  Defaults to False.
          doFigures (bool, optional): generate model images. Defaults to True.

        Returns:
            (dict): {searchId: [{"targetId": , "modelId": , "modelPath": , "matchId": , "parentId": , "rFactor": }, ...], ...}

        """
        retD = {}
        try:
            mU = MarshalUtil(workPath=self.__cachePath)
            ccms = CODModelSearch(self.__cachePath, prefix=self.__prefix)
            modelDirPath = self.getModelDirFilePath()
            imageDirPath = self.getModelImageDirFilePath()
            #
            tD = ccms.getResultIndex()
            # Make parent index ---
            idxIdD = {}
            for idxId, iDL in tD.items():
                pId = idxId.split("|")[0]
                idxIdD.setdefault(pId, []).extend(iDL)
            #
            idxIdL = list(idxIdD.keys())
            midxIdL = []
            for pId in idxIdL:
                fp = os.path.join(modelDirPath, pId, "model-index.json")
                if mU.exists(fp):
                    # Skip identifiers that already have a non-empty model index
                    fst = os.stat(fp)
                    if fst.st_size > 10:
                        continue
                midxIdL.append(pId)
            #
            logger.info(
                "Starting COD model build using (%d) from a total of results length (%d)",
                len(midxIdL), len(idxIdD))
            #
            cmbw = CODModelBuildWorker(self.__cachePath,
                                       verbose=verbose,
                                       timeOut=self.__timeOut)
            mpu = MultiProcUtil(verbose=True)
            mpu.setWorkingDir(modelDirPath)
            mpu.setOptions(
                optionsD={
                    "modelDirPath": modelDirPath,
                    "imageDirPath": imageDirPath,
                    "alignType": alignType,
                    "ccSIdxP": self.__ccSIdxP,
                    "idxIdD": idxIdD,
                    "oesmP": self.__oesmP,
                    "ccmP": self.__ccmP,
                    "doFigures": doFigures,
                })
            #
            mpu.set(workerObj=cmbw, workerMethod="build")
            ok, failList, resultList, _ = mpu.runMulti(dataList=midxIdL,
                                                       numProc=numProc,
                                                       numResults=1,
                                                       chunkSize=chunkSize)
            logger.info(
                "Run ended with status %r success count %d failures %r", ok,
                len(resultList[0]), len(failList))
            successList = copy.copy(resultList[0])
            #
            if successList:
                logger.info("Completed build with %d models ",
                            len(successList))
            else:
                logger.info("No models built")
            #
            # Build full index -
            #
            logger.info("Building full model index")
            for pId in idxIdL:
                fp = os.path.join(modelDirPath, pId, "model-index.json")
                if mU.exists(fp):
                    tDL = mU.doImport(fp, fmt="json")
                    for tD in tDL:
                        retD.setdefault(tD["parentId"], []).append(tD)
            #
            retD = dict(sorted(retD.items()))
            logger.info("Storing models for %d parent components", len(retD))
            ok = self.storeModelIndex(retD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return retD