def __getChemCompPathList(self, topRepoPath, numProc=8):
    """Get the path list for the chemical component definition repository"""
    ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
    logger.debug("Starting at %s", ts)
    startTime = time.time()
    pathList = []
    try:
        dataS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        dataList = [a for a in dataS]
        optD = {}
        optD["topRepoPath"] = topRepoPath
        mpu = MultiProcUtil(verbose=self.__verbose)
        mpu.setOptions(optionsD=optD)
        mpu.set(workerObj=self, workerMethod="_chemCompPathWorker")
        _, _, retLists, _ = mpu.runMulti(dataList=dataList, numProc=numProc, numResults=1)
        pathList = retLists[0]
        endTime0 = time.time()
        logger.debug("Path list length %d in %.4f seconds", len(pathList), endTime0 - startTime)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return self.__applyFileLimit(pathList)
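# ---
# The worker methods named above (e.g. "_chemCompPathWorker") are not shown in
# this section. Judging from the direct single-process call
# rWorker.updateList(idList, "SingleProc", optD, None) made elsewhere below, the
# MultiProcUtil worker contract appears to be a method of the form
# worker(dataList, procName, optionsD, workingDir) returning
# (successList, <one result list per numResults>, diagList). The following is a
# minimal runnable sketch of that contract with illustrative names; it is an
# inference, not the library's documented API.
from rcsb.utils.multiproc.MultiProcUtil import MultiProcUtil


class ToyPathWorker(object):
    def collect(self, dataList, procName, optionsD, workingDir):
        """Process one chunk of dataList; return (successList, resultList, diagList)."""
        _ = procName, workingDir
        prefix = optionsD.get("topRepoPath", "")
        resultList = ["%s/%s" % (prefix, d) for d in dataList]
        successList = list(dataList)  # inputs processed without error
        diagList = []  # optional per-failure diagnostics
        return successList, resultList, diagList


def demoWorkerContract():
    mpu = MultiProcUtil(verbose=True)
    mpu.setOptions(optionsD={"topRepoPath": "/data/repo"})
    mpu.set(workerObj=ToyPathWorker(), workerMethod="collect")
    ok, failList, retLists, _ = mpu.runMulti(dataList=list("ABC"), numProc=2, numResults=1)
    return ok, failList, retLists[0]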
def fetchMatchedDataMp(self, numProc=6, chunkSize=5, useCache=True):
    rcD = {}
    cD = self.__getSearchResults()
    idList = list(cD.keys())
    # ---
    mpu = MultiProcUtil(verbose=True)
    mpu.setWorkingDir(self.__cachePath)
    mpu.setOptions(optionsD={"resultPath": self.__cachePath, "cD": cD, "useCache": useCache})
    mpu.set(workerObj=self, workerMethod="fetchDataWorker")
    ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=1, chunkSize=chunkSize)
    logger.info("Run ended with status %r success count %d failures %r", ok, len(resultList[0]), len(failList))
    for rTup in resultList[0]:
        rcD[rTup[0]] = rTup[1]
    # ---
    ok = self.storeResultIndex(rcD)
    logger.info("Final match result (w/sdf and metadata) (%d/%d)", len(rcD), len(cD))
    return ok
def __descriptorQueryMulti(self, queryPairList, queryType, matchOpts, numProc=2, chunkSize=50):
    """Internal method to invoke descriptor query in multiprocess mode

    Args:
        queryPairList (list): [(identifier, descriptor), ...]
        queryType (str): SMILES|InChI
        matchOpts (str): match criteria (e.g., graph-strict)
        numProc (int, optional): number of multiprocess cores. Defaults to 2.
        chunkSize (int, optional): multiprocess batch size. Defaults to 50.

    Returns:
        (list): [BatchResults(), ...]
    """
    logger.info("Input %r query length %d using %s numProc %d", queryType, len(queryPairList), matchOpts, numProc)
    rWorker = BatchChemSearchWorker(self.__ccsw)
    mpu = MultiProcUtil(verbose=True)
    optD = {"matchOpts": matchOpts, "queryType": queryType}
    mpu.setOptions(optD)
    mpu.set(workerObj=rWorker, workerMethod="searchDescriptorList")
    ok, failList, resultList, _ = mpu.runMulti(dataList=queryPairList, numProc=numProc, numResults=1, chunkSize=chunkSize)
    if failList:
        logger.debug("Search completed with failures (%d)", len(failList))
    logger.info("Multi-proc status %r failures %r result length %r", ok, len(failList), len(resultList[0]))
    #
    rL = []
    for tup in resultList[0]:
        for mr in tup[3]:
            rL.append(BatchResults(queryId=tup[0], query=tup[1], queryType=tup[2], matchOpts=matchOpts, ccId=mr.ccId, fpScore=mr.fpScore))
    #
    logger.info("Multi-proc status %r failures %r expanded result length %r", ok, len(failList), len(rL))
    #
    return rL
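# ---
# For orientation, a self-contained sketch of the flattening step above. Each
# entry of resultList[0] is consumed as a 4-tuple (queryId, query, queryType,
# matchResultList), and each match result exposes .ccId and .fpScore. The
# namedtuple stand-ins below are hypothetical; the real BatchResults and match
# result types are defined elsewhere.
from collections import namedtuple

BatchResultsT = namedtuple("BatchResultsT", "queryId query queryType matchOpts ccId fpScore")
MatchEntryT = namedtuple("MatchEntryT", "ccId fpScore")

resultTupL = [("q1", "c1ccccc1O", "SMILES", [MatchEntryT("PHN", 0.92), MatchEntryT("BNZ", 0.88)])]
flatL = [
    BatchResultsT(qid, q, qt, "graph-strict", mr.ccId, mr.fpScore)
    for (qid, q, qt, mrL) in resultTupL
    for mr in mrL
]
assert flatL[0].ccId == "PHN" and len(flatL) == 2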
def runSearch(self, molFilePathList, resultPath, searchType="similarity", numProc=4, chunkSize=10):
    """Run CCDC search in multiprocess mode."""
    logger.info("Starting with molfile path list length %d", len(molFilePathList))
    try:
        pU = CcdcSearchWorker(verbose=self.__verbose)
        mpu = MultiProcUtil(verbose=True)
        # mpu.setWorkingDir(resultPath)
        mpu.setOptions(optionsD={"resultPath": resultPath, "searchType": searchType})
        #
        mpu.set(workerObj=pU, workerMethod="search")
        ok, failList, resultList, _ = mpu.runMulti(dataList=molFilePathList, numProc=numProc, numResults=1, chunkSize=chunkSize)
        logger.info("Run ended with status %r success count %d failures %r", ok, len(resultList[0]), len(failList))
    except Exception as e:
        logger.exception("Failing with %s", str(e))
def load(self, idList, exportPath=None, numProc=1, chunkSize=5):
    """Fetch and load reference data for the input list of PubChem compound codes.

    Args:
        idList (list): PubChem ID codes
        exportPath (str, optional): store raw fetched data in this path. Defaults to None.
        numProc (int, optional): number of processors to use. Defaults to 1.
        chunkSize (int, optional): chunk size between data store updates. Defaults to 5.

    Returns:
        (bool, list): status flag, list of failed identifiers
    """
    logger.info("Length of starting list is %d", len(idList))
    optD = {"chunkSize": chunkSize, "exportPath": exportPath}
    rWorker = PubChemDataUpdateWorker(self.__cfgOb)
    if numProc > 1:
        mpu = MultiProcUtil(verbose=True)
        mpu.setOptions(optD)
        mpu.set(workerObj=rWorker, workerMethod="updateList")
        ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=2, chunkSize=chunkSize)
        logger.info("Multi-proc %r failures %r result lengths %r %r", ok, len(failList), len(resultList[0]), len(resultList[1]))
    else:
        successList, _, _, _ = rWorker.updateList(idList, "SingleProc", optD, None)
        failList = list(set(idList) - set(successList))
        ok = len(failList) == 0
        logger.info("Single-proc status %r failures %r", ok, len(failList))
    #
    return ok, failList
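# ---
# Hypothetical usage of load() above; "ldr" stands for an already-configured
# instance of this loader class, and the compound identifiers are illustrative.
ok, failList = ldr.load(["2244", "702", "962"], exportPath="/tmp/pubchem-raw", numProc=4, chunkSize=5)
# ok is True only when every identifier was fetched and stored; failList holds the rest.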
def __getEntryPathList(self, topRepoPath, numProc=8):
    """Get the path list for structure entries in the input repository"""
    ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
    logger.debug("Starting at %s", ts)
    startTime = time.time()
    pathList = []
    try:
        dataList = []
        anL = "abcdefghijklmnopqrstuvwxyz0123456789"
        for a1 in anL:
            for a2 in anL:
                hc = a1 + a2
                dataList.append(hc)
                hc = a2 + a1
                dataList.append(hc)
        dataList = list(set(dataList))
        #
        optD = {}
        optD["topRepoPath"] = topRepoPath
        mpu = MultiProcUtil(verbose=self.__verbose)
        mpu.setOptions(optionsD=optD)
        mpu.set(workerObj=self, workerMethod="_entryPathWorker")
        _, _, retLists, _ = mpu.runMulti(dataList=dataList, numProc=numProc, numResults=1)
        pathList = retLists[0]
        endTime0 = time.time()
        logger.debug("Path list length %d in %.4f seconds", len(pathList), endTime0 - startTime)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return self.__applyFileLimit(pathList)
def __testLoadFilesMulti(self, contentType):
    """Test case - create load w/insert-many all chemical component definition data files - (multiproc test)"""
    numProc = self.__numProc
    chunkSize = self.__chunkSize
    try:
        #
        sd, _, _, _ = self.__schP.getSchemaInfo(contentType)
        if self.__createFlag:
            self.__schemaCreate(schemaDefObj=sd)
        optD = {}
        optD["sd"] = sd
        if contentType == "pdbx":
            optD["skip"] = self.__tableIdSkipD
        else:
            optD["skip"] = {}
        #
        pathList = self.__getPathList(fType=contentType)
        logger.debug("Input path list %r", pathList)
        mpu = MultiProcUtil(verbose=True)
        mpu.setOptions(optionsD=optD)
        mpu.set(workerObj=self, workerMethod="loadInsertMany")
        ok, _, _, _ = mpu.runMulti(dataList=pathList, numProc=numProc, numResults=1, chunkSize=chunkSize)
        self.assertEqual(ok, True)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        self.fail()
def testLogStringStreamMultiProc(self):
    """Test case - context manager - to custom string stream

    Note: this test has intermittent problems when run under coverage.
    """
    try:
        #
        myLen = self.__logRecordMax
        dataList = [i for i in range(1, myLen + 1)]
        logger.debug("dataList %d: %r", len(dataList), dataList)
        #
        slogger = logging.getLogger()
        slogger.propagate = False
        for handler in slogger.handlers:
            slogger.removeHandler(handler)
        #
        stream = StringIO()
        sh = logging.StreamHandler(stream=stream)
        sh.setLevel(logging.DEBUG)
        fmt = logging.Formatter("STRING-%(processName)s: %(message)s")
        sh.setFormatter(fmt)
        slogger.addHandler(sh)
        #
        logger.debug("Starting string stream logging (root)")
        #
        with MultiProcLogging(logger=slogger, fmt=self.__mpFormat, level=logging.DEBUG):
            numProc = 2
            chunkSize = 0
            optD = {}
            mpu = MultiProcUtil(verbose=True)
            mpu.setOptions(optionsD=optD)
            mpu.set(workerObj=self, workerMethod="workerOne")
            ok, failList, _, _ = mpu.runMulti(dataList=dataList, numProc=numProc, numResults=1, chunkSize=chunkSize)
            self.assertEqual(len(failList), 0)
            self.assertTrue(ok)
        #
        sh.flush()
        slogger.removeHandler(sh)
        #
        stream.seek(0)
        logLines = stream.readlines()
        logger.debug(">> dataList %d: %r", len(logLines), logLines)
        # self.assertGreaterEqual(len(logLines), myLen)
        # Temporary tweak
        self.assertGreaterEqual(len(logLines), int(myLen / 2))
        # for line in logLines:
        #     self.assertIn("context logging record", line)
    except Exception as e:
        logger.exception("context logging record %s", str(e))
        self.fail()
def __calculateNeighbors(self, distLimit=5.0, numProc=2, chunkSize=10, updateOnly=False):
    """Calculate non-polymer target interactions for all repository structure files.

    Args:
        distLimit (float, optional): interaction distance limit. Defaults to 5.0.
        numProc (int, optional): number of processes to use. Defaults to 2.
        chunkSize (int, optional): incremental chunk size used to distribute work among processes. Defaults to 10.
        updateOnly (bool, optional): reuse existing cached data and exclude those entries from the calculation. Defaults to False.

    Returns:
        (dict): {entryId: {asymId: [TargetLigandInteraction()], ...}, ...}
    """
    contentType = "pdbx"
    mergeContent = None
    rD = {}
    exD = {}
    #
    # updateOnly - will reuse any existing data loaded when this is instantiated,
    #              otherwise the cache context is cleared before the calculation.
    if updateOnly:
        exD = {k: True for k in self.getEntries()}
        rD = self.__neighborD["entries"] if "entries" in self.__neighborD else {}
    #
    locatorObjList = self.__rpP.getLocatorObjList(contentType=contentType, mergeContentTypes=mergeContent, excludeIds=exD)
    logger.info("Starting with %d numProc %d updateOnly (%r)", len(locatorObjList), numProc, updateOnly)
    #
    rWorker = TargetInteractionWorker(self.__rpP)
    mpu = MultiProcUtil(verbose=True)
    optD = {"distLimit": distLimit}
    mpu.setOptions(optD)
    mpu.set(workerObj=rWorker, workerMethod="build")
    ok, failList, resultList, _ = mpu.runMulti(dataList=locatorObjList, numProc=numProc, numResults=1, chunkSize=chunkSize)
    if failList:
        logger.info("Target interaction build failures (%d): %r", len(failList), failList)
    #
    for (entryId, nD) in resultList[0]:
        rD[entryId] = nD
    #
    logger.info("Completed with multi-proc status %r failures %r total entries with data (%d)", ok, len(failList), len(rD))
    return rD
def __updateReferenceData(self, idList):
    numProc = self.__numProc
    chunkSize = self.__maxChunkSize
    logger.info("Length of starting list is %d", len(idList))
    optD = {"maxChunkSize": chunkSize}
    rWorker = ReferenceUpdateWorker(self.__cfgOb)
    mpu = MultiProcUtil(verbose=True)
    mpu.setOptions(optD)
    mpu.set(workerObj=rWorker, workerMethod="updateList")
    ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=2, chunkSize=chunkSize)
    logger.info("Multi-proc %r failures %r result lengths %r %r", ok, len(failList), len(resultList[0]), len(resultList[1]))
    return ok, failList
def testLogFileHandlerMultiProc(self):
    """Test case - context manager - to string stream and custom file stream"""
    try:
        #
        myLen = self.__logRecordMax
        dataList = [i for i in range(1, myLen + 1)]
        logger.debug("dataList %d: %r", len(dataList), dataList)
        #
        # For multiprocessing start with the root logger ...
        flogger = logging.getLogger()
        for handler in flogger.handlers:
            flogger.removeHandler(handler)
        fh = logging.FileHandler(self.__testLogPath, mode="w", encoding="utf-8")
        fh.setLevel(logging.DEBUG)
        fmt = logging.Formatter("FILE-%(processName)s: %(message)s")
        fh.setFormatter(fmt)
        flogger.addHandler(fh)
        #
        with MultiProcLogging(logger=flogger, fmt=self.__mpFormat, level=logging.DEBUG):
            numProc = 2
            chunkSize = 0
            optD = {}
            mpu = MultiProcUtil(verbose=True)
            mpu.setOptions(optionsD=optD)
            mpu.set(workerObj=self, workerMethod="workerOne")
            ok, failList, _, _ = mpu.runMulti(dataList=dataList, numProc=numProc, numResults=1, chunkSize=chunkSize)
            self.assertEqual(len(failList), 0)
            self.assertTrue(ok)
        #
        fh.close()
        flogger.removeHandler(fh)
        #
        logLines = []
        with open(self.__testLogPath, "r", encoding="utf-8") as ifh:
            for line in ifh:
                logLines.append(line)
        self.assertGreaterEqual(len(logLines), myLen)
        # for line in logLines:
        #     self.assertIn("context logging record", line)
    except Exception as e:
        logger.exception("context logging record %s", str(e))
        self.fail()
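# ---
# Both logging tests above follow the same pattern: install a handler on the
# root logger, then wrap the multiprocess run in MultiProcLogging so records
# emitted in worker processes are routed back to the parent's handlers. A
# condensed sketch of just that pattern; the MultiProcLogging import path is an
# assumption (it is not shown in this section).
import logging

from rcsb.utils.io.LogUtil import MultiProcLogging  # assumed module path
from rcsb.utils.multiproc.MultiProcUtil import MultiProcUtil

rootLogger = logging.getLogger()
fh = logging.FileHandler("mp-run.log", mode="w", encoding="utf-8")
fh.setFormatter(logging.Formatter("FILE-%(processName)s: %(message)s"))
rootLogger.addHandler(fh)

with MultiProcLogging(logger=rootLogger, fmt="%(processName)s: %(message)s", level=logging.DEBUG):
    mpu = MultiProcUtil(verbose=True)
    # ... mpu.setOptions(...)/mpu.set(...)/mpu.runMulti(...) as in the tests above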
def build(self, alignType="relaxed-stereo", numProc=4, chunkSize=10, verbose=False):
    """Run the model build step in the chemical component model workflow.

    Args:
        alignType (str): "relaxed" | "strict" | "relaxed-stereo". Defaults to "relaxed-stereo".
        numProc (int, optional): number of processes to invoke. Defaults to 4.
        chunkSize (int, optional): work chunksize. Defaults to 10.
        verbose (bool, optional): verbose logging. Defaults to False.

    Returns:
        (dict): {searchId: [{"targetId": , "modelId": , "modelPath": , "matchId": , "parentId": , "rFactor": , }, ...], ...}
    """
    retD = {}
    try:
        ccms = ChemCompModelSearch(self.__cachePath, None, None, prefix=self.__prefix)
        modelDirPath = self.getModelDirFilePath()
        imageDirPath = self.getModelImageDirFilePath()
        #
        idxPathD = ccms.getResultIndex()
        idxPathL = list(idxPathD.values())
        pD = {}
        for sId in idxPathD:
            parentId = sId.split("|")[0]
            pD.setdefault(parentId, []).append(sId)
        logger.info("Using search result index length ridxD (%d) parent coverage (%d)", len(idxPathD), len(pD))
        #
        pU = ChemCompModelBuildWorker(self.__cachePath, verbose=verbose)
        mpu = MultiProcUtil(verbose=True)
        mpu.setWorkingDir(modelDirPath)
        mpu.setOptions(optionsD={"modelDirPath": modelDirPath, "imageDirPath": imageDirPath, "alignType": alignType, "ccSIdxP": self.__ccSIdxP})
        #
        mpu.set(workerObj=pU, workerMethod="build")
        ok, failList, resultList, _ = mpu.runMulti(dataList=idxPathL, numProc=numProc, numResults=1, chunkSize=chunkSize)
        logger.info("Run ended with status %r success count %d failures %r", ok, len(resultList[0]), len(failList))
        successList = copy.copy(resultList[0])
        for tD in successList:
            retD.setdefault(tD["parentId"], []).append(tD)
        #
        if retD:
            logger.info("Completed build with models for %d parent chemical definitions", len(retD))
        else:
            logger.info("No models built")
        ok = self.storeModelIndex(retD)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return retD
def runSearch(self, molFilePathList, resultPath, searchType="similarity", numProc=4, chunkSize=10, timeOut=120):
    """Run CCDC search in multiprocess mode.

    Args:
        molFilePathList (list): input mol2/sdf path list to search
        resultPath (str): directory path to store results
        searchType (str, optional): search type (substructure|similarity). Defaults to "similarity".
        numProc (int, optional): number of processes to invoke. Defaults to 4.
        chunkSize (int, optional): work chunksize. Defaults to 10.
        timeOut (int, optional): search timeout (seconds). Defaults to 120.
    """
    logger.info("Starting with molfile path list length %d", len(molFilePathList))
    successList = []
    try:
        pU = CcdcSearchExecWorker(verbose=self.__verbose)
        mpu = MultiProcUtil(verbose=True)
        mpu.setWorkingDir(resultPath)
        mpu.setOptions(
            optionsD={
                "resultPath": resultPath,
                "searchType": searchType,
                "pythonRootPath": self.__pythonRootPath,
                "csdHome": self.__csdHome,
                "timeOut": timeOut,
            }
        )
        #
        mpu.set(workerObj=pU, workerMethod="search")
        ok, failList, resultList, _ = mpu.runMulti(dataList=molFilePathList, numProc=numProc, numResults=1, chunkSize=chunkSize)
        logger.info("Run ended with status %r success count %d failures %r", ok, len(resultList[0]), len(failList))
        successList = copy.copy(resultList[0])
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return successList
def __getActivityMulti(self, idList, atL, maxActivity=None, numProc=2, chunkSize=5):
    """Fetch ChEMBL activity data for the input target identifier list in multiprocess mode."""
    rD = {}
    ctaW = ChEMBLTargetActivityWorker()
    mpu = MultiProcUtil(verbose=True)
    optD = {"attributeList": atL, "chunkSize": chunkSize, "maxActivity": maxActivity}
    mpu.setOptions(optD)
    mpu.set(workerObj=ctaW, workerMethod="fetchActivity")
    ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=1, chunkSize=chunkSize)
    if failList:
        logger.info("Target Id activity failures (%d): %r", len(failList), failList)
    #
    for (targetId, actD) in resultList[0]:
        rD.setdefault(targetId, []).append(actD)
    #
    logger.info("Completed with multi-proc status %r failures %r total targets with data (%d)", ok, len(failList), len(rD))
    return rD
def __buildChemCompSearchIndexMulti(self, ccObjD, descrD, limitPerceptions=False, molLimit=None, numProc=2, maxChunkSize=20, quietFlag=False):
    #
    ccIdList = sorted(ccObjD.keys())[:molLimit] if molLimit else sorted(ccObjD.keys())
    logger.info("Input definition length %d numProc %d limitPerceptions %r", len(ccIdList), numProc, limitPerceptions)
    #
    rWorker = ChemCompSearchIndexWorker(ccObjD)
    # mpu = MultiProcPoolUtil(verbose=True)
    mpu = MultiProcUtil(verbose=True)
    optD = {"maxChunkSize": maxChunkSize, "limitPerceptions": limitPerceptions, "quietFlag": quietFlag, "descrD": descrD}
    mpu.setOptions(optD)
    mpu.set(workerObj=rWorker, workerMethod="buildRelatedList")
    ok, failList, resultList, _ = mpu.runMulti(dataList=ccIdList, numProc=numProc, numResults=1, chunkSize=maxChunkSize)
    if failList:
        logger.info("Index definitions with failures (%d): %r", len(failList), failList)
    logger.info("Multi-proc status %r failures %r result length %r", ok, len(failList), len(resultList[0]))
    # JDW
    rD = {vD["name"]: vD for vD in resultList[0]}
    return rD
def __searchSubStructureMulti(self, oeQueryMol, idxList, matchOpts="graph-relaxed", numProc=2, maxChunkSize=10):
    #
    hL = []
    startTime = time.time()
    try:
        searchType = "exhaustive-substructure"
        if idxList:
            searchType = "prefiltered-substructure"
        idxList = idxList if idxList else list(range(self.__oeMolDb.GetMaxMolIdx()))
        #
        rWorker = OeSubStructSearchWorker(oeQueryMol, self.__oeMolDb, matchOpts=matchOpts)
        mpu = MultiProcUtil(verbose=True)
        optD = {"maxChunkSize": maxChunkSize}
        mpu.setOptions(optD)
        mpu.set(workerObj=rWorker, workerMethod="subStructureSearch")
        _, _, resultList, _ = mpu.runMulti(dataList=idxList, numProc=numProc, numResults=2, chunkSize=maxChunkSize)
        logger.debug("Multi-proc result length %d/%d", len(resultList[0]), len(resultList[1]))
        for idx, score in resultList[1]:
            ccId = self.__oeMolDb.GetTitle(idx)
            hL.append(MatchResults(ccId=ccId, searchType=searchType, matchOpts=matchOpts, fpScore=score))
        retStatus = True
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        retStatus = False
    logger.info("Substructure search returns %d (%.4f seconds)", len(hL), time.time() - startTime)
    return retStatus, hL
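# ---
# runMulti() is invoked with numResults=2 above, so each worker must return two
# result lists; judging from the consuming loop, the second carries
# (molIndex, score) pairs. A toy worker honoring that shape, with an
# illustrative "match" predicate standing in for a real substructure test:
class ToySubStructWorker(object):
    def subStructureSearch(self, dataList, procName, optionsD, workingDir):
        _ = procName, optionsD, workingDir
        hitIdxL = [idx for idx in dataList if idx % 2 == 0]  # stand-in match test
        scoredL = [(idx, 1.0) for idx in hitIdxL]  # (index, score) pairs
        # (successList, resultList0, resultList1, diagList) for numResults=2
        return dataList, hitIdxL, scoredL, []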
def __updateReferenceData(self, idList, searchIdxD, **kwargs):
    """Launch worker methods to update chemical reference data correspondences.

    Args:
        idList (list): list of local chemical identifiers (ChemIdentifier())

    Returns:
        (bool, list): status flag, list of unmatched identifiers
    """
    numProc = 1
    chunkSize = 50
    exportPath = kwargs.get("exportPath", None)
    logger.info("Length of starting list is %d", len(idList))
    optD = {"chunkSize": chunkSize, "exportPath": exportPath, "matchIdOnly": True}
    rWorker = PubChemUpdateWorker(self.__cfgOb, searchIdxD)
    if numProc > 1:
        mpu = MultiProcUtil(verbose=True)
        mpu.setOptions(optD)
        mpu.set(workerObj=rWorker, workerMethod="updateList")
        ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=2, chunkSize=chunkSize)
        logger.info("Multi-proc %r failures %r result lengths %r %r", ok, len(failList), len(resultList[0]), len(resultList[1]))
    else:
        successList, _, _, _ = rWorker.updateList(idList, "SingleProc", optD, self.__dirPath)
        failList = list(set(idList) - set(successList))
        ok = len(failList) == 0
        logger.info("Single-proc status %r failures %r", ok, len(failList))
    #
    return ok, failList
def scanContentType(self, contentType, mergeContentTypes=None, scanType="full", inputPathList=None, scanDataFilePath=None, failedFilePath=None, saveInputFileListPath=None):
    """Driver method for repository scan operation

    Args:
        contentType (str): one of 'bird', 'bird_family', 'bird_chem_comp', 'chem_comp', 'pdbx'
        scanType (str, optional): 'full' [or 'incr' to be supported]
        inputPathList (list, optional): list of input file paths to scan
        scanDataFilePath (str, optional): file path for serialized scan data (Pickle format)
        failedFilePath (str, optional): file path for list of files that fail scanning operation
        saveInputFileListPath (str, optional): path to store the file path list that is scanned

    Returns:
        bool: True for success or False otherwise
    """
    try:
        startTime = self.__begin(message="scanning operation")
        #
        locatorObjList = self.__rpP.getLocatorObjList(contentType=contentType, inputPathList=inputPathList, mergeContentTypes=mergeContentTypes)
        #
        if saveInputFileListPath:
            self.__mU.doExport(saveInputFileListPath, self.__rpP.getLocatorPaths(locatorObjList), fmt="list")
            logger.debug("Saving %d paths in %s", len(locatorObjList), saveInputFileListPath)
        #
        optD = {}
        optD["contentType"] = contentType
        optD["logSize"] = True
        optD["scanType"] = scanType
        # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
        #
        numProc = self.__numProc
        chunkSize = self.__chunkSize if locatorObjList and self.__chunkSize < len(locatorObjList) else 0
        #
        # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
        numPaths = len(locatorObjList)
        logger.debug("Processing %d total paths", numPaths)
        numProc = min(numProc, numPaths)
        maxStepLength = self.__maxStepLength
        if numPaths > maxStepLength:
            numLists = int(numPaths / maxStepLength)
            subLists = [locatorObjList[i::numLists] for i in range(numLists)]
        else:
            subLists = [locatorObjList]
        #
        if subLists:
            logger.debug("Starting with numProc %d outer subtask count %d subtask length ~ %d", numProc, len(subLists), len(subLists[0]))
        #
        numResults = 1
        failList = []
        retLists = [[] for ii in range(numResults)]
        diagList = []
        for ii, subList in enumerate(subLists):
            logger.debug("Running outer subtask %d of %d length %d", ii + 1, len(subLists), len(subList))
            #
            mpu = MultiProcUtil(verbose=True)
            mpu.setOptions(optionsD=optD)
            mpu.set(workerObj=self, workerMethod="scanWorker")
            ok, failListT, retListsT, diagListT = mpu.runMulti(dataList=subList, numProc=numProc, numResults=numResults, chunkSize=chunkSize)
            failList.extend(failListT)
            # retLists is a list of lists - accumulate each result list across subtasks
            for jj in range(numResults):
                retLists[jj].extend(retListsT[jj])
            diagList.extend(diagListT)
        logger.debug("Scan failed path list %r", failList)
        logger.debug("Scan path list success length %d load list failed length %d", len(locatorObjList), len(failList))
        logger.debug("Returned metadata length %r", len(retLists[0]))
        #
        if failedFilePath and failList:
            wOk = self.__mU.doExport(failedFilePath, self.__rpP.getLocatorPaths(failList), fmt="list")
            logger.debug("Writing scan failure path list to %s status %r", failedFilePath, wOk)
        #
        if scanType == "incr":
            scanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle", default=None)
            logger.debug("Imported scan data with keys %r", list(scanDataD.keys()))
        else:
            scanDataD = {}
        #
        if scanDataFilePath and retLists[0]:
            for ssTup in retLists[0]:
                cId = ssTup.containerId
                if scanType == "full" and cId in scanDataD:
                    logger.error("Duplicate container id %s in %r and %r", cId, ssTup.fromPath, scanDataD[cId].fromPath)
                #
                scanDataD[cId] = ssTup
            ok = self.__mU.doExport(scanDataFilePath, scanDataD, fmt="pickle")
            tscanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
            ok = tscanDataD == scanDataD
        self.__end(startTime, "scanning operation with status " + str(ok))
        #
        return ok
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return False
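# ---
# The outer partitioning in scanContentType() caps each multiprocess run at
# maxStepLength items by striding the input list. A small worked example of the
# round-robin split used there:
locatorObjList = list(range(10))
numLists = 3
subLists = [locatorObjList[i::numLists] for i in range(numLists)]
# subLists == [[0, 3, 6, 9], [1, 4, 7], [2, 5, 8]]; sizes differ by at most one
assert sum(len(s) for s in subLists) == len(locatorObjList)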
def load(self, databaseName, collectionName, loadType="full", documentList=None, indexAttributeList=None, keyNames=None, schemaLevel="full", addValues=None):
    """Driver method for loading MongoDb content -

    loadType: "full" (drop and recreate the collection) or "append" (create only if missing)
    """
    try:
        startTime = self.__begin(message="loading operation")
        #
        optionsD = {}
        optionsD["collectionName"] = collectionName
        optionsD["databaseName"] = databaseName
        optionsD["readBackCheck"] = self.__readBackCheck
        optionsD["loadType"] = loadType
        optionsD["keyNames"] = keyNames
        # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
        #
        docList = documentList[: self.__documentLimit] if self.__documentLimit else documentList
        logger.debug("Full document list length %d limit %r", len(documentList), self.__documentLimit)
        numProc = self.__numProc
        chunkSize = self.__chunkSize if docList and self.__chunkSize < len(docList) else 0
        #
        if addValues:
            try:
                for doc in docList:
                    for k, v in addValues.items():
                        doc[k] = v
            except Exception as e:
                logger.error("Add values %r fails with %s", addValues, str(e))
        #
        indAtList = indexAttributeList if indexAttributeList else []
        bsonSchema = None
        if schemaLevel and schemaLevel in ["min", "full"]:
            bsonSchema = self.__schP.getJsonSchema(databaseName, collectionName, encodingType="BSON", level=schemaLevel)
            logger.debug("Using schema validation for %r %r %r", databaseName, collectionName, schemaLevel)

        if loadType == "full":
            self.__removeCollection(databaseName, collectionName)
            ok = self.__createCollection(databaseName, collectionName, indAtList, bsonSchema=bsonSchema)
            logger.info("Collection %s create status %r", collectionName, ok)
        elif loadType == "append":
            # create only if object does not exist -
            ok = self.__createCollection(databaseName, collectionName, indexAttributeNames=indAtList, checkExists=True, bsonSchema=bsonSchema)
            logger.debug("Collection %s create status %r", collectionName, ok)
        # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
        numDocs = len(docList)
        logger.debug("Processing %d total documents", numDocs)
        numProc = min(numProc, numDocs)
        maxStepLength = self.__maxStepLength
        if numDocs > maxStepLength:
            numLists = int(numDocs / maxStepLength)
            subLists = [docList[i::numLists] for i in range(numLists)]
        else:
            subLists = [docList]
        #
        if subLists:
            logger.debug("Starting with numProc %d outer subtask count %d subtask length ~ %d", numProc, len(subLists), len(subLists[0]))
        #
        failList = []
        for ii, subList in enumerate(subLists):
            logger.debug("Running outer subtask %d of %d length %d", ii + 1, len(subLists), len(subList))
            #
            mpu = MultiProcUtil(verbose=True)
            mpu.setOptions(optionsD=optionsD)
            mpu.set(workerObj=self, workerMethod="loadWorker")
            ok, failListT, _, _ = mpu.runMulti(dataList=subList, numProc=numProc, numResults=1, chunkSize=chunkSize)
            failList.extend(failListT)
        logger.debug("Completed load with failing document list %r", failList)
        logger.debug("Document list length %d failed load list length %d", len(docList), len(failList))
        #
        self.__end(startTime, "loading operation with status " + str(ok))
        #
        return ok
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return False
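# ---
# Hypothetical usage of the MongoDb load() driver above; the loader instance,
# database/collection names, and documents are illustrative.
docs = [{"_id": "1ABC", "entryId": "1ABC"}, {"_id": "2XYZ", "entryId": "2XYZ"}]
ok = loader.load(
    "exdb",  # databaseName (illustrative)
    "entries",  # collectionName (illustrative)
    loadType="full",  # drop and recreate the collection before inserting
    documentList=docs,
    indexAttributeList=["entryId"],
    keyNames=None,
    schemaLevel="full",  # request BSON schema validation
    addValues={"schemaVersion": "1.0"},  # merged into every document before load
)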
def build(self, alignType="relaxed-stereo", numProc=4, chunkSize=10, verbose=False, doFigures=True):
    """Run the model build step in the chemical component model workflow.

    Args:
        alignType (str): "relaxed" | "strict" | "relaxed-stereo". Defaults to "relaxed-stereo".
        numProc (int, optional): number of processes to invoke. Defaults to 4.
        chunkSize (int, optional): work chunksize. Defaults to 10.
        verbose (bool, optional): verbose logging. Defaults to False.

    Returns:
        (dict): {searchId: [{"targetId": , "modelId": , "modelPath": , "matchId": , "parentId": , "rFactor": , }, ...], ...}
    """
    retD = {}
    try:
        mU = MarshalUtil(workPath=self.__cachePath)
        ccms = CODModelSearch(self.__cachePath, prefix=self.__prefix)
        modelDirPath = self.getModelDirFilePath()
        imageDirPath = self.getModelImageDirFilePath()
        #
        tD = ccms.getResultIndex()
        # Make parent index ---
        idxIdD = {}
        for idxId, iDL in tD.items():
            pId = idxId.split("|")[0]
            idxIdD.setdefault(pId, []).extend(iDL)
        #
        idxIdL = list(idxIdD.keys())
        midxIdL = []
        for pId in idxIdL:
            fp = os.path.join(modelDirPath, pId, "model-index.json")
            if mU.exists(fp):
                # Skip parent ids that already have a non-empty model index
                fst = os.stat(fp)
                if fst.st_size > 10:
                    continue
            midxIdL.append(pId)
        #
        logger.info("Starting COD model build using (%d) from a total of results length (%d)", len(midxIdL), len(idxIdD))
        #
        cmbw = CODModelBuildWorker(self.__cachePath, verbose=verbose, timeOut=self.__timeOut)
        mpu = MultiProcUtil(verbose=True)
        mpu.setWorkingDir(modelDirPath)
        mpu.setOptions(
            optionsD={
                "modelDirPath": modelDirPath,
                "imageDirPath": imageDirPath,
                "alignType": alignType,
                "ccSIdxP": self.__ccSIdxP,
                "idxIdD": idxIdD,
                "oesmP": self.__oesmP,
                "ccmP": self.__ccmP,
                "doFigures": doFigures,
            }
        )
        #
        mpu.set(workerObj=cmbw, workerMethod="build")
        ok, failList, resultList, _ = mpu.runMulti(dataList=midxIdL, numProc=numProc, numResults=1, chunkSize=chunkSize)
        logger.info("Run ended with status %r success count %d failures %r", ok, len(resultList[0]), len(failList))
        successList = copy.copy(resultList[0])
        #
        if successList:
            logger.info("Completed build with %d models", len(successList))
        else:
            logger.info("No models built")
        #
        # Build full index -
        #
        logger.info("Building full model index")
        for pId in idxIdL:
            fp = os.path.join(modelDirPath, pId, "model-index.json")
            if mU.exists(fp):
                tDL = mU.doImport(fp, fmt="json")
                for tD in tDL:
                    retD.setdefault(tD["parentId"], []).append(tD)
        #
        retD = dict(sorted(retD.items()))
        logger.info("Storing models for %d parent components", len(retD))
        ok = self.storeModelIndex(retD)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return retD