def testSubStructureSearchFromIndexSelected(self):
    """Substructure search for selected queries (BNZ, ALA) over a prefiltered index.

    For each query identifier the search index is prefiltered with the configured
    match options, the substructure search is run over the surviving candidates,
    and the test asserts that the search status is true and that
    self.__resultContains() accepts the result list for the query identifier.
    """
    searchOpts = self.__myKwargs.get("matchOpts", "sub-struct-graph-relaxed")
    procCount = self.__numProcSearch
    #
    molProvider = OeSearchMoleculeProvider(**self.__myKwargs)
    self.assertTrue(molProvider.testCache())
    searchUtil = OeSubStructSearchUtils(molProvider)
    #
    indexProvider = ChemCompSearchIndexProvider(**self.__myKwargs)
    self.assertTrue(indexProvider.testCache(minCount=self.__minCount))
    # Look up the molecule for the first index key; this fails early when the
    # index is empty. The molecule itself is replaced inside the loop below.
    firstKey = next(iter(indexProvider.getIndex()))
    queryMol = molProvider.getMol(firstKey)
    #
    for queryId in ("BNZ", "ALA"):
        # A single timer spans both the prefilter and the search steps.
        began = time.time()
        queryMol = molProvider.getMol(queryId)
        #
        candidateIds = searchUtil.prefilterIndex(queryMol, indexProvider, matchOpts=searchOpts)
        logger.info("%s search length %d in (%.4f seconds)", queryId, len(candidateIds), time.time() - began)
        #
        status, matches = searchUtil.searchSubStructure(
            queryMol,
            ccIdList=candidateIds,
            matchOpts=searchOpts,
            numProc=procCount,
        )
        logger.info(
            "%s status %r result length %d in (%.4f seconds)",
            queryId,
            status,
            len(matches),
            time.time() - began,
        )
        self.assertTrue(status)
        self.assertTrue(self.__resultContains(queryId, matches))
def updateSearchIndex(self, useCache=False):
    """Rebuild or reload the chemical component search index held by this instance.

    The provider arguments are deep-copied from the "ccsiKwargs" entry of the
    instance configuration so the stored configuration is never modified. When
    that entry is missing or empty nothing is done and False is returned.

    Resource requirements 771 secs 6 proc macbook pro 7GB memory.

    Args:
        useCache (bool): False to rebuild search index and True to reload

    Returns:
        bool: the status reported by the index provider cache test, or False when
              no provider configuration is available or an exception is raised
              before that status is obtained (exceptions are logged, not propagated)
    """
    status = False
    try:
        if "ccsiKwargs" in self.__configD:
            providerArgs = copy.deepcopy(self.__configD["ccsiKwargs"])
        else:
            providerArgs = None
        #
        if providerArgs:
            providerArgs["useCache"] = useCache
            provider = ChemCompSearchIndexProvider(**providerArgs)
            status = provider.testCache()
            # Keep the provider and its index on the instance; the index falls
            # back to an empty dictionary when the cache test fails.
            self.__siIdxP = provider if provider else None
            if provider and status:
                self.__siIdx = provider.getIndex()
            else:
                self.__siIdx = {}
            indexLen = len(self.__siIdx) if self.__siIdx else 0
            logger.info("Search index status %r index len %d", status, indexLen)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    #
    return status
def __reload(self, **kwargs):
    """Build (or reuse) the OE search molecule cache and its derived database files.

    Each artifact is regenerated when useCache is false or when its file does not
    yet exist in the instance directory; otherwise the existing file is kept.

    Args:
        useCache (bool, optional): reuse existing files where present. Defaults to True.
        cachePath (str, optional): cache path handed to the search index provider. Defaults to '.'.
        numProc (int, optional): processor count used for the screened substructure databases. Defaults to 2.
        molLimit (int, optional): limit handed to the search index provider and used as the
                                  minimum count in its cache test. Defaults to None.
        fpTypeList (list, optional): fingerprint types. Defaults to TREE, PATH, MACCS, CIRCULAR, LINGO.
        screenTypeList (list, optional): screen types for the screened substructure databases;
                                  none are built when this is empty or None. Defaults to None.
        limitPerceptions (bool, optional): passed to the OE molecule cache construction. Defaults to False.
        suppressHydrogens (bool, optional): passed to the OE molecule cache construction. Defaults to False.
        quietFlag (bool, optional): passed to the OE I/O utilities. Defaults to True.
        logSizes (bool, optional): passed to the search index provider cache test. Defaults to False.

        Any other keyword arguments are forwarded unchanged to the search index provider.

    Returns:
        (bool): True when every step completes without raising, False otherwise
                (the exception is logged and not propagated)
    """
    try:
        quietFlag = kwargs.get("quietFlag", True)
        logSizes = kwargs.get("logSizes", False)
        useCache = kwargs.get("useCache", True)
        cachePath = kwargs.get("cachePath", ".")
        molLimit = kwargs.get("molLimit", None)
        numProc = kwargs.get("numProc", 2)
        limitPerceptions = kwargs.get("limitPerceptions", False)
        suppressHydrogens = kwargs.get("suppressHydrogens", False)
        fpTypeList = kwargs.get("fpTypeList", ["TREE", "PATH", "MACCS", "CIRCULAR", "LINGO"])
        # screenTypeList = kwargs.get("screenTypeList", ["SMARTS"])
        screenTypeList = kwargs.get("screenTypeList", None)
        fingerprintDbMode = "STANDARD"
        screenedDbEnabled = True
        #
        builtCount, failedCount, failedIds = 0, 0, []
        oeIo = OeIoUtils(quietFlag=quietFlag)

        def needsBuild(filePath):
            # Regenerate when caching is disabled or the target file is missing.
            return not (useCache and self.__mU.exists(filePath))

        # -------- binary OE molecule cache
        molCachePath = os.path.join(self.__dirPath, self.__getOeSearchMolFileName())
        if needsBuild(molCachePath):
            forwarded = dict(kwargs)
            for reservedKey in ("cachePath", "useCache", "molLimit"):
                forwarded.pop(reservedKey, None)
            indexProvider = ChemCompSearchIndexProvider(cachePath=cachePath, useCache=True, molLimit=molLimit, **forwarded)
            if indexProvider.testCache(minCount=molLimit, logSizes=logSizes):
                searchIndex = indexProvider.getIndex()
            else:
                searchIndex = {}
            indexSize = len(searchIndex)
            # ------- OE molecule construction
            began = time.time()
            builtCount, failedCount, failedIds = oeIo.buildOeBinaryMolCacheFromIndex(
                molCachePath,
                searchIndex,
                quietFlag=quietFlag,
                fpTypeList=fpTypeList,
                limitPerceptions=limitPerceptions,
                suppressHydrogens=suppressHydrogens,
            )
            if failedIds:
                logger.info("failures %r", failedIds)
            logger.info(
                "Constructed %d/%d cached oeMols (unconverted %d) (%.4f seconds)",
                builtCount,
                indexSize,
                failedCount,
                time.time() - began,
            )
        # -------- indexed OE molecule database
        molDbPath = os.path.join(self.__dirPath, self.__getOeMolDbFileName())
        if needsBuild(molDbPath):
            began = time.time()
            storedCount = oeIo.createOeBinaryDatabaseAndIndex(molCachePath, molDbPath)
            logger.info(
                "Created and stored %d indexed oeMols in OE database format (%.4f seconds)",
                storedCount,
                time.time() - began,
            )
        # -------- fast fingerprint databases (the mode is fixed to STANDARD above,
        #          so this branch is not taken)
        if fingerprintDbMode == "FAST":
            for fpType in fpTypeList:
                began = time.time()
                fastFpPath = os.path.join(self.__dirPath, self.__getFastFpDbFileName(fpType))
                if needsBuild(fastFpPath):
                    status = oeIo.createOeFingerPrintDatabase(molDbPath, fastFpPath, fpType=fpType)
                    logger.info("Created and stored %s fingerprint database (%.4f seconds)", fpType, time.time() - began)
        # -------- screened substructure search databases
        if screenedDbEnabled and screenTypeList:
            for screenType in screenTypeList:
                began = time.time()
                screenDbPath = os.path.join(self.__dirPath, self.__getSubSearchFileName(screenType))
                if needsBuild(screenDbPath):
                    status = oeIo.createOeSubSearchDatabase(molCachePath, screenDbPath, screenType=screenType, numProc=numProc)
                    logger.info(
                        "Constructed screened substructure database (status %r) with screenType %s (%.4f seconds)",
                        status,
                        screenType,
                        time.time() - began,
                    )
                    # Reload the new database and compare its size with the number of
                    # molecules built above; the outcome does not affect the return value.
                    screenDb = oeIo.loadOeSubSearchDatabase(screenDbPath, screenType=screenType, numProc=numProc)
                    status = screenDb.NumMolecules() == builtCount
        #
        return True
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return False
def buildSearchFiles(self, **kwargs):
    """Build cif, sdf (optional), and mol2 (optional) files for components in the chemical
    component search index. Entries failing the CIF or OE molecule checks are skipped.

    Args:
        ccUrlTarget (str): locator for source chemical component dictionary (default: None)
        birdUrlTarget (str): locator for source BIRD dictionary (default: None)
        limitPerceptions (bool): restrict automatic perceptions in OE molecular build operations (default: False)
        numProc (int): number of processors (default: 2)
        useCache (bool): use existing resource file where possible (default: True)
        molLimit (int): limit the number of ingested chemical components (default: None)
        minCount (int): minimum count used in the chemical component provider cache test (default: 0)
        quietFlag (bool): suppress output in OE library operations (default: True)
        fpTypeList (list): fingerprint types passed to the OE molecule provider (default: [])
        screenTypeList (list): screen types passed to the OE molecule provider (default: [])
        useSdf (bool): write an sdf file for each index entry (default: True)
        useMol2 (bool): write a mol2 file for each index entry (default: False)

    Returns:
        (int): number of search index entries that passed the checks and were processed
    """
    cachePath = self.__cachePath
    ccUrlTarget = kwargs.get("ccUrlTarget", None)
    birdUrlTarget = kwargs.get("birdUrlTarget", None)
    molLimit = kwargs.get("molLimit", None)
    quietFlag = kwargs.get("quietFlag", True)
    fpTypeList = kwargs.get("fpTypeList", [])
    screenTypeList = kwargs.get("screenTypeList", [])
    ccFileNamePrefix = "cc-%s" % self.__prefix if self.__prefix else "cc-full"
    oeFileNamePrefix = "oe-%s" % self.__prefix if self.__prefix else "oe-cc-full"
    numProc = kwargs.get("numProc", 2)
    minCount = kwargs.get("minCount", 0)
    useCache = kwargs.get("useCache", True)
    useSdf = kwargs.get("useSdf", True)
    useMol2 = kwargs.get("useMol2", False)
    limitPerceptions = kwargs.get("limitPerceptions", False)
    logSizes = False
    #
    startTime = time.time()
    ccmP = ChemCompMoleculeProvider(
        cachePath=cachePath,
        useCache=useCache,
        ccFileNamePrefix=ccFileNamePrefix,
        ccUrlTarget=ccUrlTarget,
        birdUrlTarget=birdUrlTarget,
        molLimit=molLimit,
    )
    ok = ccmP.testCache(minCount=minCount, logSizes=logSizes)
    logger.info("Completed chemical component provider load %r (%.4f seconds)", ok, time.time() - startTime)
    #
    startTime = time.time()
    oesmp = OeSearchMoleculeProvider(
        ccUrlTarget=ccUrlTarget,
        birdUrlTarget=birdUrlTarget,
        cachePath=cachePath,
        ccFileNamePrefix=ccFileNamePrefix,
        oeFileNamePrefix=oeFileNamePrefix,
        useCache=useCache,
        quietFlag=quietFlag,
        fpTypeList=fpTypeList,
        screenTypeList=screenTypeList,
        numProc=numProc,
        molLimit=molLimit,
        limitPerceptions=limitPerceptions,
    )
    ok = oesmp.testCache()
    logger.info("Completed OE molecule provider load %r (%.4f seconds)", ok, time.time() - startTime)
    #
    startTime = time.time()
    ccSIdxP = ChemCompSearchIndexProvider(
        cachePath=cachePath,
        useCache=useCache,
        ccFileNamePrefix=ccFileNamePrefix,
        limitPerceptions=limitPerceptions,
        numProc=numProc,
    )
    ok = ccSIdxP.testCache()
    logger.info("Completed chemical component search index load %r (%.4f seconds)", ok, time.time() - startTime)
    #
    ccSIdx = ccSIdxP.getIndex() if ccSIdxP and ok else {}
    logger.info("Search index status %r index length %d", ok, len(ccSIdx))
    #
    # Component identifiers whose standard CIF definition is already in place.
    # Several index entries share one component, so this avoids re-exporting
    # the same CIF file for each of them.
    exportedCcIds = set()
    mU = MarshalUtil()
    oeU = OeIoUtils(dirPath=cachePath)
    numMols = 0
    searchFileDirPath = self.getSearchDirFilePath()
    pathTupList = []
    for sId in ccSIdx:
        # Index keys have the form "<ccId>|<qualifier>" or just "<ccId>".
        ccId = sId.split("|")[0]
        compDirPath = os.path.join(searchFileDirPath, ccId[0], ccId)
        # standard CIF definition
        if ccId not in exportedCcIds:
            cifPath = os.path.join(compDirPath, ccId + ".cif")
            if not (useCache and mU.exists(cifPath)):
                ccMol = ccmP.getMol(ccId)
                if not self.__checkCif(ccMol):
                    continue
                mU.doExport(cifPath, [ccMol], fmt="mmcif")
            exportedCcIds.add(ccId)
        #
        # Sanity checks on the generated OE molecule
        #
        oeMol = oesmp.getMol(sId)
        if not self.__checkOeMol(oeMol):
            continue
        #
        # CIF file for the index entry itself when it differs from the component
        cifPath = os.path.join(compDirPath, sId + ".cif")
        if sId != ccId and not (useCache and mU.exists(cifPath)):
            oeccU = OeChemCompUtils()
            ok = oeccU.addOeMol(sId, oeMol, missingModelXyz=True, writeIdealXyz=False)
            if ok:
                oeccU.write(cifPath)
        #
        if useSdf:
            molFilePath = os.path.join(compDirPath, sId + ".sdf")
            if not (useCache and mU.exists(molFilePath)):
                ok = oeU.write(molFilePath, oeMol, constantMol=False, addSdTags=True)
                if ok:
                    pathTupList.append((sId, molFilePath, "sdf"))
        #
        if useMol2:
            mol2FilePath = os.path.join(compDirPath, sId + ".mol2")
            if not (useCache and mU.exists(mol2FilePath)):
                # Use the status of this write (not a stale status from an
                # earlier step) to decide whether the path is recorded.
                ok = oeU.write(mol2FilePath, oeMol, constantMol=False, addSdTags=True)
                if ok:
                    pathTupList.append((sId, mol2FilePath, "mol2"))
        numMols += 1
    #
    self.__storePathList(pathTupList)
    return numMols