def testExtractEntriesBefore(self):
    """Test case - extract entries subject to date restriction"""
    try:
        timeU = TimeUtil()
        # Cutoff timestamp five years in the past (UTC)
        cutoffTs = timeU.getTimestamp(useUtc=True, before={"days": 365 * 5})
        cutoffDt = timeU.getDateTimeObj(cutoffTs)
        extractor = ObjectExtractor(
            self.__cfgOb,
            databaseName="pdbx_core",
            collectionName="pdbx_core_entry",
            useCache=False,
            keyAttribute="entry",
            uniqueAttributes=["rcsb_id"],
            selectionQuery={"rcsb_accession_info.initial_release_date": {"$gt": cutoffDt}},
            selectionList=["rcsb_id", "rcsb_accession_info"],
        )
        entryD = extractor.getObjects()
        entryCount = extractor.getCount()
        logger.info("Entry count is %d", entryCount)
        logger.info("Entries are %r", list(entryD.keys()))
        self.assertGreaterEqual(entryCount, 6)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        self.fail()
def __getMatchIndexIds(self, searchIdxD, expireDays=0, updateUnmatched=True):
    """Get CCD/BIRD reference data identifiers in the current match index subject to an expiration
    interval (i.e. not matched or older than deltaDays).

    Args:
        searchIdxD (dict): CCD/BIRD search index dictionary (keyed by source index name; each
            entry is expected to carry an "inchi-key" value)
        expireDays (int, optional): expiration interval in days. Defaults to 0 (no expiration).
        updateUnmatched (bool, optional): when True, identifiers previously tried but unmatched
            (documents lacking "matched_ids") are also returned so they can be retried.

    Returns:
        (list): sorted chemical component/BIRD reference identifier list
    """
    selectD = {}
    if expireDays > 0:
        # Restrict to documents last updated before the expiration cutoff
        tU = TimeUtil()
        tS = tU.getTimestamp(useUtc=True, before={"days": expireDays})
        selectD.update({"rcsb_latest_update": {"$lt": tU.getDateTimeObj(tS)}})
    #
    # if updateUnmatched:
    #     # Return only cases with an existing correspondence
    #     selectD.update({"matched_ids": {"$exists": True}})
    matchD = self.__getReferenceData(self.__databaseName, self.__matchIndexCollectionName, selectD=selectD if selectD else None)
    #
    # For the selected cases in the index-
    retIdList = []
    if searchIdxD:
        # Exclude definitions if source InChIKey in the match index differs with the Key in the current search index.
        for ccId, inD in matchD.items():
            if updateUnmatched and "matched_ids" not in inD:
                # Previously tried but unmatched - include for retry
                retIdList.append(ccId)
                continue
            # NOTE(review): when updateUnmatched is False, a document lacking "matched_ids"
            # would raise KeyError below - presumably callers rely on the (commented-out)
            # "$exists" filter or always pass updateUnmatched=True; confirm.
            hasChanged = False
            for mD in inD["matched_ids"]:
                if mD["source_index_name"] not in searchIdxD:
                    hasChanged = True
                    logger.info("Identifier %s no longer in search index", mD["source_index_name"])
                    break
                if mD["source_inchikey"] != searchIdxD[mD["source_index_name"]]["inchi-key"]:
                    logger.info("Identifier %s InChIKey changed search index", mD["source_index_name"])
                    hasChanged = True
                    break
            # Keep identifiers whose prior matches are still consistent with the search index
            if not hasChanged:
                retIdList.append(ccId)
    #
    return sorted(retIdList)
def getRefData(self, expireDays=0):
    """Return the reference data dictionary, fetching and caching it on first use.

    Args:
        expireDays (int, optional): when > 0, restrict the fetch to documents whose
            rcsb_latest_update is older than this many days. Defaults to 0 (no restriction).

    Returns:
        (dict): reference data dictionary
    """
    if self.__refD:
        return self.__refD
    queryD = {}
    if expireDays > 0:
        timeU = TimeUtil()
        cutoffTs = timeU.getTimestamp(useUtc=True, before={"days": expireDays})
        queryD["rcsb_latest_update"] = {"$lt": timeU.getDateTimeObj(cutoffTs)}
    self.__refD = self.__getReferenceData(self.__databaseName, self.__refDataCollectionName, selectD=queryD)
    #
    return self.__refD
def getRefIdCodes(self, expireDays=0):
    """Return the list of reference identifier codes, optionally restricted to expired entries.

    Args:
        expireDays (int, optional): when > 0, restrict to documents whose rcsb_latest_update
            is older than this many days. Defaults to 0 (no restriction).

    Returns:
        (list): reference identifier codes (empty when nothing is selected)
    """
    queryD = {}
    fieldsL = ["rcsb_id"]
    if expireDays > 0:
        timeU = TimeUtil()
        cutoffTs = timeU.getTimestamp(useUtc=True, before={"days": expireDays})
        queryD["rcsb_latest_update"] = {"$lt": timeU.getDateTimeObj(cutoffTs)}
    refIds = self.__getReferenceData(self.__databaseName, self.__refDataCollectionName, selectD=queryD, selectionList=fieldsL)
    #
    return list(refIds.keys()) if refIds else []
def __getReferenceDataIds(self, expireDays=14):
    """Get reference data identifiers subject to an expiration interval (i.e. not updated
    in/older than deltaDays)

    Args:
        expireDays (int, optional): expiration interval in days. Defaults to 14.

    Returns:
        (list): sorted reference identifier list
    """
    queryD = None
    if expireDays > 0:
        timeU = TimeUtil()
        cutoffTs = timeU.getTimestamp(useUtc=True, before={"days": expireDays})
        queryD = {"rcsb_latest_update": {"$lt": timeU.getDateTimeObj(cutoffTs)}}
    matchD = self.__getReferenceData(self.__refDatabaseName, self.__refMatchDataCollectionName, selectD=queryD)
    return sorted(matchD.keys())
def testTimeStamps(self):
    """Verify time stamp operations."""
    try:
        timeU = TimeUtil()
        # Table-driven checks: (getTimestamp kwargs, log format)
        stampChecks = [
            ({"useUtc": True}, "TS (UTC) = %s(%d)"),
            ({"useUtc": True, "before": {"days": 1}}, "TS yesterday (UTC) = %s(%d)"),
            ({"useUtc": True, "after": {"days": 1}}, "TS tomorrow (UTC) = %s(%d)"),
            ({"useUtc": False}, "TS = %s(%d)"),
        ]
        for kwD, fmt in stampChecks:
            ts = timeU.getTimestamp(**kwD)
            logger.debug(fmt, ts, len(ts))
            self.assertTrue(len(ts) >= 32)
        #
        weekSigCurrent = timeU.getCurrentWeekSignature()
        logger.debug("Current week signature %s", weekSigCurrent)
        today = datetime.date.today()
        weekSigComputed = timeU.getWeekSignature(today.year, today.month, today.day)
        logger.debug("Computed week signature %s", weekSigComputed)
        self.assertEqual(weekSigCurrent, weekSigComputed)
        #
        # Round-trip a UTC timestamp through the datetime parser
        ts = timeU.getTimestamp(useUtc=True)
        logger.debug("TS (UTC) = %s(%d)", ts, len(ts))
        self.assertTrue(len(ts) >= 32)
        dtObj = timeU.getDateTimeObj(ts)
        logger.debug("Recycled DT (UTC) %s", dtObj.isoformat(" "))
        #
        # Round-trip a local timestamp through the datetime parser
        ts = timeU.getTimestamp(useUtc=False)
        logger.debug("TS (local) = %s(%d)", ts, len(ts))
        self.assertTrue(len(ts) >= 32)
        dtObj = timeU.getDateTimeObj(ts)
        logger.debug("Recycled DT (local) %s", dtObj.isoformat(" "))
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        self.fail()
def updateList(self, dataList, procName, optionsD, workingDir):
    """Update the input list of reference sequence identifiers and return matching diagnostics
    and reference feature data.

    Args:
        dataList (list): reference sequence identifiers to fetch
        procName (str): process name used in log messages
        optionsD (dict): options (saveText, fetchLimit, refDbName, maxChunkSize)
        workingDir (str): unused

    Returns:
        (tuple): successList, emptyList, emptyList, diagList
    """
    _ = workingDir
    saveText = optionsD.get("saveText", False)
    fetchLimit = optionsD.get("fetchLimit", None)
    refDbName = optionsD.get("refDbName", "UniProt")
    maxChunkSize = optionsD.get("maxChunkSize", 50)
    successList = []
    retList1 = []
    retList2 = []
    diagList = []
    emptyList = []
    #
    try:
        tU = TimeUtil()
        idList = dataList[:fetchLimit] if fetchLimit else dataList
        logger.info("%s starting fetch for %d %s entries", procName, len(idList), refDbName)
        if refDbName == "UniProt":
            fobj = UniProtUtils(saveText=saveText)
            logger.debug("Maximum reference chunk size %d", maxChunkSize)
            refD, matchD = fobj.fetchList(idList, maxChunkSize=maxChunkSize)
            # Only persist when every requested identifier produced a match record
            if len(matchD) == len(idList):
                # Single timestamp for the whole batch (consistent with the other
                # updateList() implementations, which stamp once per chunk)
                timeS = tU.getDateTimeObj(tU.getTimestamp())
                for uId, tD in matchD.items():
                    tD["rcsb_id"] = uId.strip()
                    tD["rcsb_last_update"] = timeS
                    retList1.append(tD)
                for uId, tD in refD.items():
                    tD["rcsb_id"] = uId.strip()
                    tD["rcsb_last_update"] = timeS
                    retList2.append(tD)
                successList.extend(idList)
                self.__updateReferenceData(self.__refDatabaseName, self.__refDataCollectionName, retList2)
                self.__updateReferenceData(self.__refDatabaseName, self.__refMatchDataCollectionName, retList1)
            else:
                logger.info("Failing with fetch for %d entries with matchD %r", len(idList), matchD)
        else:
            logger.error("Unsupported reference database %r", refDbName)
    except Exception as e:
        logger.exception("Failing %s for %d data items %s", procName, len(dataList), str(e))
    logger.info("%s dataList length %d success length %d rst1 %d rst2 %d", procName, len(dataList), len(successList), len(retList1), len(retList2))
    #
    return successList, emptyList, emptyList, diagList
def updateList(self, dataList, procName, optionsD, workingDir):
    """Update the input list of reference data identifiers (ChemicalIdentifier()) and return
    matching diagnostics and reference feature data.

    Match index documents have the form:

        {
            "_id" : ObjectId("5e8dfb49eab967a0483a0472"),
            "rcsb_id" : "local reference ID (ccid|bird)",  # local canonical ID (e.g. ATP, PRD_000100)
            "rcsb_last_update" : ISODate("2020-04-08T16:26:47.993+0000"),
            "matched_ids" : [
                {"matched_id": "<external reference ID code>", "search_id_type" : "oe-smiles",
                 "search_id_source": "model-xyz", "source_index_name": <>,
                 "source_inchikey": <>, "source_smiles": <>},
                {"matched_id": "<external reference ID code>", "search_id_type": ..., "search_id_source": ..., ...}
            ]
        }

    Failed matches are recorded with NO matched_ids:

        {
            "_id" : ObjectId("5e8dfb49eab967a0483a04a3"),
            "rcsb_id" : "local reference ID (ccid|bird)",
            "rcsb_last_update" : ISODate("2020-04-08T16:26:48.025+0000"),
        }
    """
    _ = workingDir
    chunkSize = optionsD.get("chunkSize", 50)
    matchIdOnly = optionsD.get("matchIdOnly", True)
    # Path to store raw request data -
    exportPath = optionsD.get("exportPath", None)
    #
    successList = []
    retList1 = []
    retList2 = []
    diagList = []
    emptyList = []
    #
    try:
        tU = TimeUtil()
        ccIdList = dataList
        numChunks = len(list(self.__chunker(ccIdList, chunkSize)))
        logger.info("%s search starting for %d reference definitions (in chunks of length %d)", procName, len(ccIdList), chunkSize)
        for ii, ccIdChunk in enumerate(self.__chunker(ccIdList, chunkSize), 1):
            logger.info("%s starting chunk for %d of %d", procName, ii, numChunks)
            #
            tDL = []
            tIdxDL = []
            # One timestamp per chunk - every document written for this chunk shares it
            timeS = tU.getDateTimeObj(tU.getTimestamp())
            for ccId in ccIdChunk:
                # Get various forms from the search index -
                chemIdList = self.__genChemIdList(ccId)
                tIdxD = {"rcsb_id": ccId, "rcsb_last_update": timeS}
                #
                mL = []
                for chemId in chemIdList:
                    stA = time.time()
                    ok, refDL = self.__pcU.assemble(chemId, exportPath=exportPath, matchIdOnly=matchIdOnly)
                    #
                    if not ok:
                        etA = time.time()
                        logger.debug("Failing %s search source %s for %s (%.4f secs)", chemId.identifierType, chemId.identifierSource, chemId.idCode, etA - stA)
                    #
                    if ok and refDL:
                        for tD in refDL:
                            pcId = tD["cid"]
                            # Pull provenance (InChIKey/SMILES) from the search index when present
                            inchiKey = (
                                self.__searchIdxD[chemId.indexName]["inchi-key"]
                                if chemId.indexName in self.__searchIdxD and "inchi-key" in self.__searchIdxD[chemId.indexName]
                                else None
                            )
                            smiles = (
                                self.__searchIdxD[chemId.indexName]["smiles"]
                                if chemId.indexName in self.__searchIdxD and "smiles" in self.__searchIdxD[chemId.indexName]
                                else None
                            )
                            mL.append(
                                {
                                    "matched_id": pcId,
                                    "search_id_type": chemId.identifierType,
                                    "search_id_source": chemId.identifierSource,
                                    "source_index_name": chemId.indexName,
                                    "source_smiles": smiles,
                                    "source_inchikey": inchiKey,
                                }
                            )
                            #
                            tD.update({"rcsb_id": pcId, "rcsb_last_update": timeS})
                            #
                            tDL.append(tD)
                #
                if mL:
                    tIdxD["matched_ids"] = mL
                    successList.append(ccId)
                else:
                    logger.info("No match result for any form of %s", ccId)
                #
                # Index document is recorded whether or not a match was found
                tIdxDL.append(tIdxD)
            # --
            startTimeL = time.time()
            logger.info("Saving chunk %d (len=%d)", ii, len(ccIdChunk))
            # NOTE(review): only the match index (tIdxDL) is persisted here; tDL is
            # accumulated but never stored - confirm reference documents are saved elsewhere.
            self.__updateObjectStore(self.__databaseName, self.__matchIndexCollectionName, tIdxDL)
            endTimeL = time.time()
            logger.info("Saved chunk %d (len=%d) in %.3f secs", ii, len(ccIdChunk), endTimeL - startTimeL)
    except Exception as e:
        logger.exception("Failing %s for %d data items %s", procName, len(dataList), str(e))
    logger.info("%s dataList length %d success length %d rst1 %d rst2 %d", procName, len(dataList), len(successList), len(retList1), len(retList2))
    #
    return successList, emptyList, emptyList, diagList
def updateList(self, dataList, procName, optionsD, workingDir):
    """Update the input list of reference data identifiers (ChemicalIdentifier()) and return
    matching diagnostics and reference feature data.
    """
    _ = workingDir
    chunkSize = optionsD.get("chunkSize", 50)
    # Path to store raw request data -
    exportPath = optionsD.get("exportPath", None)
    #
    successList = []
    retList1 = []
    retList2 = []
    diagList = []
    emptyList = []
    # -
    try:
        timeU = TimeUtil()
        pcidList = dataList
        chunkCount = len(list(self.__chunker(pcidList, chunkSize)))
        logger.info("%s search starting for %d reference definitions (in chunks of length %d)", procName, len(pcidList), chunkSize)
        for chunkNum, pcidChunk in enumerate(self.__chunker(pcidList, chunkSize), 1):
            logger.info("%s starting chunk for %d of %d", procName, chunkNum, chunkCount)
            docL = []
            # Single timestamp shared by every document in this chunk
            timeS = timeU.getDateTimeObj(timeU.getTimestamp())
            for pcid in pcidChunk:
                chemId = ChemicalIdentifier(idCode=pcid, identifierType="cid", identifier=pcid, identifierSource="ccd-match")
                tBegin = time.time()
                ok, refDL = self.__pcU.assemble(chemId, exportPath=exportPath)
                if not ok:
                    logger.info("Failing %s search source %s for %s (%.4f secs)", chemId.identifierType, chemId.identifierSource, chemId.idCode, time.time() - tBegin)
                if ok and refDL:
                    successList.append(pcid)
                    for refD in refDL:
                        refD.update({"rcsb_id": refD["cid"], "rcsb_last_update": timeS})
                        docL.append(refD)
                else:
                    logger.info("No match result for any form of %s", pcid)
            # -- persist the assembled chunk
            tSave = time.time()
            logger.info("Saving chunk %d (len=%d)", chunkNum, len(pcidChunk))
            self.__updateObjectStore(self.__databaseName, self.__refDataCollectionName, docL)
            logger.info("Saved chunk %d (len=%d) in %.3f secs", chunkNum, len(pcidChunk), time.time() - tSave)
    except Exception as e:
        logger.exception("Failing %s for %d data items %s", procName, len(dataList), str(e))
    logger.info("%s dataList length %d success length %d rst1 %d rst2 %d", procName, len(dataList), len(successList), len(retList1), len(retList2))
    #
    return successList, emptyList, emptyList, diagList