Пример #1
0
 def testExtractEntriesBefore(self):
     """Test case - extract entries subject to a release date restriction.

     Selects objects from pdbx_core/pdbx_core_entry whose
     rcsb_accession_info.initial_release_date is later than the UTC timestamp
     computed for 5 * 365 days before now, and requires that at least 6
     entries are counted.  Any exception raised in the body (assertion
     failures included) is logged and reported as a test failure.
     """
     try:
         timeUtil = TimeUtil()
         cutoffStamp = timeUtil.getTimestamp(useUtc=True, before={"days": 365 * 5})
         cutoffDt = timeUtil.getDateTimeObj(cutoffStamp)
         # Selection query: initial release date strictly greater than the cutoff.
         releaseQuery = {"rcsb_accession_info.initial_release_date": {"$gt": cutoffDt}}
         extractor = ObjectExtractor(
             self.__cfgOb,
             databaseName="pdbx_core",
             collectionName="pdbx_core_entry",
             useCache=False,
             keyAttribute="entry",
             uniqueAttributes=["rcsb_id"],
             selectionQuery=releaseQuery,
             selectionList=["rcsb_id", "rcsb_accession_info"],
         )
         entryD = extractor.getObjects()
         entryCount = extractor.getCount()
         logger.info("Entry count is %d", entryCount)
         logger.info("Entries are %r", list(entryD.keys()))
         self.assertGreaterEqual(entryCount, 6)
     except Exception as err:
         logger.exception("Failing with %s", str(err))
         self.fail()
    def __getMatchIndexIds(self,
                           searchIdxD,
                           expireDays=0,
                           updateUnmatched=True):
        """Get CCD/BIRD reference data identifiers in the current match index whose
           recorded matches are still consistent with the current search index.

        Args:
            searchIdxD (dict): CCD/BIRD search index dictionary keyed by source index name;
                               each entry is read for its "inchi-key" value.
            expireDays (int, optional): when > 0, only index records with an "rcsb_latest_update"
                               value earlier than (now - expireDays, UTC) are selected.
                               Defaults to 0 (no date restriction).
            updateUnmatched (bool, optional): when True, select only records that carry a
                               "matched_ids" field (i.e. exclude tried but unmatched cases).
                               When False, unmatched records are selected too and are returned
                               as-is since they have no source keys to compare.

        Returns:
            (list): sorted chemical component/BIRD reference identifier list.  The list is
                    empty when searchIdxD is empty.
        """
        selectD = {}
        if expireDays > 0:
            tU = TimeUtil()
            tS = tU.getTimestamp(useUtc=True, before={"days": expireDays})
            # NOTE(review): the selection uses "rcsb_latest_update" - confirm this is the
            # field name actually written to the match index documents.
            selectD["rcsb_latest_update"] = {"$lt": tU.getDateTimeObj(tS)}
        #
        if updateUnmatched:
            # Return only cases with an existing correspondence
            selectD["matched_ids"] = {"$exists": True}
        matchD = self.__getReferenceData(self.__databaseName,
                                         self.__matchIndexCollectionName,
                                         selectD=selectD if selectD else None)
        #
        # For the selected cases in the index-
        retIdList = []
        if searchIdxD:
            # Exclude definitions if source InChIKey in the match index differs with the Key in the current search index.
            for ccId, inD in matchD.items():
                if "matched_ids" not in inD:
                    # Unmatched record (only selected when updateUnmatched is False):
                    # there is nothing to compare, so keep the identifier rather than
                    # failing with a KeyError on the missing field.
                    retIdList.append(ccId)
                    continue
                hasChanged = False
                for mD in inD["matched_ids"]:
                    if mD["source_index_name"] not in searchIdxD:
                        hasChanged = True
                        logger.info("Identifier %s no longer in search index",
                                    mD["source_index_name"])
                        break
                    if mD["source_inchikey"] != searchIdxD[
                            mD["source_index_name"]]["inchi-key"]:
                        logger.info(
                            "Identifier %s InChIKey changed search index",
                            mD["source_index_name"])
                        hasChanged = True
                        break
                if not hasChanged:
                    retIdList.append(ccId)
        #
        return sorted(retIdList)
Пример #3
0
 def getRefData(self, expireDays=0):
     """Return the reference data dictionary, fetching it on first use.

     A previously loaded (non-empty) result is returned unchanged and the
     expireDays argument is then ignored.

     Args:
         expireDays (int, optional): when > 0, the fetch is restricted to records whose
                     "rcsb_latest_update" is earlier than (now - expireDays, UTC).
                     Defaults to 0 (no restriction).

     Returns:
         the object returned by the reference data fetch (cached on the instance)
     """
     if self.__refD:
         return self.__refD
     #
     query = {}
     if expireDays > 0:
         timeUtil = TimeUtil()
         cutoffStamp = timeUtil.getTimestamp(useUtc=True, before={"days": expireDays})
         query["rcsb_latest_update"] = {"$lt": timeUtil.getDateTimeObj(cutoffStamp)}
     self.__refD = self.__getReferenceData(
         self.__databaseName, self.__refDataCollectionName, selectD=query
     )
     return self.__refD
Пример #4
0
 def getRefIdCodes(self, expireDays=0):
     """Return the identifier codes (keys) of the stored reference data.

     Args:
         expireDays (int, optional): when > 0, only records whose "rcsb_latest_update"
                     is earlier than (now - expireDays, UTC) are selected.
                     Defaults to 0 (no restriction).

     Returns:
         (list): keys of the fetched reference data, or an empty list when the
                 fetch returns nothing
     """
     query = {}
     if expireDays > 0:
         timeUtil = TimeUtil()
         cutoffStamp = timeUtil.getTimestamp(useUtc=True, before={"days": expireDays})
         query["rcsb_latest_update"] = {"$lt": timeUtil.getDateTimeObj(cutoffStamp)}
     # Only the identifier attribute is requested from the store.
     fetchedD = self.__getReferenceData(
         self.__databaseName,
         self.__refDataCollectionName,
         selectD=query,
         selectionList=["rcsb_id"],
     )
     if not fetchedD:
         return []
     return list(fetchedD.keys())
Пример #5
0
    def __getReferenceDataIds(self, expireDays=14):
        """Get the sorted reference data identifiers in the match data collection.

        Args:
            expireDays (int, optional): when > 0, only records whose "rcsb_latest_update"
                        is earlier than (now - expireDays, UTC) are selected; otherwise no
                        selection is applied. Defaults to 14.

        Returns:
            (list): sorted reference identifier list
        """
        staleFilter = None
        if expireDays > 0:
            timeUtil = TimeUtil()
            cutoffStamp = timeUtil.getTimestamp(useUtc=True, before={"days": expireDays})
            # NOTE(review): confirm "rcsb_latest_update" is the field name written by the updater.
            staleFilter = {"rcsb_latest_update": {"$lt": timeUtil.getDateTimeObj(cutoffStamp)}}
        refD = self.__getReferenceData(
            self.__refDatabaseName,
            self.__refMatchDataCollectionName,
            selectD=staleFilter,
        )
        idList = list(refD.keys())
        idList.sort()
        return idList
Пример #6
0
    def testTimeStamps(self):
        """Verify time stamp operations.

        Checks that generated timestamps (UTC, offset by a day either way, and
        non-UTC) are at least 32 characters long, that the current week signature
        agrees with the one computed for today's date, and that timestamps can be
        converted back to datetime objects.  Any exception is logged and reported
        as a test failure.
        """
        try:
            timeUtil = TimeUtil()
            minLength = 32
            # (log format, getTimestamp keyword arguments) exercised in this order
            stampCases = (
                ("TS (UTC) = %s(%d)", {"useUtc": True}),
                ("TS yesterday (UTC) = %s(%d)", {"useUtc": True, "before": {"days": 1}}),
                ("TS tomorrow (UTC) = %s(%d)", {"useUtc": True, "after": {"days": 1}}),
                ("TS = %s(%d)", {"useUtc": False}),
            )
            for logFormat, stampArgs in stampCases:
                stamp = timeUtil.getTimestamp(**stampArgs)
                logger.debug(logFormat, stamp, len(stamp))
                self.assertTrue(len(stamp) >= minLength)

            # Week signature for "now" must match the one derived from today's date.
            currentSignature = timeUtil.getCurrentWeekSignature()
            logger.debug("Current week signature %s", currentSignature)
            today = datetime.date.today()

            computedSignature = timeUtil.getWeekSignature(today.year, today.month, today.day)
            logger.debug("Computed week signature %s", computedSignature)
            self.assertEqual(currentSignature, computedSignature)

            # Round trip: timestamp string -> datetime object, for UTC and local stamps.
            roundTripCases = (
                ("TS (UTC) = %s(%d)", "Recycled DT (UTC) %s", True),
                ("TS (local) = %s(%d)", "Recycled DT (local) %s", False),
            )
            for stampFormat, dtFormat, utcFlag in roundTripCases:
                stamp = timeUtil.getTimestamp(useUtc=utcFlag)
                logger.debug(stampFormat, stamp, len(stamp))
                self.assertTrue(len(stamp) >= minLength)
                dtObj = timeUtil.getDateTimeObj(stamp)
                logger.debug(dtFormat, dtObj.isoformat(" "))

        except Exception as err:
            logger.exception("Failing with %s", str(err))
            self.fail()
Пример #7
0
 def updateList(self, dataList, procName, optionsD, workingDir):
     """Fetch reference sequence data for the input identifiers and store the results.

     Only the "UniProt" reference database is handled.  When the fetch returns a match
     record for every requested identifier, each match and reference record is stamped
     with "rcsb_id" and "rcsb_last_update" and both sets are written to their
     collections; otherwise nothing is stored.  Exceptions are logged, not raised.

     Args:
         dataList (list): reference sequence identifiers
         procName (str): worker process name (used in log messages)
         optionsD (dict): options - "saveText" (False), "fetchLimit" (None),
                          "refDbName" ("UniProt"), "maxChunkSize" (50)
         workingDir (str): unused

     Returns:
         (tuple): successList, [], [], [] - successList holds the processed identifiers
                  (the two middle items are the same empty list object)
     """
     _ = workingDir
     saveText = optionsD.get("saveText", False)
     fetchLimit = optionsD.get("fetchLimit", None)
     refDbName = optionsD.get("refDbName", "UniProt")
     maxChunkSize = optionsD.get("maxChunkSize", 50)
     #
     successList = []
     matchDocL = []
     refDocL = []
     diagList = []
     noResultL = []
     #
     try:
         tU = TimeUtil()
         idList = dataList[:fetchLimit] if fetchLimit else dataList
         logger.info("%s starting fetch for %d %s entries", procName, len(idList), refDbName)
         if refDbName != "UniProt":
             logger.error("Unsupported reference database %r", refDbName)
         else:
             fobj = UniProtUtils(saveText=saveText)
             logger.debug("Maximum reference chunk size %d", maxChunkSize)
             refD, matchD = fobj.fetchList(idList, maxChunkSize=maxChunkSize)
             if len(matchD) != len(idList):
                 logger.info("Failing with fetch for %d entries with matchD %r", len(idList), matchD)
             else:
                 # Stamp the match records first, then the reference records.
                 # NOTE(review): records are stamped with "rcsb_last_update" - confirm that
                 # expiration queries against these collections use the same field name.
                 for sourceD, targetL in ((matchD, matchDocL), (refD, refDocL)):
                     for uId, docD in sourceD.items():
                         docD["rcsb_id"] = uId.strip()
                         docD["rcsb_last_update"] = tU.getDateTimeObj(tU.getTimestamp())
                         targetL.append(docD)
                 successList.extend(idList)
                 self.__updateReferenceData(self.__refDatabaseName, self.__refDataCollectionName, refDocL)
                 self.__updateReferenceData(self.__refDatabaseName, self.__refMatchDataCollectionName, matchDocL)
     except Exception as err:
         logger.exception("Failing %s for %d data items %s", procName, len(dataList), str(err))
     logger.info(
         "%s dataList length %d success length %d rst1 %d rst2 %d",
         procName,
         len(dataList),
         len(successList),
         len(matchDocL),
         len(refDocL),
     )
     #
     return successList, noResultL, noResultL, diagList
    def updateList(self, dataList, procName, optionsD, workingDir):
        """Search for matches to the input reference identifiers and store one match index
        record per identifier, chunk by chunk.  Exceptions are logged, not raised.

        Stored record with matches:
            {
                "rcsb_id": "<local reference ID (ccid|bird)>",   (e.g. ATP, PRD_000100)
                "rcsb_last_update": <datetime>,
                "matched_ids": [
                    {"matched_id": "<external reference ID code>", "search_id_type": ...,
                     "search_id_source": ..., "source_index_name": ...,
                     "source_smiles": ..., "source_inchikey": ...},
                    ...
                ]
            }

        Failed matches are recorded with NO "matched_ids" field:
            {
                "rcsb_id": "<local reference ID (ccid|bird)>",
                "rcsb_last_update": <datetime>
            }

        Args:
            dataList (list): local reference identifiers to process
            procName (str): worker process name (used in log messages)
            optionsD (dict): options - "chunkSize" (50), "matchIdOnly" (True),
                             "exportPath" (None, path to store raw request data)
            workingDir (str): unused

        Returns:
            (tuple): successList, [], [], [] - successList holds the identifiers with at
                     least one match (the two middle items are the same empty list object)
        """
        _ = workingDir
        chunkSize = optionsD.get("chunkSize", 50)
        matchIdOnly = optionsD.get("matchIdOnly", True)
        # Path to store raw request data -
        exportPath = optionsD.get("exportPath", None)
        #
        successList = []
        # The next two lists are never filled here; only their lengths are logged.
        retList1 = []
        retList2 = []
        diagList = []
        noResultL = []
        #
        try:
            tU = TimeUtil()
            ccIdList = dataList
            numChunks = len(list(self.__chunker(ccIdList, chunkSize)))
            logger.info(
                "%s search starting for %d reference definitions (in chunks of length %d)",
                procName, len(ccIdList), chunkSize)
            for chunkNo, ccIdChunk in enumerate(self.__chunker(ccIdList, chunkSize), 1):
                logger.info("%s starting chunk for %d of %d", procName, chunkNo, numChunks)
                indexDocL = []
                # One update time is shared by every record in the chunk.
                # NOTE(review): stamped as "rcsb_last_update" - confirm that expiration
                # queries on the match index use the same field name.
                timeS = tU.getDateTimeObj(tU.getTimestamp())
                for ccId in ccIdChunk:
                    # Get various forms from the search index -
                    chemIdList = self.__genChemIdList(ccId)
                    indexDoc = {"rcsb_id": ccId, "rcsb_last_update": timeS}
                    #
                    matchL = []
                    for chemId in chemIdList:
                        stA = time.time()
                        ok, refDL = self.__pcU.assemble(
                            chemId,
                            exportPath=exportPath,
                            matchIdOnly=matchIdOnly)
                        #
                        if not ok:
                            elapsed = time.time() - stA
                            logger.debug(
                                "Failing %s search source %s for %s (%.4f secs)",
                                chemId.identifierType, chemId.identifierSource,
                                chemId.idCode, elapsed)
                        #
                        if not (ok and refDL):
                            continue
                        for refD in refDL:
                            pcId = refD["cid"]
                            # Source descriptors come from the search index when available.
                            if chemId.indexName in self.__searchIdxD:
                                sourceD = self.__searchIdxD[chemId.indexName]
                            else:
                                sourceD = {}
                            inchiKey = sourceD["inchi-key"] if "inchi-key" in sourceD else None
                            smiles = sourceD["smiles"] if "smiles" in sourceD else None
                            matchL.append({
                                "matched_id": pcId,
                                "search_id_type": chemId.identifierType,
                                "search_id_source": chemId.identifierSource,
                                "source_index_name": chemId.indexName,
                                "source_smiles": smiles,
                                "source_inchikey": inchiKey,
                            })
                    #
                    if not matchL:
                        logger.info("No match result for any form of %s", ccId)
                    else:
                        indexDoc["matched_ids"] = matchL
                        successList.append(ccId)
                    #
                    indexDocL.append(indexDoc)
                # --
                saveStart = time.time()
                logger.info("Saving chunk %d (len=%d)", chunkNo, len(ccIdChunk))
                self.__updateObjectStore(self.__databaseName,
                                         self.__matchIndexCollectionName,
                                         indexDocL)
                saveEnd = time.time()
                logger.info("Saved chunk %d (len=%d) in %.3f secs", chunkNo,
                            len(ccIdChunk), saveEnd - saveStart)
        except Exception as err:
            logger.exception("Failing %s for %d data items %s", procName,
                             len(dataList), str(err))
        logger.info("%s dataList length %d success length %d rst1 %d rst2 %d",
                    procName, len(dataList), len(successList), len(retList1),
                    len(retList2))
        #
        return successList, noResultL, noResultL, diagList
Пример #9
0
    def updateList(self, dataList, procName, optionsD, workingDir):
        """Assemble reference data for the input "cid" identifiers and store the resulting
        records, chunk by chunk.  Exceptions are logged, not raised.

        Each assembled record is stamped with "rcsb_id" (copied from its "cid" value)
        and "rcsb_last_update" before being stored.

        Args:
            dataList (list): identifiers, each wrapped in a ChemicalIdentifier of type "cid"
            procName (str): worker process name (used in log messages)
            optionsD (dict): options - "chunkSize" (50), "exportPath" (None, path to
                             store raw request data)
            workingDir (str): unused

        Returns:
            (tuple): successList, [], [], [] - successList holds the identifiers that
                     produced reference data (the two middle items are the same empty
                     list object)
        """
        _ = workingDir
        chunkSize = optionsD.get("chunkSize", 50)
        # Path to store raw request data -
        exportPath = optionsD.get("exportPath", None)
        #
        successList = []
        # The next two lists are never filled here; only their lengths are logged.
        retList1 = []
        retList2 = []
        diagList = []
        noResultL = []
        # -
        try:
            tU = TimeUtil()
            pcidList = dataList
            numChunks = len(list(self.__chunker(pcidList, chunkSize)))
            logger.info(
                "%s search starting for %d reference definitions (in chunks of length %d)",
                procName, len(pcidList), chunkSize)
            for chunkNo, pcidChunk in enumerate(self.__chunker(pcidList, chunkSize), 1):
                logger.info("%s starting chunk for %d of %d", procName, chunkNo, numChunks)
                chunkDocL = []
                # One update time is shared by every record in the chunk.
                timeS = tU.getDateTimeObj(tU.getTimestamp())
                for pcid in pcidChunk:
                    chemId = ChemicalIdentifier(
                        idCode=pcid,
                        identifierType="cid",
                        identifier=pcid,
                        identifierSource="ccd-match",
                    )
                    #
                    stA = time.time()
                    ok, refDL = self.__pcU.assemble(chemId, exportPath=exportPath)
                    #
                    if not ok:
                        elapsed = time.time() - stA
                        logger.info(
                            "Failing %s search source %s for %s (%.4f secs)",
                            chemId.identifierType, chemId.identifierSource,
                            chemId.idCode, elapsed)
                    #
                    if not (ok and refDL):
                        logger.info("No match result for any form of %s", pcid)
                        continue
                    successList.append(pcid)
                    for refD in refDL:
                        refD.update({"rcsb_id": refD["cid"], "rcsb_last_update": timeS})
                        chunkDocL.append(refD)
                # --
                saveStart = time.time()
                logger.info("Saving chunk %d (len=%d)", chunkNo, len(pcidChunk))
                self.__updateObjectStore(self.__databaseName,
                                         self.__refDataCollectionName, chunkDocL)
                saveEnd = time.time()
                logger.info("Saved chunk %d (len=%d) in %.3f secs", chunkNo,
                            len(pcidChunk), saveEnd - saveStart)
        except Exception as err:
            logger.exception("Failing %s for %d data items %s", procName,
                             len(dataList), str(err))
        logger.info("%s dataList length %d success length %d rst1 %d rst2 %d",
                    procName, len(dataList), len(successList), len(retList1),
                    len(retList2))
        #
        return successList, noResultL, noResultL, diagList