示例#1
0
class UniProtCoreEtlWorker(object):
    """Prepare and load UniProt 'core' sequence reference data collections."""
    def __init__(self,
                 cfgOb,
                 cachePath,
                 useCache=True,
                 numProc=2,
                 chunkSize=10,
                 readBackCheck=False,
                 documentLimit=None,
                 doValidate=False,
                 verbose=False):
        self.__cfgOb = cfgOb
        self.__cachePath = cachePath
        self.__useCache = useCache
        self.__readBackCheck = readBackCheck
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        self.__documentLimit = documentLimit
        #
        self.__resourceName = "MONGO_DB"
        self.__verbose = verbose
        self.__statusList = []
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=self.__useCache)
        self.__docHelper = DocumentDefinitionHelper(cfgOb=self.__cfgOb)
        self.__valInst = None
        self.__doValidate = doValidate
        #

    def __updateStatus(self, updateId, databaseName, collectionName, status,
                       startTimestamp):
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def __getReferenceSequenceProvider(self):
        """ """
        try:
            rsaP = ReferenceSequenceAssignmentProvider(
                self.__cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_polymer_entity",
                polymerType="Protein",
                referenceDatabaseName="UniProt",
                provSource="PDB",
                useCache=self.__useCache,
                cachePath=self.__cachePath,
                fetchLimit=self.__documentLimit,
                siftsAbbreviated="TEST",
            )
            ok = rsaP.testCache()
            return ok, rsaP
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def load(self, updateId, extResource, loadType="full"):
        """Load sequence reference data"""
        try:
            self.__statusList = []
            desp = DataExchangeStatus()
            statusStartTimestamp = desp.setStartTime()
            #
            dList = indexL = []
            databaseName = collectionName = collectionVersion = None
            #
            if extResource == "UniProt":
                databaseName = "uniprot_core"
                # configName = self.__cfgOb.getDefaultSectionName()
                # dirPath = os.path.join(self.__cachePath, self.__cfgOb.get("EXDB_CACHE_DIR", self.__cfgOb.getDefaultSectionName()))
                #
                ok, rsP = self.__getReferenceSequenceProvider()
                if not ok:
                    return False
                #
                dList = rsP.getDocuments()
                logger.info("Resource %r extracted mapped document length %d",
                            extResource, len(dList))
                logger.debug("Objects %r", dList[:2])
                #
                cDL = self.__docHelper.getCollectionInfo(databaseName)
                collectionName = cDL[0]["NAME"]
                collectionVersion = cDL[0]["VERSION"]
                indexL = self.__docHelper.getDocumentIndexAttributes(
                    collectionName, "primary")
                logger.info(
                    "Database %r collection %r version %r index attributes %r",
                    databaseName, collectionName, collectionVersion, indexL)
                addValues = {}
            else:
                logger.error("Unsupported external resource %r", extResource)
            #
            if self.__doValidate:
                self.__valInst = self.__getValidator(databaseName,
                                                     collectionName,
                                                     schemaLevel="full")
                for dObj in dList:
                    self.__validateObj(databaseName,
                                       collectionName,
                                       dObj,
                                       label="Original")
            #
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            ok = dl.load(databaseName,
                         collectionName,
                         loadType=loadType,
                         documentList=dList,
                         indexAttributeList=indexL,
                         keyNames=None,
                         addValues=addValues)
            okS = self.__updateStatus(updateId, databaseName, collectionName,
                                      ok, statusStartTimestamp)

            return ok and okS
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def getLoadStatus(self):
        return self.__statusList

    def __getValidator(self, databaseName, collectionName, schemaLevel="full"):
        # _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
        # cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True)
        logger.info("Fetch schema for %r %r validation level %r", databaseName,
                    collectionName, schemaLevel)
        cD = self.__schP.getJsonSchema(databaseName,
                                       collectionName,
                                       encodingType="JSON",
                                       level=schemaLevel)
        # Raises exceptions for schema compliance.
        Draft4Validator.check_schema(cD)
        valInst = Draft4Validator(cD, format_checker=FormatChecker())
        return valInst

    def __validateObj(self, databaseName, collectionName, rObj, label=""):
        try:
            eCount = 0
            tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
            for error in sorted(self.__valInst.iter_errors(rObj), key=str):
                logger.info(
                    "Database %s collection %s (%s %r) path %s error: %s",
                    databaseName, collectionName, label, tId, error.path,
                    error.message)
                logger.debug(">>> Failing object is %r", rObj)
                if "rcsb_uniprot_feature" in rObj:
                    for dd in rObj["rcsb_uniprot_feature"]:
                        if "feature_id" in dd:
                            logger.info("feature_id %r", dd["feature_id"])
                        else:
                            logger.info("no feature_id keys %r",
                                        sorted(dd.keys()))
                            logger.info("description %r", dd["description"])
                eCount += 1
        except Exception as e:
            logger.exception("Validation failing %s", str(e))

        return eCount
示例#2
0
class SchemaProviderTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__cfgOb = ConfigUtil(configPath=pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=False)
        #
        self.__validationLevels = self.__cfgOb.getList(
            "VALIDATION_LEVELS_TEST",
            sectionName="database_catalog_configuration")
        self.__encodingTypes = self.__cfgOb.getList(
            "ENCODING_TYPES_TEST",
            sectionName="database_catalog_configuration")
        #
        buildAll = True
        if buildAll:
            self.__databaseNameList = self.__cfgOb.getList(
                "DATABASE_NAMES_DEPLOYED",
                sectionName="database_catalog_configuration")
            self.__dataTypingList = self.__cfgOb.getList(
                "DATATYPING_DEPLOYED",
                sectionName="database_catalog_configuration")
            #
        else:
            self.__databaseNameList = self.__cfgOb.getList(
                "DATABASE_NAMES_TEST",
                sectionName="database_catalog_configuration")
            self.__dataTypingList = self.__cfgOb.getList(
                "DATATYPING_TEST",
                sectionName="database_catalog_configuration")
        #
        self.__docHelper = DocumentDefinitionHelper(cfgOb=self.__cfgOb)
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testSchemaAccessDefault(self):
        for databaseName in self.__databaseNameList:
            cDL = self.__docHelper.getCollectionInfo(databaseName)
            for cD in cDL:
                collectionName = cD["NAME"]
                for encodingType in self.__encodingTypes:
                    if encodingType.lower() == "rcsb":
                        continue
                    for level in self.__validationLevels:
                        logger.debug("Loading ->%s %s %s %s", databaseName,
                                     collectionName, encodingType, level)
                        sD = self.__schP.getJsonSchema(
                            databaseName,
                            collectionName,
                            encodingType=encodingType,
                            level=level)
                        self.assertTrue(sD is not None)
示例#3
0
class DocumentLoader(object):
    def __init__(
        self,
        cfgOb,
        cachePath,
        resourceName="MONGO_DB",
        numProc=4,
        chunkSize=15,
        documentLimit=None,
        verbose=False,
        readBackCheck=False,
        maxStepLength=2000,
        schemaRebuildFlag=False,
    ):
        self.__verbose = verbose
        #
        # Limit the load length of each file type for testing  -  Set to None to remove -
        self.__documentLimit = documentLimit
        self.__maxStepLength = maxStepLength
        #
        # Controls for multiprocessing execution -
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        #
        self.__cfgOb = cfgOb
        self.__resourceName = resourceName
        #
        self.__cachePath = cachePath if cachePath else "."
        self.__schP = SchemaProvider(cfgOb,
                                     cachePath,
                                     useCache=True,
                                     rebuildFlag=schemaRebuildFlag)
        #
        self.__readBackCheck = readBackCheck
        self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s"
        #
        #

    def load(self,
             databaseName,
             collectionName,
             loadType="full",
             documentList=None,
             indexAttributeList=None,
             keyNames=None,
             schemaLevel="full",
             addValues=None):
        """Driver method for loading MongoDb content -


        loadType:     "full" or "replace"

        """
        try:
            startTime = self.__begin(message="loading operation")
            #

            #
            optionsD = {}
            optionsD["collectionName"] = collectionName
            optionsD["databaseName"] = databaseName
            optionsD["readBackCheck"] = self.__readBackCheck
            optionsD["loadType"] = loadType
            optionsD["keyNames"] = keyNames
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            #
            docList = documentList[:self.
                                   __documentLimit] if self.__documentLimit else documentList
            logger.debug("Full document list length %d limit %r",
                         len(documentList), self.__documentLimit)
            numProc = self.__numProc
            chunkSize = self.__chunkSize if docList and self.__chunkSize < len(
                docList) else 0
            #
            if addValues:
                try:
                    for doc in docList:
                        for k, v in addValues.items():
                            doc[k] = v
                except Exception as e:
                    logger.error("Add values %r fails with %s", addValues,
                                 str(e))

            #
            indAtList = indexAttributeList if indexAttributeList else []
            bsonSchema = None
            if schemaLevel and schemaLevel in ["min", "full"]:
                bsonSchema = self.__schP.getJsonSchema(databaseName,
                                                       collectionName,
                                                       encodingType="BSON",
                                                       level=schemaLevel)
                logger.debug("Using schema validation for %r %r %r",
                             databaseName, collectionName, schemaLevel)

            if loadType == "full":
                self.__removeCollection(databaseName, collectionName)
                ok = self.__createCollection(databaseName,
                                             collectionName,
                                             indAtList,
                                             bsonSchema=bsonSchema)
                logger.info("Collection %s create status %r", collectionName,
                            ok)
            elif loadType == "append":
                # create only if object does not exist -
                ok = self.__createCollection(databaseName,
                                             collectionName,
                                             indexAttributeNames=indAtList,
                                             checkExists=True,
                                             bsonSchema=bsonSchema)
                logger.debug("Collection %s create status %r", collectionName,
                             ok)
                # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            numDocs = len(docList)
            logger.debug("Processing %d total documents", numDocs)
            numProc = min(numProc, numDocs)
            maxStepLength = self.__maxStepLength
            if numDocs > maxStepLength:
                numLists = int(numDocs / maxStepLength)
                subLists = [docList[i::numLists] for i in range(numLists)]
            else:
                subLists = [docList]
            #
            if subLists:
                logger.debug(
                    "Starting with numProc %d outer subtask count %d subtask length ~ %d",
                    numProc, len(subLists), len(subLists[0]))
            #
            failList = []
            for ii, subList in enumerate(subLists):
                logger.debug("Running outer subtask %d of %d length %d",
                             ii + 1, len(subLists), len(subList))
                #
                mpu = MultiProcUtil(verbose=True)
                mpu.setOptions(optionsD=optionsD)
                mpu.set(workerObj=self, workerMethod="loadWorker")
                ok, failListT, _, _ = mpu.runMulti(dataList=subList,
                                                   numProc=numProc,
                                                   numResults=1,
                                                   chunkSize=chunkSize)
                failList.extend(failListT)
            logger.debug("Completed load with failing document list %r",
                         failList)
            logger.debug("Document list length %d failed load list length %d",
                         len(docList), len(failList))
            #

            self.__end(startTime, "loading operation with status " + str(ok))

            #
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return False

    def loadWorker(self, dataList, procName, optionsD, workingDir):
        """Multi-proc worker method for MongoDb document loading -"""
        try:
            startTime = self.__begin(message=procName)
            readBackCheck = optionsD["readBackCheck"]
            loadType = optionsD["loadType"]

            collectionName = optionsD["collectionName"]
            databaseName = optionsD["databaseName"]
            keyNames = optionsD["keyNames"]
            #
            logger.debug("%s databaseName %s collectionName %s workingDir %s",
                         procName, databaseName, collectionName, workingDir)
            #
            if dataList:
                ok, successList, failedList = self.__loadDocuments(
                    databaseName,
                    collectionName,
                    dataList,
                    loadType=loadType,
                    readBackCheck=readBackCheck,
                    keyNames=keyNames)
            #
            logger.debug(
                "%s database %s collection %s inputList length %d successList length %d  failed %d",
                procName,
                databaseName,
                collectionName,
                len(dataList),
                len(successList),
                len(failedList),
            )
            #

            self.__end(startTime, procName + " with status " + str(ok))
            return successList, [], []

        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return [], [], []

    # -------------- -------------- -------------- -------------- -------------- -------------- --------------
    #                                        ---  Supporting code follows ---
    #

    def __begin(self, message=""):
        startTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", message, ts)
        return startTime

    def __end(self, startTime, message=""):
        endTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        delta = endTime - startTime
        logger.debug("Completed %s at %s (%.4f seconds)", message, ts, delta)

    def __createCollection(self,
                           dbName,
                           collectionName,
                           indexAttributeNames=None,
                           checkExists=False,
                           bsonSchema=None):
        """Create database and collection and optionally a primary index -"""
        try:
            logger.debug("Create database %s collection %s", dbName,
                         collectionName)
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if checkExists and mg.databaseExists(
                        dbName) and mg.collectionExists(
                            dbName, collectionName):
                    ok1 = True
                else:
                    ok1 = mg.createCollection(dbName,
                                              collectionName,
                                              bsonSchema=bsonSchema)
                ok2 = mg.databaseExists(dbName)
                ok3 = mg.collectionExists(dbName, collectionName)
                okI = True
                if indexAttributeNames:
                    okI = mg.createIndex(dbName,
                                         collectionName,
                                         indexAttributeNames,
                                         indexName="primary",
                                         indexType="DESCENDING",
                                         uniqueFlag=False)

            return ok1 and ok2 and ok3 and okI
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def __removeCollection(self, dbName, collectionName):
        """Drop collection within database"""
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                #
                logger.debug("Remove collection database %s collection %s",
                             dbName, collectionName)
                logger.debug("Starting databases = %r", mg.getDatabaseNames())
                logger.debug("Starting collections = %r",
                             mg.getCollectionNames(dbName))
                ok = mg.dropCollection(dbName, collectionName)
                logger.debug("Databases = %r", mg.getDatabaseNames())
                logger.debug("Post drop collections = %r",
                             mg.getCollectionNames(dbName))
                ok = mg.collectionExists(dbName, collectionName)
                logger.debug("Post drop collections = %r",
                             mg.getCollectionNames(dbName))
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def __loadDocuments(self,
                        dbName,
                        collectionName,
                        docList,
                        loadType="full",
                        readBackCheck=False,
                        keyNames=None):
        #
        # Load database/collection with input document list -
        #
        failList = []
        rIdL = []
        successList = []
        logger.debug(
            "Loading dbName %s collectionName %s with document count %d keynames %r",
            dbName, collectionName, len(docList), keyNames)
        if keyNames:
            # map the document list to some document key if this is provided
            indD = {}
            indL = []
            try:
                for ii, doc in enumerate(docList):
                    dIdTup = self.__getKeyValues(doc, keyNames)
                    indD[dIdTup] = ii
                indL = list(range(len(docList)))
            except Exception as e:
                logger.exception("Failing ii %d d %r with %s", ii, doc, str(e))
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                #
                if loadType == "replace" and keyNames:
                    dTupL = mg.deleteList(dbName, collectionName, docList,
                                          keyNames)
                    logger.debug("Deleted document status %r", (dTupL, ))
                #
                rIdL = mg.insertList(dbName,
                                     collectionName,
                                     docList,
                                     keyNames=keyNames)
                logger.debug("Insert returns rIdL length %r", len(rIdL))

                # ---
                #  If there is a failure then determine the specific successes and failures -
                #
                successList = docList
                failList = []
                if len(rIdL) != len(docList):
                    if keyNames:
                        successIndList = []
                        for rId in rIdL:
                            rObj = mg.fetchOne(dbName, collectionName, "_id",
                                               rId)
                            dIdTup = self.__getKeyValues(rObj, keyNames)
                            successIndList.append(indD[dIdTup])
                        failIndList = list(set(indL) - set(successIndList))
                        failList = [docList[ii] for ii in failIndList]
                        successList = [docList[ii] for ii in successIndList]
                    else:
                        # fail the whole batch if we don't have visibility into each document
                        failList = docList
                        successList = []
                #
                rbStatus = True
                if readBackCheck and keyNames:
                    #
                    # Note that objects in docList are mutated by the insert operation with the additional key '_id',
                    # hence, it is possible to compare the fetched object with the input object.
                    #
                    for ii, rId in enumerate(rIdL):
                        rObj = mg.fetchOne(dbName, collectionName, "_id", rId)
                        dIdTup = self.__getKeyValues(rObj, keyNames)
                        jj = indD[dIdTup]
                        if rObj != docList[jj]:
                            rbStatus = False
                            break
                #
                if readBackCheck and not rbStatus:
                    return False, successList, failList
                #
            return len(rIdL) == len(docList), successList, failList
        except Exception as e:
            logger.exception("Failing %r %r (len=%d) %s with %s", dbName,
                             collectionName, len(docList), keyNames, str(e))
        return False, [], docList

    def __getKeyValues(self, dct, keyNames):
        """Return the tuple of values of corresponding to the input dictionary key names expressed in dot notation.

        Args:
            dct (dict): source dictionary object (nested)
            keyNames (list): list of dictionary keys in dot notatoin

        Returns:
            tuple: tuple of values corresponding to the input key names

        """
        rL = []
        try:
            for keyName in keyNames:
                rL.append(self.__getKeyValue(dct, keyName))
        except Exception as e:
            logger.exception("Failing for key names %r with %s", keyNames,
                             str(e))

        return tuple(rL)

    def __getKeyValue(self, dct, keyName):
        """Return the value of the corresponding key expressed in dot notation in the input dictionary object (nested)."""
        try:
            kys = keyName.split(".")
            for key in kys:
                try:
                    dct = dct[key]
                except KeyError:
                    return None
            return dct
        except Exception as e:
            logger.exception("Failing for key %r with %s", keyName, str(e))

        return None