class SchemaDataPrepValidateTests(unittest.TestCase):
    """Prepare loadable documents from repository content and validate them
    against generated JSON (Draft-4) collection schemas."""

    def setUp(self):
        self.__numProc = 2
        # Set a small limit (e.g. 200) to truncate repository scans while debugging.
        self.__fileLimit = None
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example-ihm.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=self.__configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        # Use cached schema artifacts; pass useCache=False, rebuildFlag=True to force a rebuild.
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath)
        #
        self.__birdRepoPath = self.__cfgOb.getPath("BIRD_REPO_PATH", sectionName=configName)
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__verbose = True
        #
        self.__modulePathMap = self.__cfgOb.get("DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        self.__testDirPath = os.path.join(HERE, "test-output", "pdbx-files")
        self.__testIhmDirPath = os.path.join(HERE, "test-output", "ihm-files")
        self.__export = True
        #
        # Extra schema options add extended parent/child reference info; set to None to disable.
        self.__extraOpts = 'addParentRefs|addPrimaryKey'
        #
        # Full catalog of database -> collection names (kept for reference; tests use self.__databaseNameD).
        self.__alldatabaseNameD = {
            "ihm_dev": ["ihm_dev"],
            "pdbx": ["pdbx", "pdbx_ext"],
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird": ["bird"],
            "bird_family": ["family"],
            "chem_comp": ["chem_comp"],
            "bird_chem_comp": ["bird_chem_comp"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }

        # Databases/collections actually exercised by the repository validation test.
        self.__databaseNameD = {
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }
        self.__mergeContentTypeD = {"pdbx_core": ["vrpt"]}
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testValidateOptsRepo(self):
        """Validate full-repository document preparation at the 'full' schema level."""
        schemaLevel = "full"
        inputPathList = None
        eCount = self.__testValidateOpts(databaseNameD=self.__databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        self.assertLessEqual(eCount, 1)

    @unittest.skip("Disable troubleshooting test")
    def testValidateOptsList(self):
        """Troubleshooting helper - validate an explicit list of previously failing paths in chunks."""
        schemaLevel = "min"
        inputPathList = self.__mU.doImport(os.path.join(HERE, "test-output", "failed-path.list"), "list")
        if not inputPathList:
            # Nothing to troubleshoot - treat the missing list as a no-op.
            return True
        databaseNameD = {"pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"]}
        for ii, subList in enumerate(chunkList(inputPathList[::-1], 40)):
            # NOTE(review): skips the first 5 chunks - troubleshooting leftover; adjust as needed.
            if ii < 5:
                continue
            eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=subList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
            logger.info("Chunk %d total validation errors schema level %s : %d", ii, schemaLevel, eCount)

    def testValidateOptsIhmRepo(self):
        """Validate I/HM repository document preparation at the 'min' schema level."""
        schemaLevel = "min"
        inputPathList = None
        self.__export = True
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)

    def testValidateOptsIhmList(self):
        """Validate I/HM documents prepared from local test files at the 'min' schema level."""
        schemaLevel = "min"
        inputPathList = glob.glob(self.__testIhmDirPath + "/*.cif")
        if not inputPathList:
            # No local I/HM test files - nothing to validate.
            return True
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)

    def __testValidateOpts(self, databaseNameD, inputPathList=None, schemaLevel="full", mergeContentTypeD=None):
        """Prepare and schema-validate documents for each database/collection.

        Args:
            databaseNameD (dict): database name -> list of collection names.
            inputPathList (list, optional): explicit locator list; defaults to a repository scan.
            schemaLevel (str): JSON schema validation level ("min"|"full").
            mergeContentTypeD (dict, optional): database name -> content types merged during preparation.

        Returns:
            int: total number of document validation errors encountered.
        """
        # Guard against the default None so membership lookups below are safe.
        mergeContentTypeD = mergeContentTypeD or {}
        eCount = 0
        for databaseName, collectionNameL in databaseNameD.items():
            mergeContentTypes = mergeContentTypeD.get(databaseName)
            _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
            pthList = inputPathList if inputPathList else self.__rpP.getLocatorObjList(databaseName, mergeContentTypes=mergeContentTypes)
            for collectionName in collectionNameL:
                cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True, extraOpts=self.__extraOpts)
                #
                dL, cnL = self.__testPrepDocumentsFromContainers(
                    pthList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=mergeContentTypes
                )
                # check_schema() raises for meta-schema compliance problems in the generated schema.
                try:
                    Draft4Validator.check_schema(cD)
                except Exception as e:
                    logger.error("%s %s schema validation fails with %s", databaseName, collectionName, str(e))
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                logger.info("Validating %d documents from %s %s", len(dL), databaseName, collectionName)
                for ii, dD in enumerate(dL):
                    logger.debug("Schema %s collection %s document %d", databaseName, collectionName, ii)
                    try:
                        cCount = 0
                        # Count every schema violation; without this loop eCount was always 0
                        # and the caller's assertion was vacuous.
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info("schema %s collection %s (%s) path %s error: %s", databaseName, collectionName, cnL[ii], error.path, error.message)
                            logger.debug("Failing document %d : %r", ii, list(dD.items()))
                            eCount += 1
                            cCount += 1
                        if cCount > 0:
                            logger.info("schema %s collection %s container %s error count %d", databaseName, collectionName, cnL[ii], cCount)
                    except Exception as e:
                        logger.exception("Validation processing error %s", str(e))

        return eCount

    def __testPrepDocumentsFromContainers(self, inputPathList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=None):
        """Create loadable document lists from repository files.

        Applies dictionary methods to each container, then transforms the
        container content into collection documents.

        Returns:
            (list, list): (document list, corresponding container name list).
        """
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo(databaseName)
            #
            dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=False)
            dictApi = dP.getApiByName(databaseName)
            rP = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP)
            #
            dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=self.__fTypeRow)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose)
            containerList = self.__rpP.getContainerList(inputPathList)
            for container in containerList:
                cName = container.getName()
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
                if self.__export:
                    # Export the method-annotated container for inspection.
                    savePath = os.path.join(HERE, "test-output", cName + "-with-method.cif")
                    self.__mU.doExport(savePath, [container], fmt="mmcif")
            #
            tableIdExcludeList = sd.getCollectionExcluded(collectionName)
            tableIdIncludeList = sd.getCollectionSelected(collectionName)
            sliceFilter = sd.getCollectionSliceFilter(collectionName)
            sdp.setSchemaIdExcludeList(tableIdExcludeList)
            sdp.setSchemaIdIncludeList(tableIdIncludeList)
            #
            docList, containerNameList, _ = sdp.processDocuments(
                containerList, styleType=styleType, filterType=self.__fTypeRow, dataSelectors=["PUBLIC_RELEASE"], sliceFilter=sliceFilter, collectionName=collectionName
            )

            docList = sdp.addDocumentPrivateAttributes(docList, collectionName)
            docList = sdp.addDocumentSubCategoryAggregates(docList, collectionName)
            #
            mergeS = "-".join(mergeContentTypes) if mergeContentTypes else ""
            if self.__export and docList:
                for ii, doc in enumerate(docList):
                    cn = containerNameList[ii]
                    fp = os.path.join(HERE, "test-output", "prep-%s-%s-%s-%s.json" % (cn, databaseName, collectionName, mergeS))
                    self.__mU.doExport(fp, [doc], fmt="json", indent=3)
                    logger.debug("Exported %r", fp)
            #
            return docList, containerNameList

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
# --- Example 2 ---
class SchemaDefAccessTests(unittest.TestCase):
    """Exercise the SchemaDefAccess accessor API over freshly built schema definitions."""

    def setUp(self):
        self.__verbose = True
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True,
                                     clearPath=False)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testAccess(self):
        """Run accessor checks over each database/data-typing combination."""
        databaseNames = ["pdbx_core", "bird_chem_comp_core"]
        dataTypingList = ["ANY", "SQL"]
        for databaseName in databaseNames:
            for dataTyping in dataTypingList:
                self.__testAccess(databaseName, dataTyping)

    def __testAccess(self, databaseName, dataTyping):
        """Build a schema definition and verify its accessors; fail the test on any exception."""
        try:
            sD = self.__schP.makeSchemaDef(databaseName,
                                           dataTyping=dataTyping,
                                           saveSchema=False)
            ok = self.__testAccessors(sD)
            self.assertTrue(ok)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __testAccessors(self, schemaDef):
        """Verify data and accessor mapping over the input schema definition.

        Returns:
            bool: True on completion (accessor errors surface as exceptions/assertions).
        """
        sd = SchemaDefAccess(schemaDef)
        logger.debug("Schema name %s", sd.getName())
        logger.debug("Schema name %s", sd.getAppName())

        logger.debug("Database name %s", sd.getDatabaseName())
        logger.debug("Versioned database name %s",
                     sd.getVersionedDatabaseName())

        logger.debug("Collection info %r", sd.getCollectionInfo())

        for dS in sd.getDataSelectorNames():
            logger.debug("Selector %s %r", dS, sd.getDataSelectors(dS))

        collectionInfoL = sd.getCollectionInfo()
        for dD in collectionInfoL:
            collectionName = dD["NAME"]

            logger.debug("Collection excluded %r",
                         sd.getCollectionExcluded(collectionName))
            logger.debug("Collection included %r",
                         sd.getCollectionSelected(collectionName))
            logger.debug("Collection document key attribute names %r",
                         sd.getDocumentKeyAttributeNames(collectionName))

        schemaIdList = sd.getSchemaIdList()
        for schemaId in schemaIdList:
            # The id list from the schema and from the schema object must agree.
            aIdL = sd.getAttributeIdList(schemaId)
            tObj = sd.getSchemaObject(schemaId)
            attributeIdList = tObj.getAttributeIdList()
            self.assertEqual(len(aIdL), len(attributeIdList))
            attributeNameList = tObj.getAttributeNameList()
            logger.debug("Ordered attribute Id   list %s",
                         str(attributeIdList))
            logger.debug("Ordered attribute name list %s",
                         str(attributeNameList))
            #
            mAL = tObj.getMapAttributeNameList()
            logger.debug("Ordered mapped attribute name list %s", str(mAL))

            mAL = tObj.getMapAttributeIdList()
            logger.debug("Ordered mapped attribute id   list %s", str(mAL))

            cL = tObj.getMapInstanceCategoryList()
            logger.debug("Mapped category list %s", str(cL))

            for cV in cL:
                aL = tObj.getMapInstanceAttributeList(cV)
                logger.debug("Mapped attribute list in %s :  %s", cV, str(aL))
        return True
# --- Example 3 ---
class SchemaDefLoaderDbTests(unittest.TestCase):
    """Create MySQL table schemas and batch-load BIRD/chem_comp/PDBx repository content."""

    def __init__(self, methodName="runTest"):
        super(SchemaDefLoaderDbTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        # Optional provider types are excluded on non-macOS hosts.
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        self.__verbose = True
        #
        fileLimit = 100
        numProc = 2
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__workPath = os.path.join(HERE, "test-output")
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__resourceName = "MYSQL_DB"
        #
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=numProc,
                                        fileLimit=fileLimit,
                                        cachePath=self.__cachePath)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def __schemaCreate(self, schemaDefObj):
        """Create database and table schema from the input schema definition."""
        try:
            tableIdList = schemaDefObj.getSchemaIdList()
            sqlGen = SqlGenAdmin(self.__verbose)
            sqlL = sqlGen.createDatabaseSQL(schemaDefObj.getDatabaseName())
            for tableId in tableIdList:
                tableDefObj = schemaDefObj.getSchemaObject(tableId)
                sqlL.extend(
                    sqlGen.createTableSQL(
                        databaseName=schemaDefObj.getDatabaseName(),
                        tableDefObj=tableDefObj))

            logger.debug("Schema creation SQL string\n %s\n\n",
                         "\n".join(sqlL))
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                myQ = MyDbQuery(dbcon=client, verbose=self.__verbose)
                #
                # Permit warnings to support "drop table if exists" for missing tables.
                #
                myQ.setWarning("ignore")
                ret = myQ.sqlCommand(sqlCommandList=sqlL)
                logger.debug("\n\n+INFO mysql server returns %r\n", ret)
                self.assertTrue(ret)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __makeLoader(self, sd, client):
        """Construct a SchemaDefLoader bound to the input schema definition and open connection."""
        return SchemaDefLoader(
            self.__cfgOb,
            schemaDefObj=sd,
            dbCon=client,
            cachePath=self.__cachePath,
            workPath=self.__workPath,
            cleanUp=False,
            warnings="error",
            verbose=self.__verbose,
            restoreUseStash=False,
            restoreUseGit=True,
            providerTypeExclude=self.__excludeType,
        )

    # ------------- - -------------------------------------------------------------------------------------------
    def testSchemaCreate(self):
        """Create table schema for BIRD and chemical component data (PDBx disabled)."""
        cD = self.__schP.makeSchemaDef("bird",
                                       dataTyping="SQL",
                                       saveSchema=True)
        sd = SchemaDefAccess(cD)
        self.__schemaCreate(sd)
        #
        cD = self.__schP.makeSchemaDef("chem_comp",
                                       dataTyping="SQL",
                                       saveSchema=True)
        sd = SchemaDefAccess(cD)
        self.__schemaCreate(sd)
        #
        # PDBx schema creation disabled pending a mysql-compatible schema:
        # cD = self.__schP.makeSchemaDef("pdbx", dataTyping="SQL", saveSchema=True)
        # sd = SchemaDefAccess(cD)
        # self.__schemaCreate(sd)

    def testLoadBirdReference(self):
        """Batch-file load BIRD and BIRD family reference data."""
        try:
            cD = self.__schP.makeSchemaDef("bird",
                                           dataTyping="SQL",
                                           saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)

            inputPathList = self.__rpP.getLocatorObjList(contentType="bird")
            inputPathList.extend(
                self.__rpP.getLocatorObjList(contentType="bird_family"))
            #
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = self.__makeLoader(sd, client)
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-file")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReLoadBirdReference(self):
        """Reload BIRD reference data exercising the 'all' and 'selected' delete options."""
        try:
            cD = self.__schP.makeSchemaDef("bird",
                                           dataTyping="SQL",
                                           saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)

            inputPathList = self.__rpP.getLocatorObjList(contentType="bird")
            inputPathList.extend(
                self.__rpP.getLocatorObjList(contentType="bird_family"))
            #
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = self.__makeLoader(sd, client)
                sdl.load(inputPathList=inputPathList, loadType="batch-file")
                #
                logger.debug(
                    "INFO BATCH FILE RELOAD TEST --------------------------------------------\n"
                )
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-file",
                              deleteOpt="all")
                self.assertTrue(ok)
                #
                logger.debug(
                    "\n\n\n+INFO BATCH INSERT RELOAD TEST --------------------------------------------\n"
                )
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-file",
                              deleteOpt="selected")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadChemCompReference(self):
        """Batch-file load chemical component reference data."""
        try:
            cD = self.__schP.makeSchemaDef("chem_comp",
                                           dataTyping="SQL",
                                           saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)

            inputPathList = self.__rpP.getLocatorObjList(
                contentType="chem_comp")
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = self.__makeLoader(sd, client)
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-file")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skip("Disable test - schema not optimized for mysql limitations")
    def testLoadPdbxFiles(self):
        """Batch-insert load PDBx entry files."""
        try:
            cD = self.__schP.makeSchemaDef("pdbx",
                                           dataTyping="SQL",
                                           saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)

            inputPathList = self.__rpP.getLocatorObjList(contentType="pdbx")
            logger.debug("Input path list %r", inputPathList)
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = self.__makeLoader(sd, client)
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-insert",
                              deleteOpt="all")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
# --- Example 4 ---
class SchemaDefBuildTests(unittest.TestCase):
    """Build schema definitions and collection schemas for the configured databases."""

    def setUp(self):
        self.__verbose = True
        topMockPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        cfgPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        cfgSection = "site_info_configuration"
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__cfgOb = ConfigUtil(configPath=cfgPath, defaultSectionName=cfgSection, mockTopPath=topMockPath)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=False)
        #
        catalogSection = "database_catalog_configuration"
        self.__validationLevels = self.__cfgOb.getList("VALIDATION_LEVELS_TEST", sectionName=catalogSection)
        self.__encodingTypes = self.__cfgOb.getList("ENCODING_TYPES_TEST", sectionName=catalogSection)
        #
        # buildAll selects the full deployed catalog; flip to False for the smaller test catalog.
        buildAll = True
        if buildAll:
            self.__databaseNameList = self.__cfgOb.getList("DATABASE_NAMES_DEPLOYED", sectionName=catalogSection)
            self.__dataTypingList = self.__cfgOb.getList("DATATYPING_DEPLOYED", sectionName=catalogSection)
        else:
            self.__databaseNameList = self.__cfgOb.getList("DATABASE_NAMES_TEST", sectionName=catalogSection)
            self.__dataTypingList = self.__cfgOb.getList("DATATYPING_TEST", sectionName=catalogSection)
        #
        self.__saveSchema = True
        self.__compareDefSchema = False
        self.__compareSchema = False
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        elapsed = time.time() - self.__startTime
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()), elapsed)

    def testBuildSchemaDefs(self):
        """Build (and optionally compare) schema definitions for every database/data-typing pair."""
        try:
            for dbName in self.__databaseNameList:
                for typing in self.__dataTypingList:
                    logger.debug("Building schema %s with types %s", dbName, typing)
                    self.__schP.makeSchemaDef(dbName, dataTyping=typing, saveSchema=self.__saveSchema)
                    if self.__compareDefSchema:
                        self.__schP.schemaDefCompare(dbName, typing)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testBuildCollectionSchema(self):
        """Generate collection schemas at each encoding type and validation level."""
        difPathList = []
        for dbName in self.__databaseNameList:
            schemaDefD = self.__schP.makeSchemaDef(dbName, dataTyping="ANY", saveSchema=False)
            accessObj = SchemaDefAccess(schemaDefD)
            for collectionD in accessObj.getCollectionInfo():
                collectionName = collectionD["NAME"]
                for encodingType in self.__encodingTypes:
                    # The "rcsb" encoding is not generated here.
                    if encodingType.lower() == "rcsb":
                        continue
                    for level in self.__validationLevels:
                        self.__schP.makeSchema(dbName, collectionName,
                                               encodingType=encodingType,
                                               level=level,
                                               saveSchema=self.__saveSchema)
                        if self.__compareSchema and encodingType.lower() == "json":
                            pth = self.__schP.jsonSchemaCompare(dbName, collectionName, encodingType, level)
                            if pth:
                                difPathList.append(pth)
        if difPathList:
            logger.info("Path dif list %r", difPathList)

    def testCompareSchema(self):
        """Compare a freshly generated pdbx_core_entry schema with a saved reference copy."""
        databaseName = "pdbx_core"
        collectionName = "pdbx_core_entry"
        encodingType = "json"
        level = "full"
        #
        oldPath = os.path.join(
            HERE, "test-saved-output",
            "json-full-db-pdbx_core-col-pdbx_core_entry.json")
        mU = MarshalUtil(workPath=os.path.join(HERE, "test-output"))
        sOld = mU.doImport(oldPath, fmt="json")
        sNew = self.__schP.makeSchema(databaseName, collectionName,
                                      encodingType=encodingType, level=level)
        numDif, difD = self.__schP.schemaCompare(sOld, sNew)
        logger.debug("numDiffs %d", numDif)
        # Expected drift thresholds relative to the saved reference schema.
        self.assertGreaterEqual(numDif, 141)
        self.assertGreaterEqual(len(difD["changed"]), 160)
        logger.debug("difD %r", difD)

    @unittest.skip("Deprecated test")
    def testCompareSchemaCategories(self):
        """Compare common categories across schema definitions."""
        try:
            sdCc = SchemaDefAccess(
                self.__schP.makeSchemaDef("chem_comp_core",
                                          dataTyping="ANY",
                                          saveSchema=False))
            sdBcc = SchemaDefAccess(
                self.__schP.makeSchemaDef("bird_chem_comp_core",
                                          dataTyping="ANY",
                                          saveSchema=False))
            #
            logger.info("")
            for schemaId in ["CHEM_COMP", "PDBX_CHEM_COMP_AUDIT"]:
                ccAttrs = sdCc.getAttributeIdList(schemaId)
                bccAttrs = sdBcc.getAttributeIdList(schemaId)

                logger.debug("%s attributes (%d) %r", schemaId, len(ccAttrs), ccAttrs)
                logger.debug("%s attributes (%d) %r", schemaId, len(bccAttrs), bccAttrs)

                # Attributes present in chem_comp_core must all exist in bird_chem_comp_core.
                missing = set(ccAttrs) - set(bccAttrs)
                if missing:
                    logger.info("For %s attribute differences %r", schemaId, missing)
                self.assertEqual(len(missing), 0)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testBuildColSchemaWithRefs(self):
        """Generate I/HM collection schemas with extended parent/child reference options."""
        for dbName in ["ihm_dev_full"]:
            schemaDefD = self.__schP.makeSchemaDef(dbName, dataTyping="ANY", saveSchema=False)
            accessObj = SchemaDefAccess(schemaDefD)
            for collectionD in accessObj.getCollectionInfo():
                collectionName = collectionD["NAME"]
                for encodingType in self.__encodingTypes:
                    if encodingType.lower() == "rcsb":
                        continue
                    for level in self.__validationLevels:
                        self.__schP.makeSchema(
                            dbName,
                            collectionName,
                            encodingType=encodingType,
                            level=level,
                            saveSchema=True,
                            extraOpts="addParentRefs|addPrimaryKey")
class RepoHoldingsDataPrepValidateTests(unittest.TestCase):
    """Validate legacy repository holdings and status documents against
    generated JSON schemas."""

    def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                         "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__updateId = "2018_25"
        # When True, each validated document list is also exported as JSON
        # into the test-output directory.
        self.__export = False
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH",
                                                  sectionName=configName)
        #
        self.__mU = MarshalUtil(workPath=self.__cachePath)

        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testValidateOptsStrict(self):
        """Validate all holdings collections at the strict ('full') schema level."""
        updateId = self.__updateId
        schemaLevel = "full"
        eCount = self.__testValidateOpts(updateId, schemaLevel=schemaLevel)
        logger.info("Total validation errors schema level %s : %d",
                    schemaLevel, eCount)
        # assertLessEqual reports both operands on failure.
        self.assertLessEqual(eCount, 1)

    @unittest.skip("Troubleshooting test")
    def testValidateOptsMin(self):
        """Validate all holdings collections at the minimal ('min') schema level."""
        updateId = self.__updateId
        schemaLevel = "min"
        eCount = self.__testValidateOpts(updateId, schemaLevel=schemaLevel)
        logger.info("Total validation errors schema level %s : %d",
                    schemaLevel, eCount)
        self.assertLessEqual(eCount, 1)

    def __testValidateOpts(self, updateId, schemaLevel="full"):
        """Build a JSON schema for each holdings collection and validate the
        corresponding documents against it.

        Args:
            updateId (str): data update identifier (e.g. "2018_25").
            schemaLevel (str): schema validation level ("full" or "min").

        Returns:
            int: total number of validation errors across all collections.
        """
        schemaNames = ["repository_holdings"]
        collectionNames = {
            "repository_holdings": [
                "repository_holdings_update_entry",
                "repository_holdings_current_entry",
                "repository_holdings_unreleased_entry",
                "repository_holdings_removed_entry",
                "repository_holdings_combined_entry",
            ],
            # NOTE(review): not exercised here (schemaNames covers only
            # repository_holdings); retained for reference.
            "entity_sequence_clusters":
            ["cluster_members", "cluster_provenance", "entity_members"],
        }
        #
        eCount = 0
        for schemaName in schemaNames:
            for collectionName in collectionNames[schemaName]:
                _ = self.__schP.makeSchemaDef(schemaName,
                                              dataTyping="ANY",
                                              saveSchema=True)
                cD = self.__schP.makeSchema(schemaName,
                                            collectionName,
                                            encodingType="JSON",
                                            level=schemaLevel,
                                            saveSchema=True)
                dL = self.__getRepositoryHoldingsDocuments(
                    schemaName, collectionName, updateId)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output",
                                            collectionName + ".json")
                    self.__mU.doExport(savePath, dL, fmt="json", indent=3)
                # Raises exceptions for schema compliance.
                Draft4Validator.check_schema(cD)
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                for ii, dD in enumerate(dL):
                    logger.debug("Schema %s collection %s document %d",
                                 schemaName, collectionName, ii)
                    try:
                        cCount = 0
                        # Sort errors for a deterministic report order.
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info(
                                "schema %s collection %s path %s error: %s",
                                schemaName, collectionName, error.path,
                                error.message)
                            logger.info(">>>")
                            logger.info(">>> failing object is %r", dD)
                            logger.info(">>>")
                            eCount += 1
                            cCount += 1
                        #
                        logger.debug("schema %s collection %s count %d",
                                     schemaName, collectionName, cCount)
                    except Exception as e:
                        logger.exception("Validation error %s", str(e))

        return eCount

    def __getRepositoryHoldingsDocuments(self, schemaName, collectionName,
                                         updateId):
        """Fetch the holdings documents for *collectionName* via RepoHoldingsDataPrep.

        Returns an empty list for unrecognized collection names; fails the
        test on any exception.
        """
        rL = []
        try:
            rhdp = RepoHoldingsDataPrep(cfgOb=self.__cfgOb,
                                        sandboxPath=self.__sandboxPath,
                                        workPath=self.__cachePath)
            if collectionName == "repository_holdings_update_entry":
                rL = rhdp.getHoldingsUpdateEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("update data length %r", len(rL))
            #
            elif collectionName == "repository_holdings_current_entry":
                rL = rhdp.getHoldingsCurrentEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("holdings data length %r", len(rL))
            #
            elif collectionName == "repository_holdings_unreleased_entry":
                rL = rhdp.getHoldingsUnreleasedEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("unreleased data length %r", len(rL))
            #
            elif collectionName == "repository_holdings_removed_entry":
                rL = rhdp.getHoldingsRemovedEntry(updateId=updateId)
                # Fix: the previous inner guard compared against the wrong
                # name ("repository_holdings_removed"), which could never
                # match in this branch, so the size check was dead code.
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("removed data length %r", len(rL))
            elif collectionName == "repository_holdings_combined_entry":
                rL = rhdp.getHoldingsCombinedEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("holdings data length %r", len(rL))

            #
        except Exception as e:
            logger.exception("%s %s failing with %s", schemaName,
                             collectionName, str(e))
            self.fail()

        return rL
# Example 6
class SchemaDefCompareTests(unittest.TestCase):
    """Compare regenerated schema definitions and collection JSON schemas
    with the cached copies, reporting any difference files produced."""

    # Class-level switch; both comparison tests are skipped while this is True.
    skipFlag = True

    def setUp(self):
        self.__verbose = True
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        configName = "site_info_configuration"
        catalogSection = "database_catalog_configuration"
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__cfgOb = ConfigUtil(configPath=pathConfig, defaultSectionName=configName, mockTopPath=mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        #
        self.__validationLevels = self.__cfgOb.getList("VALIDATION_LEVELS_TEST", sectionName=catalogSection)
        self.__encodingTypes = self.__cfgOb.getList("ENCODING_TYPES_TEST", sectionName=catalogSection)
        #
        # Select between the full deployed catalog and the smaller test catalog.
        buildAll = True
        if buildAll:
            self.__databaseNameList = self.__cfgOb.getList("DATABASE_NAMES_DEPLOYED", sectionName=catalogSection)
            self.__dataTypingList = self.__cfgOb.getList("DATATYPING_DEPLOYED", sectionName=catalogSection)
        else:
            self.__databaseNameList = self.__cfgOb.getList("DATABASE_NAMES_TEST", sectionName=catalogSection)
            # self.__databaseNameList = ["repository_holdings"]
            self.__dataTypingList = self.__cfgOb.getList("DATATYPING_TEST", sectionName=catalogSection)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    @unittest.skipIf(skipFlag, "Troubleshooting test")
    def testCompareSchemaDefs(self):
        """Regenerate each schema definition and log any differences from the cached version."""
        try:
            diffPaths = []
            for dbName in self.__databaseNameList:
                for typing in self.__dataTypingList:
                    logger.debug("Building schema %s with types %s", dbName, typing)
                    resultPath = self.__schP.schemaDefCompare(dbName, typing)
                    if resultPath:
                        diffPaths.append(resultPath)
            if diffPaths:
                logger.info("Schema definition difference path list %r", [os.path.split(p)[1] for p in diffPaths])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skipIf(skipFlag, "Troubleshooting test")
    def testCompareCollectionSchema(self):
        """Regenerate each collection JSON schema and log any differences from the cached version."""
        try:
            diffPaths = []
            for dbName in self.__databaseNameList:
                defD = self.__schP.makeSchemaDef(dbName, dataTyping="ANY", saveSchema=False)
                for collInfoD in SchemaDefAccess(defD).getCollectionInfo():
                    collName = collInfoD["NAME"]
                    # Only the JSON encoding participates in the comparison.
                    jsonEncodings = [et for et in self.__encodingTypes if et.lower() == "json"]
                    for encType in jsonEncodings:
                        for vLevel in self.__validationLevels:
                            resultPath = self.__schP.jsonSchemaCompare(dbName, collName, encType, vLevel)
                            if resultPath:
                                diffPaths.append(resultPath)
            if diffPaths:
                logger.info("JSON schema difference path list %r", [os.path.split(p)[1] for p in diffPaths])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
# Example 7
class ClusterDataPrepValidateTests(unittest.TestCase):
    """Validate sequence cluster documents against generated JSON schemas."""

    def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__updateId = "2018_25"
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        #
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
        #
        self.__dataSetId = "2018_23"
        self.__pathClusterData = self.__cfgOb.getPath("RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=configName)
        # Sequence identity levels to extract; the full set is
        # ['100', '95', '90', '70', '50', '30'].
        self.__levels = ["100"]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testValidateOptsStrict(self):
        """Validate cluster collections at the strict ('full') validation level."""
        updateId = self.__updateId
        validationLevel = "full"
        eCount = self.__testValidateOpts(updateId, validationLevel=validationLevel)
        logger.info("Total validation errors validation level %s : %d", validationLevel, eCount)
        # assertLessEqual reports both operands on failure.
        self.assertLessEqual(eCount, 1)

    def __testValidateOpts(self, updateId, validationLevel="full"):
        """Build a JSON schema for each cluster collection and validate the
        extracted documents against it.

        Args:
            updateId (str): unused here; retained for interface symmetry.
            validationLevel (str): schema validation level ("full" or "min").

        Returns:
            int: total number of validation errors across all collections.
        """
        _ = updateId
        databaseNames = ["sequence_clusters"]
        collectionNames = {"sequence_clusters": ["cluster_provenance", "cluster_members", "entity_members"]}
        #
        eCount = 0
        for databaseName in databaseNames:
            for collectionName in collectionNames[databaseName]:
                _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
                cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=validationLevel, saveSchema=True)
                #
                dL = self.__getSequenceClusterData(collectionName, levels=self.__levels, dataSetId=self.__dataSetId, dataLocator=self.__pathClusterData)
                # Raises exceptions for schema compliance.
                Draft4Validator.check_schema(cD)
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                # Iterate documents directly; the index was unused.
                for dD in dL:
                    try:
                        cCount = 0
                        # Sort errors for a deterministic report order.
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info("schema %s collection %s path %s error: %s", databaseName, collectionName, error.path, error.message)
                            logger.info(">>> failing object is %r", dD)
                            eCount += 1
                            cCount += 1
                        #
                        logger.debug("schema %s collection %s count %d", databaseName, collectionName, cCount)
                    except Exception as e:
                        logger.exception("Validation error %s", str(e))

        return eCount

    def __fetchProvenance(self):
        """Fetch the sequence cluster provenance dictionary (empty dict if absent)."""
        try:
            provKeyName = "rcsb_entity_sequence_cluster_prov"
            provU = ProvenanceProvider(self.__cfgOb, self.__cachePath, useCache=True)
            pD = provU.fetch()
            # dict.get replaces the redundant membership test + second lookup.
            return pD.get(provKeyName, {})
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __getSequenceClusterData(self, collectionName, dataSetId=None, dataLocator=None, levels=None):
        """Extract example sequence cluster documents for *collectionName*.

        Returns None for unrecognized collection names or on failure.
        """
        try:
            #
            if collectionName == "cluster_provenance":
                return [self.__fetchProvenance()]
            #
            entitySchemaName = "rcsb_entity_sequence_cluster_list"
            # NOTE(review): "identifer" spelling preserved -- it presumably must
            # match the schema name used by ClusterDataPrep; confirm before fixing.
            clusterSchemaName = "rcsb_entity_sequence_cluster_identifer_list"
            cdp = ClusterDataPrep(workPath=self.__cachePath, entitySchemaName=entitySchemaName, clusterSchemaName=clusterSchemaName)
            cifD, docBySequenceD, docByClusterD = cdp.extract(dataSetId, clusterSetLocator=dataLocator, levels=levels, clusterType="entity")
            self.assertEqual(len(cifD), 1)
            self.assertEqual(len(docBySequenceD), 1)
            self.assertEqual(len(docByClusterD), 1)
            if collectionName == "entity_members":
                return docBySequenceD[entitySchemaName]
            if collectionName == "cluster_members":
                return docByClusterD[clusterSchemaName]

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
        return None
# Example 8
def main():
    """Command line entry point for building or comparing schema artifacts.

    Target databases are selected either through individual ``--update_*``
    flags or through the ``--update_config_*`` options, which read the target
    lists from the configuration catalog.  With ``--compare_only``, freshly
    generated schemas are diffed against cached copies instead of being
    rebuilt and saved.  Exits with status 1 on configuration errors.
    """
    import sys  # function-scope import: used only for sys.exit below

    parser = argparse.ArgumentParser()
    #
    defaultConfigName = "site_info_configuration"
    #
    # (flag attribute, database name, help text) for every schema target;
    # registration order determines --help ordering.
    databaseFlagL = [
        ("update_chem_comp_ref", "chem_comp",
         "Update schema for Chemical Component reference definitions"),
        ("update_chem_comp_core_ref", "chem_comp_core",
         "Update core schema for Chemical Component reference definitions"),
        ("update_bird_chem_comp_ref", "bird_chem_comp",
         "Update schema for Bird Chemical Component reference definitions"),
        ("update_bird_chem_comp_core_ref", "bird_chem_comp_core",
         "Update core schema for Bird Chemical Component reference definitions"),
        ("update_bird_ref", "bird",
         "Update schema for Bird reference definitions"),
        ("update_bird_family_ref", "bird_family",
         "Update schema for Bird Family reference definitions"),
        ("update_pdbx", "pdbx",
         "Update schema for PDBx entry data"),
        ("update_pdbx_core", "pdbx_core",
         "Update schema for PDBx core entry/entity data"),
        ("update_pdbx_comp_model_core", "pdbx_comp_model_core",
         "Update schema for PDBx computational model core entry/entity data"),
        ("update_repository_holdings", "repository_holdings",
         "Update schema for repository holdings"),
        ("update_entity_sequence_clusters", "sequence_clusters",
         "Update schema for entity sequence clusters"),
        ("update_data_exchange", "data_exchange",
         "Update schema for data exchange status"),
        ("update_ihm_dev", "ihm_dev",
         "Update schema for I/HM dev entry data"),
        ("update_drugbank_core", "drugbank_core",
         "Update DrugBank schema"),
    ]
    for flagName, _, helpText in databaseFlagL:
        parser.add_argument("--" + flagName,
                            default=False,
                            action="store_true",
                            help=helpText)
    #
    parser.add_argument(
        "--update_config_all",
        default=False,
        action="store_true",
        help="Update using configuration settings (e.g. DATABASE_NAMES_ALL)")
    parser.add_argument(
        "--update_config_deployed",
        default=False,
        action="store_true",
        help=
        "Update using configuration settings (e.g. DATABASE_NAMES_DEPLOYED)")
    parser.add_argument(
        "--update_config_test",
        default=False,
        action="store_true",
        help="Update using configuration settings (e.g. DATABASE_NAMES_TEST)")
    #
    parser.add_argument("--config_path",
                        default=None,
                        help="Path to configuration options file")
    parser.add_argument("--config_name",
                        default=defaultConfigName,
                        help="Configuration section name")
    #
    parser.add_argument("--cache_path",
                        default=None,
                        help="Schema cache directory path")
    parser.add_argument(
        "--encoding_types",
        default=None,
        help="Schema encoding (rcsb|json|bson) (comma separated)")
    parser.add_argument(
        "--validation_levels",
        default=None,
        help="Schema validation level (full|min) (comma separated)")
    parser.add_argument("--compare_only",
                        default=False,
                        action="store_true",
                        help="Perform comparison with cached schema")
    #
    parser.add_argument("--debug",
                        default=False,
                        action="store_true",
                        help="Turn on verbose logging")
    parser.add_argument(
        "--mock",
        default=False,
        action="store_true",
        help="Use MOCK repository configuration for dependencies and testing")
    # parser.add_argument("--working_path", default=None, help="Working/alternative path for temporary and schema files")
    args = parser.parse_args()
    #
    if args.debug:
        logger.setLevel(logging.DEBUG)
    # ----------------------- - ----------------------- - ----------------------- -
    #                              Configuration Details
    configPath = args.config_path
    configName = args.config_name
    cachePath = args.cache_path
    compareOnly = args.compare_only
    #
    encodingTypes = args.encoding_types.split(
        ",") if args.encoding_types else []
    validationLevels = args.validation_levels.split(
        ",") if args.validation_levels else []
    dataTypingList = ["ANY", "SQL"]

    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuation path %s (%s)", configPath,
                        configName)
        else:
            logger.error("Missing or access issue with config file %r",
                         configPath)
            # sys.exit instead of the site-injected exit() builtin, which is
            # not guaranteed to exist in non-interactive runs.
            sys.exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb",
                                   "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath,
                           defaultSectionName=defaultConfigName,
                           mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)
    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s",
                     configPath, str(e))
        sys.exit(1)
    #
    # Collect the databases requested by individual flags.
    databaseNameList = [
        dbName for flagName, dbName, _ in databaseFlagL
        if getattr(args, flagName)
    ]

    catalogSection = "database_catalog_configuration"

    def _catalogList(itemPrefix, suffix):
        # Fetch one catalog list, e.g. DATABASE_NAMES_DEPLOYED.
        return cfgOb.getList(itemPrefix + suffix, sectionName=catalogSection)

    # Config-driven selections replace the individual flags; when several are
    # given, later options win (deployed < all < test), matching the original
    # evaluation order.
    for flagName, suffix in (("update_config_deployed", "DEPLOYED"),
                             ("update_config_all", "ALL"),
                             ("update_config_test", "TEST")):
        if getattr(args, flagName):
            databaseNameList = _catalogList("DATABASE_NAMES_", suffix)
            dataTypingList = _catalogList("DATATYPING_", suffix)
            validationLevels = _catalogList("VALIDATION_LEVELS_", suffix)
            encodingTypes = _catalogList("ENCODING_TYPES_", suffix)
    #
    scnD = cfgOb.get("document_collection_names",
                     sectionName="document_helper_configuration")
    #
    databaseNameList = list(set(databaseNameList))
    logger.debug("Collections %s", list(scnD.items()))
    logger.debug("databaseNameList %s", databaseNameList)

    if compareOnly:
        schP = SchemaProvider(cfgOb, cachePath, useCache=True)
        # First pass: internal schema definitions per data typing.
        difPathList = []
        for databaseName in databaseNameList:
            for dataTyping in dataTypingList:
                logger.debug("Building schema %s with types %s", databaseName,
                             dataTyping)
                pth = schP.schemaDefCompare(databaseName, dataTyping)
                if pth:
                    difPathList.append(pth)
        if difPathList:
            logger.info("Schema definition difference path list %r",
                        difPathList)
        # Second pass: collection-level JSON schemas per validation level.
        difPathList = []
        for databaseName in databaseNameList:
            dD = schP.makeSchemaDef(databaseName,
                                    dataTyping="ANY",
                                    saveSchema=False)
            sD = SchemaDefAccess(dD)
            for cd in sD.getCollectionInfo():
                collectionName = cd["NAME"]
                for encodingType in encodingTypes:
                    if encodingType.lower() != "json":
                        continue
                    for level in validationLevels:
                        pth = schP.jsonSchemaCompare(databaseName,
                                                     collectionName,
                                                     encodingType, level)
                        if pth:
                            difPathList.append(pth)
        if difPathList:
            logger.info("JSON schema difference path list %r", difPathList)

    else:
        schP = SchemaProvider(cfgOb, cachePath, useCache=False)
        for databaseName in databaseNameList:
            for encodingType in encodingTypes:
                if encodingType == "rcsb":
                    # Internal definitions are built per data-typing flavor.
                    for dataTyping in dataTypingList:
                        logger.info(
                            "Creating schema definition for content type %s data typing %s",
                            databaseName, dataTyping)
                        schP.makeSchemaDef(databaseName,
                                           dataTyping=dataTyping,
                                           saveSchema=True)
                elif databaseName in scnD:
                    # JSON/BSON schemas are built per collection and level.
                    for dD in scnD[databaseName]:
                        collectionName = dD["NAME"]
                        for validationLevel in validationLevels:
                            logger.info(
                                "Creating %r schema for content type %s collection %s",
                                encodingType, databaseName, collectionName)
                            schP.makeSchema(databaseName,
                                            collectionName,
                                            encodingType=encodingType,
                                            level=validationLevel,
                                            saveSchema=True)
# Example 9
class SchemaDefDataPrepTests(unittest.TestCase):
    """Prepare repository entry data as documents conforming to generated schema definitions.

    Covers "simple" preparation (no dynamic methods, slicing, or key injection)
    and "full" preparation (dictionary methods applied, per-collection slicing,
    private key and sub-category aggregate injection), with optional JSON export
    and diff against saved reference output.
    """

    def __init__(self, methodName="runTest"):
        super(SchemaDefDataPrepTests, self).__init__(methodName)
        self.__loadPathList = []
        self.__verbose = True

    def setUp(self):
        """Configure paths, providers, filter options, and the tables of test cases."""
        self.__isMac = platform.system() == "Darwin"
        # Exclude "optional" resource providers on non-mac platforms.
        self.__excludeType = None if self.__isMac else "optional"
        self.__numProc = 2
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__outputPath = os.path.join(HERE, "test-output")
        self.__savedOutputPath = os.path.join(HERE, "test-saved-output")

        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__discoveryMode = self.__cfgOb.get("DISCOVERY_MODE",
                                                sectionName=configName,
                                                default="local")
        # Use a smaller file sample when repository content is discovered remotely.
        self.__fileLimit = 100 if self.__discoveryMode == "local" else 10
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        #
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        # Expected locator counts for the mock repositories (local discovery mode).
        self.__chemCompMockLen = 24
        self.__pdbxMockLen = 30
        # (removes timestamped data items to allow diffs.)
        excludeExtras = ["rcsb_load_status"]
        # excludeExtras = []
        #
        self.__verbose = True
        self.__modulePathMap = self.__cfgOb.get(
            "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        #
        # Export prepared documents to test-output / diff against saved reference output.
        self.__exportFlag = True
        self.__diffFlag = False
        #
        # Each case: content type, expected mock locator count, processing filter,
        # output document style, optional merge content types, and the number of
        # containers expected to be rejected by the data selection criteria.
        self.__simpleTestCaseList = [
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_no_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeCol,
                "styleType": "columnwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 0,
            },
        ]
        #
        self.__fullTestCaseList = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 0,
                "excludeExtras": excludeExtras,
            },
            {
                "contentType": "bird_chem_comp_core",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": None,
                "rejectLength": 2,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__fullTestCaseListA = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 0,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        """Log peak resident memory and elapsed wall-clock time for the test."""
        # ru_maxrss is reported in bytes on macOS but kilobytes on Linux,
        # hence the matching unit-label switch for the fixed 1.0e6 divisor.
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 1.0e6,
                    unitS)
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", self.id(),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def __timeStep(self, msg):
        """Log elapsed time since setUp, labeled with a caller-supplied message."""
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", msg,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def testSimpleSchemaDefDataPrep(self):
        """Run each simple preparation case, relaxing expectations in remote discovery mode."""
        for tcD in self.__simpleTestCaseList:
            # In remote mode the fixed mock counts do not apply; expect no
            # rejections and at most the configured file limit.
            rejectLength = 0 if self.__discoveryMode == "remote" else tcD[
                "rejectLength"]
            mockLength = self.__fileLimit if self.__discoveryMode == "remote" else tcD[
                "mockLength"]
            # NOTE(review): the simple case list currently has no
            # bird_chem_comp_core entries; guard kept for parity with the full test.
            if tcD["contentType"] == "bird_chem_comp_core" and self.__discoveryMode == "remote":
                logger.info("Skipping %r in discovery mode %r",
                            tcD["contentType"], self.__discoveryMode)
                continue
            self.__simpleSchemaDataPrep(
                tcD["contentType"],
                tcD["filterType"],
                tcD["styleType"],
                mockLength,
                rejectLength=rejectLength,
                mergeContentTypes=tcD["mergeContentTypes"])

    def testFullSchemaDefDataPrep(self):
        """Run each full preparation case; bird_chem_comp_core is skipped in remote mode."""
        for tcD in self.__fullTestCaseList:
            rejectLength = 0 if self.__discoveryMode == "remote" else tcD[
                "rejectLength"]
            mockLength = self.__fileLimit if self.__discoveryMode == "remote" else tcD[
                "mockLength"]
            if tcD["contentType"] == "bird_chem_comp_core" and self.__discoveryMode == "remote":
                logger.info("Skipping %r in discovery mode %r",
                            tcD["contentType"], self.__discoveryMode)
                continue
            self.__fullSchemaDataPrep(
                tcD["contentType"],
                tcD["filterType"],
                tcD["styleType"],
                mockLength,
                rejectLength=rejectLength,
                mergeContentTypes=tcD["mergeContentTypes"],
                excludeExtras=tcD["excludeExtras"],
            )

    def __simpleSchemaDataPrep(self,
                               contentType,
                               filterType,
                               styleType,
                               mockLength,
                               rejectLength=0,
                               dataSelectors=None,
                               mergeContentTypes=None):
        """Internal method for preparing file-based data NOT requiring dynamic methods, slicing, or key injection.

        Args:
            contentType (str): Content type name
            filterType (str): List of data processing options (separated by '|') (e.g. "drop-empty-attributes|drop-empty-tables|skip-max-width|...)
            styleType (str): organization of output document (e.g. rowise-by-name)
            mockLength (int): Expected length of the test data for the input content type
            rejectLength (int, optional): number of input data sets rejected by the dataselection criteria. Defaults to 0.
            dataSelectors (list of str, optional): data selection criteria. Defaults to None.
            mergeContentTypes (list of str, optional): list content types to merge with the input data set. Defaults to None. (e.g. ['vrpt'])
        """
        try:
            dataSelectors = dataSelectors if dataSelectors else [
                "PUBLIC_RELEASE"
            ]
            dD = self.__schP.makeSchemaDef(contentType,
                                           dataTyping="ANY",
                                           saveSchema=True)
            # Construction validates the generated schema definition dictionary.
            _ = SchemaDefAccess(dD)
            inputPathList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContentTypes)
            sd, _, _, _ = self.__schP.getSchemaInfo(databaseName=contentType,
                                                    dataTyping="ANY")
            dtf = DataTransformFactory(schemaDefAccessObj=sd,
                                       filterType=filterType)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd,
                                    dtObj=dtf,
                                    workPath=self.__cachePath,
                                    verbose=self.__verbose)
            #

            logger.debug("For %s mock length %d length of path list %d\n",
                         contentType, mockLength, len(inputPathList))
            self.assertGreaterEqual(len(inputPathList), mockLength)
            tableDataDictList, containerNameList, rejectList = sdp.fetchDocuments(
                inputPathList,
                styleType=styleType,
                filterType=filterType,
                dataSelectors=dataSelectors)
            logger.debug(
                "For %s mock length %d reject length %d length of tddl list %d\n",
                contentType, mockLength, rejectLength, len(tableDataDictList))
            self.assertGreaterEqual(len(tableDataDictList),
                                    mockLength - rejectLength)
            self.assertGreaterEqual(len(containerNameList),
                                    mockLength - rejectLength)

            if rejectList:
                logger.debug("For %s rejecting components %r", contentType,
                             rejectList)
            #
            self.assertEqual(len(rejectList), rejectLength)
            fName = "simple-prep-%s-%s.json" % (contentType, styleType)
            if self.__exportFlag:
                # Export the prepared documents for later reference comparisons.
                fPath = os.path.join(self.__outputPath, fName)
                self.__mU.doExport(fPath,
                                   tableDataDictList,
                                   fmt="json",
                                   indent=3)
            if self.__diffFlag:
                # Compare against saved reference output; write a *-diff.json
                # artifact when differences are found.
                fPath = os.path.join(self.__savedOutputPath, fName)
                refDocList = self.__mU.doImport(fPath, fmt="json")
                self.assertEqual(len(refDocList), len(tableDataDictList))
                #
                jD = diff(refDocList,
                          tableDataDictList,
                          syntax="explicit",
                          marshal=True)
                if jD:
                    _, fn = os.path.split(fPath)
                    bn, _ = os.path.splitext(fn)
                    fPath = os.path.join(self.__outputPath, bn + "-diff.json")
                    logger.debug("jsondiff for %s %s = \n%s", contentType,
                                 styleType,
                                 pprint.pformat(jD, indent=3, width=100))
                    self.__mU.doExport(fPath, jD, fmt="json", indent=3)
                self.assertEqual(len(jD), 0)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __logDocumentOrder(self, docList):
        """Debug-log the top-level key order of each prepared document."""
        for doc in docList:
            logger.debug("keys %r", list(doc.keys()))

    def __filterDocuments(self, docList, excludeList=None):
        """Remove the named top-level keys from each document in place."""
        excludeList = excludeList if excludeList else []
        for doc in docList:
            for excl in excludeList:
                if excl in doc:
                    del doc[excl]

    def __fullSchemaDataPrep(self,
                             contentType,
                             filterType,
                             styleType,
                             mockLength,
                             rejectLength=0,
                             dataSelectors=None,
                             mergeContentTypes=None,
                             excludeExtras=None):
        """Internal method for preparing file-based data requiring dynamic methods, slicing, or key injection.

        Args:
            contentType (str): Content type name
            filterType (str): List of data processing options (separated by '|') (e.g. "drop-empty-attributes|drop-empty-tables|skip-max-width|...)
            styleType (str): organization of output document (e.g. rowise-by-name)
            mockLength (int): Expected length of the test data for the input content type
            rejectLength (int, optional): number of input data sets rejected by the dataselection criteria. Defaults to 0.
            dataSelectors (list of str, optional): data selection criteria. Defaults to None.
            mergeContentTypes (list of str, optional): list content types to merge with the input data set. Defaults to None. (e.g. ['vrpt'])
            excludeExtras (list of str, optional): top-level keys removed from each document before export/diff (e.g. timestamped items). Defaults to None.
        """
        try:
            excludeExtras = excludeExtras if excludeExtras else []
            # mockLength/rejectLength accepted for signature parity with the
            # simple preparation path but not asserted here.
            _ = mockLength
            _ = rejectLength
            dD = self.__schP.makeSchemaDef(contentType,
                                           dataTyping="ANY",
                                           saveSchema=True)
            _ = SchemaDefAccess(dD)
            inputPathList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContentTypes)
            sd, _, collectionNameList, _ = self.__schP.getSchemaInfo(
                databaseName=contentType, dataTyping="ANY")
            #
            dP = DictionaryApiProviderWrapper(self.__cachePath,
                                              cfgOb=self.__cfgOb,
                                              configName=self.__configName,
                                              useCache=True)
            dictApi = dP.getApiByName(contentType)
            #
            # Git-based restore enabled, stash restore disabled for cached resources.
            rP = DictMethodResourceProvider(
                self.__cfgOb,
                configName=self.__configName,
                cachePath=self.__cachePath,
                restoreUseStash=False,
                restoreUseGit=True,
                providerTypeExclude=self.__excludeType,
            )
            dmh = DictMethodRunner(dictApi,
                                   modulePathMap=self.__modulePathMap,
                                   resourceProvider=rP)
            #
            dtf = DataTransformFactory(schemaDefAccessObj=sd,
                                       filterType=filterType)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd,
                                    dtObj=dtf,
                                    workPath=self.__cachePath,
                                    verbose=self.__verbose)
            # Apply dictionary methods to each container before document generation.
            containerList = self.__rpP.getContainerList(inputPathList)
            for container in containerList:
                cName = container.getName()
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
            #
            for collectionName in collectionNameList:
                # Per-collection include/exclude and slice settings drive processing.
                tableIdExcludeList = sd.getCollectionExcluded(collectionName)
                tableIdIncludeList = sd.getCollectionSelected(collectionName)
                sliceFilter = sd.getCollectionSliceFilter(collectionName)
                sdp.setSchemaIdExcludeList(tableIdExcludeList)
                sdp.setSchemaIdIncludeList(tableIdIncludeList)
                #
                docList, _, _ = sdp.processDocuments(
                    containerList,
                    styleType=styleType,
                    sliceFilter=sliceFilter,
                    filterType=filterType,
                    dataSelectors=dataSelectors,
                    collectionName=collectionName)

                docList = sdp.addDocumentPrivateAttributes(
                    docList, collectionName)
                docList = sdp.addDocumentSubCategoryAggregates(
                    docList, collectionName)

                # Special exclusions for the test harness. (removes timestamped data items to allow diffs.)
                self.__filterDocuments(docList, excludeExtras)
                mergeS = "-".join(
                    mergeContentTypes) if mergeContentTypes else ""
                fName = "full-prep-%s-%s-%s-%s.json" % (
                    contentType, collectionName, mergeS, styleType)
                if self.__exportFlag:
                    self.__logDocumentOrder(docList)
                    fPath = os.path.join(self.__outputPath, fName)
                    self.__mU.doExport(fPath, docList, fmt="json", indent=3)
                    logger.debug("Exported %r", fPath)
                #
                if self.__diffFlag:
                    # Compare against saved reference output; write a
                    # *-diff.json artifact when differences are found.
                    fPath = os.path.join(self.__savedOutputPath, fName)
                    refDocList = self.__mU.doImport(fPath, fmt="json")
                    self.assertEqual(len(refDocList), len(docList))
                    logger.debug("For %s %s len refDocList %d", contentType,
                                 collectionName, len(refDocList))
                    logger.debug("For %s %s len docList %d", contentType,
                                 collectionName, len(docList))
                    jD = diff(refDocList,
                              docList,
                              syntax="explicit",
                              marshal=True)
                    if jD:
                        _, fn = os.path.split(fPath)
                        bn, _ = os.path.splitext(fn)
                        fPath = os.path.join(self.__outputPath,
                                             bn + "-diff.json")
                        logger.debug("jsondiff for %s %s = \n%s", contentType,
                                     collectionName,
                                     pprint.pformat(jD, indent=3, width=100))
                        self.__mU.doExport(fPath, jD, fmt="json", indent=3)
                    self.assertEqual(len(jD), 0)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
예제 #10
0
class ChemRefDataPrepValidateTests(unittest.TestCase):
    """Validate chemical reference resource documents against generated JSON schemas."""

    def setUp(self):
        """Build the configuration and schema providers used by the validation tests."""
        self.__verbose = True
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=self.__configName, mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        """Log elapsed wall-clock time for the completed test."""
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - self.__startTime)

    def testValidateFull(self):
        """Run full-level schema validation for the DrugBank core collection."""
        self.__validateChemRef("DrugBank", schemaLevel="full")

    def __validateChemRef(self, extResource, schemaLevel="full"):
        """Fetch documents for the named external resource and validate them.

        Returns the total number of schema violations observed.
        """
        eCount = 0
        if extResource == "DrugBank":
            uName = self.__cfgOb.get("_DRUGBANK_AUTH_USERNAME", sectionName=self.__configName)
            passwd = self.__cfgOb.get("_DRUGBANK_AUTH_PASSWORD", sectionName=self.__configName)
            dbP = DrugBankProvider(cachePath=self.__cachePath, useCache=True, username=uName, password=passwd)
            docList = dbP.getDocuments()
            logger.info("Validating %d Drugbank documents", len(docList))
            eCount = self.__validate("drugbank_core", ["drugbank_core"], docList, schemaLevel=schemaLevel)

        return eCount

    def __validate(self, databaseName, collectionNames, dList, schemaLevel="full"):
        """Validate every document in dList against each named collection schema.

        Returns the total error count across all collections and documents.
        """
        totalErrors = 0
        for collectionName in collectionNames:
            self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
            schemaD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True)
            # check_schema() raises when the generated schema itself is malformed.
            Draft4Validator.check_schema(schemaD)
            #
            validator = Draft4Validator(schemaD, format_checker=FormatChecker())
            for docIndex, docD in enumerate(dList):
                logger.debug("Database %s collection %s document %d", databaseName, collectionName, docIndex)
                try:
                    docErrors = 0
                    for error in sorted(validator.iter_errors(docD), key=str):
                        logger.info("database %s collection %s path %s error: %s", databaseName, collectionName, error.path, error.message)
                        logger.info(">>> failing object is %r", docD)
                        totalErrors += 1
                        docErrors += 1
                    #
                    logger.debug("database %s collection %s count %d", databaseName, collectionName, docErrors)
                except Exception as e:
                    logger.exception("Validation error %s", str(e))

        return totalErrors
예제 #11
0
class SqlGenTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        #
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__sdu = SchemaProvider(self.__cfgOb,
                                    self.__cachePath,
                                    useCache=True)
        #

        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def testSQLMethods(self):
        schemaNames = ["pdbx_core"]
        dataTyping = "SQL"
        for schemaName in schemaNames:
            dD = self.__sdu.makeSchemaDef(schemaName,
                                          dataTyping=dataTyping,
                                          saveSchema=False)
            sD = SchemaDefAccess(dD)
            self.__testSchemaCreate(sD)
            self.__testImportExport(sD)
            self.__testSelectionAndConditions(sD)

    #

    def __getHelper(self, modulePath, **kwargs):
        aMod = __import__(modulePath, globals(), locals(), [""])
        sys.modules[modulePath] = aMod
        #
        # Strip off any leading path to the module before we instaniate the object.
        mpL = modulePath.split(".")
        moduleName = mpL[-1]
        #
        aObj = getattr(aMod, moduleName)(**kwargs)
        return aObj

    def __testSchemaCreate(self, sD):
        """Test case -  create table schema using input schema definition as an example
        """

        try:
            tableIdList = sD.getSchemaIdList()
            myAd = SqlGenAdmin(self.__verbose)
            sqlL = []
            for tableId in tableIdList:
                tableDefObj = sD.getSchemaObject(tableId)
                sqlL.extend(
                    myAd.createTableSQL(databaseName=sD.getDatabaseName(),
                                        tableDefObj=tableDefObj))
                logger.debug(
                    "\n\n+SqlGenTests table creation SQL string\n %s\n\n",
                    "\n".join(sqlL))
            self.assertGreaterEqual(len(sqlL), 10)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __testImportExport(self, sD):
        """Test case -  import and export commands --
        """

        try:
            databaseName = sD.getDatabaseName()
            tableIdList = sD.getSchemaIdList()
            myAd = SqlGenAdmin(self.__verbose)
            for tableId in tableIdList:
                tableDefObj = sD.getSchemaObject(tableId)
                exportPath = os.path.join(HERE, "test-output",
                                          tableDefObj.getName() + ".tdd")
                sqlExport = myAd.exportTable(databaseName,
                                             tableDefObj,
                                             exportPath=exportPath)
                logger.debug(
                    "\n\n+SqlGenTests table export SQL string\n %s\n\n",
                    sqlExport)
                sqlImport = myAd.importTable(databaseName,
                                             tableDefObj,
                                             importPath=exportPath)
                logger.debug(
                    "\n\n+SqlGenTests table import SQL string\n %s\n\n",
                    sqlImport)
                self.assertGreaterEqual(len(sqlImport), 100)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __testSelectionAndConditions(self, sD):
        """Test case -  selection everything for a simple condition-
        """
        try:
            # get delete attribute -
            #
            tableIdList = sD.getSchemaIdList()
            logger.debug("TableIdList %r", tableIdList)
            sqlGen = SqlGenQuery(schemaDefObj=sD, verbose=self.__verbose)

            for tableId in tableIdList:
                tableDefObj = sD.getSchemaObject(tableId)
                dAtId = tableDefObj.getDeleteAttributeId()

                if dAtId:
                    sqlCondition = SqlGenCondition(schemaDefObj=sD,
                                                   verbose=self.__verbose)
                    sqlCondition.addValueCondition((tableId, dAtId), "EQ",
                                                   ("D000001", "CHAR"))
                    aIdList = sD.getAttributeIdList(tableId)
                    for aId in aIdList:
                        sqlGen.addSelectAttributeId(attributeTuple=(tableId,
                                                                    aId))
                    sqlGen.setCondition(sqlCondition)
                    sqlGen.addOrderByAttributeId(attributeTuple=(tableId,
                                                                 dAtId))
                    sqlS = sqlGen.getSql()
                    logger.debug(
                        "\n\n+SqlGenTests table creation SQL string\n %s\n\n",
                        sqlS)
                    self.assertGreaterEqual(len(sqlS), 50)
                    sqlGen.clear()
                else:
                    logger.debug("Missing delete atttribe for table %r",
                                 tableId)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
예제 #12
0
class ObjectValidator(object):
    """Utilities to extract and update object from the document object server with validation."""

    def __init__(self, cfgOb, objectAdapter=None, cachePath=".", useCache=True, **kwargs):
        """
        Args:
            cfgOb: configuration object providing connection details
            objectAdapter: optional adapter exposing filter(obj) -> (bool, obj)
            cachePath (str): directory for cached schema artifacts
            useCache (bool): reuse cached schema artifacts when available
        """
        self.__cfgOb = cfgOb
        self.__oAdapt = objectAdapter
        self.__resourceName = "MONGO_DB"
        _ = kwargs
        # Accumulates status records produced by __updateStatus().
        self.__statusList = []
        self.__schP = SchemaProvider(self.__cfgOb, cachePath, useCache=useCache)
        # JSON schema validator instance, built lazily in __transform().
        self.__valInst = None

    def __getValidator(self, databaseName, collectionName, schemaLevel="full"):
        """Build and return a Draft4 JSON schema validator for the input collection."""
        _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
        cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True)
        # Raises exceptions for schema compliance.
        Draft4Validator.check_schema(cD)
        valInst = Draft4Validator(cD, format_checker=FormatChecker())
        return valInst

    def __validateObj(self, databaseName, collectionName, rObj, label=""):
        """Validate rObj against the current schema validator and log each error.

        Returns:
            int: number of validation errors encountered
        """
        eCount = 0
        try:
            tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
            for error in sorted(self.__valInst.iter_errors(rObj), key=str):
                logger.info(
                    "Database %s collection %s (%s %r) path %s error: %s",
                    databaseName, collectionName, label, tId, error.path,
                    error.message)
                logger.debug(">>> Failing object is %r", rObj)
                eCount += 1
        except Exception as e:
            logger.exception("Validation failing %s", str(e))

        return eCount

    def doTransform(self, **kwargs):
        """Select documents, transform them via the object adapter with validation,
        replace them in the store, and record an exchange status entry.

        Keyword Args:
            databaseName (str): target database (default "pdbx_core")
            collectionName (str): target collection (default "pdbx_core_entry")
            selectionQuery (dict): Mongo selection query (default {})
            fetchLimit (int|None): optional cap on number of documents processed
            updateId (str): update signature (default: current week signature)

        Returns:
            bool: True for full success
        """
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        #
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("selectionQuery", {})
        fetchLimit = kwargs.get("fetchLimit", None)
        #
        tU = TimeUtil()
        updateId = kwargs.get("updateId", tU.getCurrentWeekSignature())
        #
        docSelectList = self.__selectObjectIds(databaseName, collectionName, selectionQueryD)
        docSelectList = docSelectList[:fetchLimit] if fetchLimit else docSelectList

        ok = self.__transform(databaseName, collectionName, docSelectList)
        #
        # Fix: okS was previously unbound when updateId evaluated false,
        # making the return statement raise NameError.
        okS = True
        if updateId:
            okS = self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
        return ok and okS

    def __selectObjectIds(self, databaseName, collectionName, selectionQueryD):
        """Return a list of object identifiers for the input selection query."""
        # Fix: dL was previously unbound (NameError at return) when the
        # collection did not exist or the connection raised before fetch.
        dL = []
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName,
                                collectionName,
                                mg.count(databaseName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    selectL = ["_id"]
                    dL = mg.fetch(databaseName, collectionName, selectL, queryD=qD)
                    logger.info("Selection %r fetch result count %d", selectL, len(dL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dL

    def __transform(self, databaseName, collectionName, docSelectList, logIncrement=100):
        """Fetch each selected document, apply adapter filtering with before/after
        validation, and replace the stored document.

        Returns:
            bool: True if every replacement succeeded
        """
        ok = True
        try:
            self.__valInst = self.__getValidator(databaseName, collectionName, schemaLevel="full")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    numDoc = len(docSelectList)
                    for ii, dD in enumerate(docSelectList, 1):
                        if "_id" not in dD:
                            continue
                        rObj = mg.fetchOne(databaseName, collectionName, "_id", dD["_id"])
                        # Fix: guard against a failed fetch returning None
                        # before dereferencing rObj.
                        if rObj is None:
                            logger.error("%r %r fetch failed for %r", databaseName, collectionName, dD["_id"])
                            ok = False
                            continue
                        del rObj["_id"]
                        #
                        fOk = True

                        if self.__oAdapt:
                            self.__validateObj(databaseName, collectionName, rObj, label="Original")
                            fOk, rObj = self.__oAdapt.filter(rObj)
                            self.__validateObj(databaseName, collectionName, rObj, label="Updated")
                        if fOk:
                            rOk = mg.replace(databaseName, collectionName, rObj, dD)
                            if rOk is None:
                                tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
                                logger.error("%r %r (%r) failing", databaseName, collectionName, tId)
                                # logger.info("rObj.keys() %r", list(rObj.keys()))
                                # logger.info("rObj.items() %s", rObj.items())
                                rOk = False
                            ok = ok and rOk
                        #
                        if ii % logIncrement == 0 or ii == numDoc:
                            logger.info("Replace status %r object (%d of %d)", ok, ii, numDoc)
                        #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def getLoadStatus(self):
        """Return the accumulated list of load status records."""
        return self.__statusList

    def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
        """Append a data exchange status record for this update.

        Returns:
            bool: True on success
        """
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False