Example #1
class SchemaDefLoaderDbTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(SchemaDefLoaderDbTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        self.__verbose = True
        #
        fileLimit = 100
        numProc = 2
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__workPath = os.path.join(HERE, "test-output")
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__resourceName = "MYSQL_DB"
        #
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=numProc,
                                        fileLimit=fileLimit,
                                        cachePath=self.__cachePath)
        #
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def __schemaCreate(self, schemaDefObj):
        """Create table schema using schema definition"""
        try:
            tableIdList = schemaDefObj.getSchemaIdList()
            sqlGen = SqlGenAdmin(self.__verbose)
            sqlL = sqlGen.createDatabaseSQL(schemaDefObj.getDatabaseName())
            for tableId in tableIdList:
                tableDefObj = schemaDefObj.getSchemaObject(tableId)
                sqlL.extend(
                    sqlGen.createTableSQL(
                        databaseName=schemaDefObj.getDatabaseName(),
                        tableDefObj=tableDefObj))

            logger.debug("Schema creation SQL string\n %s\n\n",
                         "\n".join(sqlL))
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                myQ = MyDbQuery(dbcon=client, verbose=self.__verbose)
                #
                # Permit warnings to support "drop table if exists" for missing tables.
                #
                myQ.setWarning("ignore")
                ret = myQ.sqlCommand(sqlCommandList=sqlL)
                logger.debug("\n\n+INFO mysql server returns %r\n", ret)
                self.assertTrue(ret)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    # ------------------------------------------------------------------------------
    def testSchemaCreate(self):
        """Create table schema for BIRD, chemical component, and PDBx data."""
        cD = self.__schP.makeSchemaDef("bird",
                                       dataTyping="SQL",
                                       saveSchema=True)
        sd = SchemaDefAccess(cD)
        self.__schemaCreate(sd)
        #
        cD = self.__schP.makeSchemaDef("chem_comp",
                                       dataTyping="SQL",
                                       saveSchema=True)
        sd = SchemaDefAccess(cD)
        self.__schemaCreate(sd)
        #
        # cD = self.__schP.makeSchemaDef("pdbx", dataTyping="SQL", saveSchema=True)
        # sd = SchemaDefAccess(cD)
        self.__schemaCreate(sd)

    def testLoadBirdReference(self):
        try:
            cD = self.__schP.makeSchemaDef("bird",
                                           dataTyping="SQL",
                                           saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)

            inputPathList = self.__rpP.getLocatorObjList(contentType="bird")
            inputPathList.extend(
                self.__rpP.getLocatorObjList(contentType="bird_family"))
            #
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-file")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReLoadBirdReference(self):
        try:
            cD = self.__schP.makeSchemaDef("bird",
                                           dataTyping="SQL",
                                           saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)

            inputPathList = self.__rpP.getLocatorObjList(contentType="bird")
            inputPathList.extend(
                self.__rpP.getLocatorObjList(contentType="bird_family"))
            #
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                sdl.load(inputPathList=inputPathList, loadType="batch-file")
                #
                logger.debug("Batch file reload test (deleteOpt='all')")
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-file",
                              deleteOpt="all")
                self.assertTrue(ok)
                #
                logger.debug("Batch file reload test (deleteOpt='selected')")
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-file",
                              deleteOpt="selected")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadChemCompReference(self):
        try:
            cD = self.__schP.makeSchemaDef("chem_comp",
                                           dataTyping="SQL",
                                           saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)

            inputPathList = self.__rpP.getLocatorObjList(
                contentType="chem_comp")
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-file")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skip("Disable test - schema not optimized for mysql limitations")
    def testLoadPdbxFiles(self):
        try:
            cD = self.__schP.makeSchemaDef("pdbx",
                                           dataTyping="SQL",
                                           saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)

            inputPathList = self.__rpP.getLocatorObjList(contentType="pdbx")
            logger.debug("Input path list %r", inputPathList)
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-insert",
                              deleteOpt="all")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
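

# A minimal sketch (not part of the original test class) of how these tests are
# typically collected into an ordered suite: schema creation must run before the
# load tests, so an explicit TestSuite is used instead of unittest's default
# alphabetical method discovery.
def schemaLoadSuite():
    suiteSelect = unittest.TestSuite()
    suiteSelect.addTest(SchemaDefLoaderDbTests("testSchemaCreate"))
    suiteSelect.addTest(SchemaDefLoaderDbTests("testLoadBirdReference"))
    suiteSelect.addTest(SchemaDefLoaderDbTests("testReLoadBirdReference"))
    suiteSelect.addTest(SchemaDefLoaderDbTests("testLoadChemCompReference"))
    return suiteSelect


if __name__ == "__main__":
    unittest.TextTestRunner(verbosity=2).run(schemaLoadSuite())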
Example #2
class SchemaDefLoaderCrateDbMultiTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(SchemaDefLoaderCrateDbMultiTests, self).__init__(methodName)
        self.__verbose = True
        self.__createFlag = True

    def setUp(self):
        self.__verbose = True
        self.__numProc = 2
        self.__fileLimit = 100
        self.__chunkSize = 0
        self.__workPath = os.path.join(HERE, "test-output")
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName)
        self.__resourceName = "CRATE_DB"
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__workPath,
                                     useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__workPath)
        #
        #
        self.__tableIdSkipD = {
            "ATOM_SITE": True,
            "ATOM_SITE_ANISOTROP": True,
            "__LOAD_STATUS__": True
        }
        self.__ioObj = IoAdapter(verbose=self.__verbose)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testConnection(self):
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                self.assertNotEqual(client, None)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testSchemaCreate(self):
        """Create table schema (live) for BIRD, chemical component, and PDBx data."""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testSchemaRemove(self):
        """Remove table schema (live) for BIRD, chemical component, and PDBx data."""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadChemCompMulti(self):
        self.__testLoadFilesMulti("chem_comp")

    def testLoadBirdMulti(self):
        self.__testLoadFilesMulti("bird")

    def testLoadPdbxMulti(self):
        self.__testLoadFilesMulti("pdbx")

    def __getPathList(self, fType):
        pathList = []
        if fType == "chem_comp":
            pathList = self.__rpP.getLocatorObjList("chem_comp")
        elif fType == "bird":
            pathList = self.__rpP.getLocatorObjList("bird")
            pathList.extend(self.__rpP.getLocatorObjList("bird_family"))
        elif fType == "pdbx":
            pathList = self.__rpP.getLocatorObjList("pdbx")
        return pathList

    def loadInsertMany(self, dataList, procName, optionsD, workingDir):
        try:
            _ = workingDir
            ret = None
            sd = optionsD["sd"]
            skipD = optionsD["skip"]
            ioObj = IoAdapter(verbose=self.__verbose)
            logger.debug("%s pathlist %r", procName, dataList)
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = CrateDbLoader(schemaDefObj=sd,
                                    ioObj=ioObj,
                                    dbCon=client,
                                    workPath=self.__workPath,
                                    cleanUp=False,
                                    warnings="default",
                                    verbose=self.__verbose)
                ret = sdl.load(inputPathList=dataList,
                               loadType="crate-insert-many",
                               deleteOpt="selected",
                               tableIdSkipD=skipD)
            # all or nothing here
            if ret:
                return dataList, dataList, []
            else:
                return [], [], []
        except Exception as e:
            logger.info("Failing with dataList %r", dataList)
            logger.exception("Failing with %s", str(e))

        return [], [], []

    def __testLoadFilesMulti(self, contentType):
        """Test case - create load w/insert-many all chemical component definition data files - (multiproc test)"""
        numProc = self.__numProc
        chunkSize = self.__chunkSize
        try:
            #
            sd, _, _, _ = self.__schP.getSchemaInfo(contentType)
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)

            optD = {}
            optD["sd"] = sd
            if contentType == "pdbx":
                optD["skip"] = self.__tableIdSkipD
            else:
                optD["skip"] = {}

            #
            pathList = self.__getPathList(fType=contentType)
            logger.debug("Input path list %r", pathList)
            mpu = MultiProcUtil(verbose=True)
            mpu.setOptions(optionsD=optD)
            mpu.set(workerObj=self, workerMethod="loadInsertMany")
            ok, _, _, _ = mpu.runMulti(dataList=pathList,
                                       numProc=numProc,
                                       numResults=1,
                                       chunkSize=chunkSize)
            self.assertEqual(ok, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __schemaCreate(self, schemaDefObj):
        """Test case -  create table schema using schema definition"""
        ret = 0
        try:
            tableIdList = schemaDefObj.getTableIdList()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="cratedb")
            sqlL = []
            for tableId in tableIdList:
                if tableId in self.__tableIdSkipD:
                    continue
                tableDefObj = schemaDefObj.getTable(tableId)
                sqlL.extend(
                    sqlGen.createTableSQL(
                        databaseName=schemaDefObj.getVersionedDatabaseName(),
                        tableDefObj=tableDefObj))

            logger.debug("Schema creation SQL string\n %s\n\n",
                         "\n".join(sqlL))
            logger.info("Creating schema using database %s",
                        schemaDefObj.getVersionedDatabaseName())
            #
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                crQ = CrateDbQuery(dbcon=client, verbose=self.__verbose)
                ret = crQ.sqlCommandList(sqlCommandList=sqlL)
                logger.debug("Schema create command returns %r\n", ret)
            return ret
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __schemaRemove(self, schemaDefObj):
        """Test case -  remove table schema using schema definition"""
        ret = 0
        try:
            tableIdList = schemaDefObj.getTableIdList()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="cratedb")
            sqlL = []
            for tableId in tableIdList:
                if tableId in self.__tableIdSkipD:
                    continue
                tableDefObj = schemaDefObj.getTable(tableId)
                sqlL.extend(
                    sqlGen.dropTableSQL(
                        databaseName=schemaDefObj.getVersionedDatabaseName(),
                        tableDefObj=tableDefObj))
                sqlL.extend(
                    sqlGen.dropTableSQL(
                        databaseName=schemaDefObj.getDatabaseName(),
                        tableDefObj=tableDefObj))

            logger.debug("Schema Remove SQL string\n %s", "\n".join(sqlL))
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                crQ = CrateDbQuery(dbcon=client, verbose=self.__verbose)
                ret = crQ.sqlCommandList(sqlCommandList=sqlL)
                logger.debug("Schema remove command returns %r\n", ret)
            return ret
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
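

# A minimal, self-contained sketch of the MultiProcUtil worker contract that
# loadInsertMany above satisfies. The import path is an assumption based on the
# rcsb.utils.multiproc package; the worker receives a chunk of the input list
# plus the shared optionsD and returns a (successList, resultList,
# diagnosticList) triple.
from rcsb.utils.multiproc.MultiProcUtil import MultiProcUtil


class EchoWorker(object):
    def echo(self, dataList, procName, optionsD, workingDir):
        _ = procName, optionsD, workingDir
        # Report every input item as successfully processed.
        return dataList, dataList, []


if __name__ == "__main__":
    mpu = MultiProcUtil(verbose=True)
    mpu.set(workerObj=EchoWorker(), workerMethod="echo")
    ok, _, resultLists, _ = mpu.runMulti(dataList=list(range(10)), numProc=2, numResults=1, chunkSize=5)
    print(ok, resultLists)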
Example #3
class SchemaDataPrepValidateTests(unittest.TestCase):
    def setUp(self):
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        self.__numProc = 2
        # self.__fileLimit = None
        self.__fileLimit = 20

        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                         "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=self.__configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)

        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        #
        self.__birdRepoPath = self.__cfgOb.getPath("BIRD_REPO_PATH",
                                                   sectionName=configName)
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__verbose = False
        #
        self.__modulePathMap = self.__cfgOb.get(
            "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        self.__testDirPath = os.path.join(HERE, "test-output", "pdbx-fails")
        self.__testIhmDirPath = os.path.join(HERE, "test-output", "ihm-files")
        self.__export = True
        #
        self.__extraOpts = None
        # The following for extended parent/child info -
        # self.__extraOpts = 'addParentRefs|addPrimaryKey'
        #
        self.__alldatabaseNameD = {
            "ihm_dev": ["ihm_dev"],
            "pdbx": ["pdbx", "pdbx_ext"],
            "pdbx_core": [
                "pdbx_core_entity",
                "pdbx_core_entry",
                "pdbx_core_assembly",
                "pdbx_core_polymer_entity_instance",
                "pdbx_core_nonpolymer_entity_instance",
                "pdbx_core_branched_entity_instance",
                "pdbx_core_polymer_entity_instance",
                "pdbx_core_nonpolymer_entity_instance",
                "pdbx_core_branched_entity_instance",
            ],
            "bird": ["bird"],
            "bird_family": ["family"],
            "chem_comp": ["chem_comp"],
            "bird_chem_comp": ["bird_chem_comp"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }

        self.__databaseNameD = {
            "bird_chem_comp_core": ["bird_chem_comp_core"],
            "pdbx_core": [
                "pdbx_core_polymer_entity_instance",
                "pdbx_core_polymer_entity",
                "pdbx_core_entry",
                "pdbx_core_assembly",
                "pdbx_core_nonpolymer_entity",
                "pdbx_core_branched_entity",
                "pdbx_core_polymer_entity_instance",
                "pdbx_core_branched_entity_instance",
            ],
        }
        self.__databaseNameModelD = {
            "pdbx_comp_model_core": [
                "pdbx_comp_model_core_entry",
                "pdbx_comp_model_core_assembly",
                "pdbx_comp_model_core_polymer_entity",
                "pdbx_comp_model_core_polymer_entity_instance",
                "pdbx_comp_model_core_nonpolymer_entity",
                "pdbx_comp_model_core_branched_entity",
                "pdbx_comp_model_core_branched_entity_instance",
            ],
        }
        self.__mergeContentTypeD = {"pdbx_core": ["vrpt"]}
        # self.__databaseNameD = {"chem_comp_core": ["chem_comp_core"], "bird_chem_comp_core": ["bird_chem_comp_core"]}
        # self.__databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_instance_validation"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_monomer"]}
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def __modelFixture(self):
        fU = FileUtil()
        modelSourcePath = os.path.join(self.__mockTopPath, "AF")
        for iPath in glob.iglob(os.path.join(modelSourcePath, "*.cif.gz")):
            fn = os.path.basename(iPath)
            uId = fn.split("-")[1]
            h3 = uId[-2:]
            h2 = uId[-4:-2]
            h1 = uId[-6:-4]
            oPath = os.path.join(self.__cachePath, "computed-models", h1, h2,
                                 h3, fn)
            fU.put(iPath, oPath)
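        # Worked example of the bucketing above, using a hypothetical file name:
        # fn = "AF-Q9XYZ1-F1-model_v4.cif.gz" yields uId = "Q9XYZ1", so
        # h1, h2, h3 = "Q9", "XY", "Z1", and the model is cached under
        # computed-models/Q9/XY/Z1/AF-Q9XYZ1-F1-model_v4.cif.gz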

    def testValidateOptsRepo(self):
        # schemaLevel = "min"
        schemaLevel = "full"
        inputPathList = None
        eCount = self.__testValidateOpts(
            databaseNameD=self.__databaseNameD,
            inputPathList=inputPathList,
            schemaLevel=schemaLevel,
            mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d",
                    schemaLevel, eCount)
        # expected errors
        # pdbx_core_entry (3JWB) path deque(['reflns_shell', 0, 'Rmerge_I_obs']) error: 33.9 is greater than or equal to the maximum of 10.0
        self.assertLessEqual(eCount, 2)

    def testValidateModels(self):
        self.__modelFixture()
        schemaLevel = "full"
        inputPathList = None
        eCount = self.__testValidateOpts(
            databaseNameD=self.__databaseNameModelD,
            inputPathList=inputPathList,
            schemaLevel=schemaLevel,
            mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d",
                    schemaLevel, eCount)
        self.assertLessEqual(eCount, 0)

    @unittest.skip("Disable troubleshooting test")
    def testValidateOptsList(self):
        schemaLevel = "min"
        inputPathList = self.__mU.doImport(
            os.path.join(HERE, "test-output", "failed-path.list"), "list")
        # inputPathList = glob.glob(self.__testDirPath + "/*.cif")
        if not inputPathList:
            return
        databaseNameD = {
            "pdbx_core": [
                "pdbx_core_entity", "pdbx_core_entry",
                "pdbx_core_entity_instance",
                "pdbx_core_entity_instance_validation"
            ]
        }
        for ii, subList in enumerate(chunkList(inputPathList[::-1], 40)):
            if ii < 5:
                continue
            eCount = self.__testValidateOpts(
                databaseNameD=databaseNameD,
                inputPathList=subList,
                schemaLevel=schemaLevel,
                mergeContentTypeD=self.__mergeContentTypeD)
            logger.info(
                "Chunk %d total validation errors schema level %s : %d", ii,
                schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)

    @unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmRepo(self):
        schemaLevel = "min"
        inputPathList = None
        self.__export = False

        databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(
            databaseNameD=databaseNameD,
            inputPathList=inputPathList,
            schemaLevel=schemaLevel,
            mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d",
                    schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)
        #

    @unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmList(self):
        schemaLevel = "full"

        inputPathList = glob.glob(self.__testIhmDirPath + "/*.cif")
        if not inputPathList:
            return
        databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        eCount = self.__testValidateOpts(
            databaseNameD=databaseNameD,
            inputPathList=inputPathList,
            schemaLevel=schemaLevel,
            mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d",
                    schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)
        #

    def __testValidateOpts(self,
                           databaseNameD,
                           inputPathList=None,
                           schemaLevel="full",
                           mergeContentTypeD=None):
        #
        eCount = 0
        for databaseName in databaseNameD:
            mergeContentTypes = mergeContentTypeD.get(databaseName) if mergeContentTypeD else None
            _ = self.__schP.makeSchemaDef(databaseName,
                                          dataTyping="ANY",
                                          saveSchema=True)
            pthList = inputPathList if inputPathList else self.__rpP.getLocatorObjList(
                databaseName, mergeContentTypes=mergeContentTypes)
            for collectionName in databaseNameD[databaseName]:
                cD = self.__schP.makeSchema(databaseName,
                                            collectionName,
                                            encodingType="JSON",
                                            level=schemaLevel,
                                            saveSchema=True,
                                            extraOpts=None)
                #
                dL, cnL = self.__testPrepDocumentsFromContainers(
                    pthList,
                    databaseName,
                    collectionName,
                    styleType="rowwise_by_name_with_cardinality",
                    mergeContentTypes=mergeContentTypes)
                # Raises exceptions for schema compliance.
                try:
                    Draft4Validator.check_schema(cD)
                except Exception as e:
                    logger.error("%s %s schema validation fails with %s",
                                 databaseName, collectionName, str(e))
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                logger.info("Validating %d documents from %s %s", len(dL),
                            databaseName, collectionName)
                for ii, dD in enumerate(dL):
                    logger.debug("Schema %s collection %s document %d",
                                 databaseName, collectionName, ii)
                    try:
                        cCount = 0
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info(
                                "schema %s collection %s (%s) path %s error: %s",
                                databaseName, collectionName, cnL[ii],
                                error.path, error.message)
                            logger.debug("Failing document %d : %r", ii,
                                         list(dD.items()))
                            eCount += 1
                            cCount += 1
                        if cCount > 0:
                            logger.info(
                                "schema %s collection %s container %s error count %d",
                                databaseName, collectionName, cnL[ii], cCount)
                    except Exception as e:
                        logger.exception("Validation processing error %s",
                                         str(e))

        return eCount

    def __testPrepDocumentsFromContainers(
            self,
            inputPathList,
            databaseName,
            collectionName,
            styleType="rowwise_by_name_with_cardinality",
            mergeContentTypes=None):
        """Test case -  create loadable PDBx data from repository files"""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo(databaseName)
            #
            dP = DictionaryApiProviderWrapper(self.__cachePath,
                                              cfgOb=self.__cfgOb,
                                              configName=self.__configName,
                                              useCache=True)
            dictApi = dP.getApiByName(databaseName)
            rP = DictMethodResourceProvider(
                self.__cfgOb,
                configName=self.__configName,
                cachePath=self.__cachePath,
                restoreUseStash=False,
                restoreUseGit=True,
                providerTypeExclude=self.__excludeType,
            )
            dmh = DictMethodRunner(dictApi,
                                   modulePathMap=self.__modulePathMap,
                                   resourceProvider=rP)
            #
            dtf = DataTransformFactory(schemaDefAccessObj=sd,
                                       filterType=self.__fTypeRow)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd,
                                    dtObj=dtf,
                                    workPath=self.__cachePath,
                                    verbose=self.__verbose)
            containerList = self.__rpP.getContainerList(inputPathList)
            for container in containerList:
                cName = container.getName()
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output",
                                            cName + "-with-method.cif")
                    self.__mU.doExport(savePath, [container], fmt="mmcif")
            #
            tableIdExcludeList = sd.getCollectionExcluded(collectionName)
            tableIdIncludeList = sd.getCollectionSelected(collectionName)
            sliceFilter = sd.getCollectionSliceFilter(collectionName)
            sdp.setSchemaIdExcludeList(tableIdExcludeList)
            sdp.setSchemaIdIncludeList(tableIdIncludeList)
            #
            logger.debug("%s (%r) exclude list %r", collectionName,
                         sliceFilter, tableIdExcludeList)
            logger.debug("%s (%r) include list %r", collectionName,
                         sliceFilter, tableIdIncludeList)
            docList, containerNameList, _ = sdp.processDocuments(
                containerList,
                styleType=styleType,
                filterType=self.__fTypeRow,
                dataSelectors=["PUBLIC_RELEASE"],
                sliceFilter=sliceFilter,
                collectionName=collectionName)

            docList = sdp.addDocumentPrivateAttributes(docList, collectionName)
            docList = sdp.addDocumentSubCategoryAggregates(
                docList, collectionName)
            #
            mergeS = "-".join(mergeContentTypes) if mergeContentTypes else ""
            if self.__export and docList:
                fp = os.path.join(
                    HERE, "test-output", "prep-%s-%s-%s.json" %
                    (databaseName, collectionName, mergeS))
                self.__mU.doExport(fp, docList, fmt="json", indent=3)
                logger.debug("Exported %r", fp)
            #
            return docList, containerNameList

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
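

# A short, self-contained sketch of the jsonschema validation pattern used in
# __testValidateOpts above (standard jsonschema package API; the schema and
# document below are illustrative only, echoing the expected Rmerge_I_obs
# range error noted in testValidateOptsRepo):
from jsonschema import Draft4Validator, FormatChecker


def countValidationErrors(schemaD, documentD):
    # Raises an exception if the schema itself is malformed (cf. check_schema above).
    Draft4Validator.check_schema(schemaD)
    valInfo = Draft4Validator(schemaD, format_checker=FormatChecker())
    eCount = 0
    for error in sorted(valInfo.iter_errors(documentD), key=str):
        print("path %s error: %s" % (list(error.path), error.message))
        eCount += 1
    return eCount


if __name__ == "__main__":
    sD = {"type": "object", "properties": {"Rmerge_I_obs": {"type": "number", "maximum": 10.0}}}
    print(countValidationErrors(sD, {"Rmerge_I_obs": 33.9}))  # expect 1 error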
Example #4
class CockroachDbLoaderCockroachDbTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(CockroachDbLoaderCockroachDbTests, self).__init__(methodName)
        self.__verbose = True
        self.__createFlag = False

    def setUp(self):
        self.__verbose = True
        self.__numProc = 2
        self.__fileLimit = 100
        self.__workPath = os.path.join(HERE, "test-output")
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName)
        self.__resourceName = "COCKROACH_DB"
        self.__schP = SchemaProvider(self.__cfgOb, self.__workPath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__workPath)
        #
        self.__tableIdSkipD = {"ATOM_SITE": True, "ATOM_SITE_ANISOTROP": True}
        self.__ioObj = IoAdapter(verbose=self.__verbose)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testConnection(self):
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                self.assertNotEqual(client, None)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testSchemaCreate(self):
        """Create table schema (live) for BIRD, chemical component, and PDBx data."""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testSchemaRemove(self):
        """Remove table schema (live) for BIRD, chemical component, and PDBx data."""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertBirdReference(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("bird")
            inputPathList.extend(self.__rpP.getLocatorObjList("bird_family"))
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert", deleteOpt="selected")
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertManyBirdReference(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("bird")
            inputPathList.extend(self.__rpP.getLocatorObjList("bird_family"))
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert-many", deleteOpt="selected")
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertChemCompReference(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("chem_comp")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert", deleteOpt="selected")
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertManyChemCompReference(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("chem_comp")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert-many", deleteOpt="selected")
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertPdbxExampleFiles(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("pdbx")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert", deleteOpt="selected", tableIdSkipD=self.__tableIdSkipD)
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertManyPdbxExampleFiles(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("pdbx")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert-many", deleteOpt="selected", tableIdSkipD=self.__tableIdSkipD)
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __schemaCreateSQL(self, schemaDefObj):
        """Test case -  create table schema using schema definition"""
        sqlL = []
        try:
            tableIdList = schemaDefObj.getTableIdList()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="CockroachDb")
            dbName = schemaDefObj.getVersionedDatabaseName()
            sqlL = sqlGen.createDatabaseSQL(dbName)
            for tableId in tableIdList:
                tableDefObj = schemaDefObj.getTable(tableId)
                sqlL.extend(sqlGen.createTableSQL(databaseName=schemaDefObj.getVersionedDatabaseName(), tableDefObj=tableDefObj))
            logger.debug("\nSchema creation SQL string\n %s\n\n", "\n".join(sqlL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
        return sqlL

    def __schemaCreate(self, schemaDefObj):
        """Test case -  create table schema using schema definition"""
        ret = 0
        try:
            tableIdList = schemaDefObj.getTableIdList()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="CockroachDb")
            dbName = schemaDefObj.getVersionedDatabaseName()
            sqlL = sqlGen.createDatabaseSQL(dbName)
            for tableId in tableIdList:
                tableDefObj = schemaDefObj.getTable(tableId)
                sqlL.extend(sqlGen.createTableSQL(databaseName=schemaDefObj.getVersionedDatabaseName(), tableDefObj=tableDefObj))

            logger.debug("\nSchema creation SQL string\n %s\n\n", "\n".join(sqlL))
            logger.info("Creating schema using database %s", schemaDefObj.getVersionedDatabaseName())
            #
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                crQ = CockroachDbQuery(dbcon=client, verbose=self.__verbose)
                ret = crQ.sqlCommandList(sqlCommandList=sqlL)
                # ret = crQ.sqlCommand(' '.join(sqlL))
                logger.info("Schema create command returns %r\n", ret)
            return ret
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __schemaRemove(self, schemaDefObj):
        """Test case -  remove table schema using schema definition"""
        ret = 0
        try:
            dbName = schemaDefObj.getVersionedDatabaseName()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="CockroachDb")
            sqlL = sqlGen.removeDatabaseSQL(dbName)
            logger.debug("Schema Remove SQL string\n %s", "\n".join(sqlL))
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                crQ = CockroachDbQuery(dbcon=client, verbose=self.__verbose)
                ret = crQ.sqlCommandList(sqlCommandList=sqlL)
                # ret = crQ.sqlCommand(' '.join(sqlL))
                logger.debug("Schema remove command returns %r\n", ret)
            return ret
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
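

# A minimal sketch (not part of the original test class) of an ordered suite
# for these live-database tests: the schema must exist before the insert tests
# run and is removed at the end, so the methods are queued explicitly.
def cockroachLoadSuite():
    suiteSelect = unittest.TestSuite()
    suiteSelect.addTest(CockroachDbLoaderCockroachDbTests("testConnection"))
    suiteSelect.addTest(CockroachDbLoaderCockroachDbTests("testSchemaCreate"))
    suiteSelect.addTest(CockroachDbLoaderCockroachDbTests("testLoadInsertBirdReference"))
    suiteSelect.addTest(CockroachDbLoaderCockroachDbTests("testSchemaRemove"))
    return suiteSelect


if __name__ == "__main__":
    unittest.TextTestRunner(verbosity=2).run(cockroachLoadSuite())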
Example #5
class SchemaDefDataPrepTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(SchemaDefDataPrepTests, self).__init__(methodName)
        self.__loadPathList = []
        self.__verbose = True

    def setUp(self):
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        self.__numProc = 2
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__outputPath = os.path.join(HERE, "test-output")
        self.__savedOutputPath = os.path.join(HERE, "test-saved-output")

        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__discoveryMode = self.__cfgOb.get("DISCOVERY_MODE",
                                                sectionName=configName,
                                                default="local")
        self.__fileLimit = 100 if self.__discoveryMode == "local" else 10
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        #
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__chemCompMockLen = 24
        self.__pdbxMockLen = 30
        # Remove timestamped data items to allow diffs.
        excludeExtras = ["rcsb_load_status"]
        # excludeExtras = []
        #
        self.__verbose = True
        self.__modulePathMap = self.__cfgOb.get(
            "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        #
        self.__exportFlag = True
        self.__diffFlag = False
        #
        self.__simpleTestCaseList = [
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_no_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeCol,
                "styleType": "columnwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 0,
            },
        ]
        #
        self.__fullTestCaseList = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 0,
                "excludeExtras": excludeExtras,
            },
            {
                "contentType": "bird_chem_comp_core",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": None,
                "rejectLength": 2,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__fullTestCaseListA = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 0,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 1.0e6,
                    unitS)
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", self.id(),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def __timeStep(self, msg):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", msg,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def testSimpleSchemaDefDataPrep(self):
        for tcD in self.__simpleTestCaseList:
            rejectLength = 0 if self.__discoveryMode == "remote" else tcD["rejectLength"]
            mockLength = self.__fileLimit if self.__discoveryMode == "remote" else tcD["mockLength"]
            if tcD["contentType"] == "bird_chem_comp_core" and self.__discoveryMode == "remote":
                logger.info("Skipping %r in discovery mode %r",
                            tcD["contentType"], self.__discoveryMode)
                continue
            self.__simpleSchemaDataPrep(
                tcD["contentType"],
                tcD["filterType"],
                tcD["styleType"],
                mockLength,
                rejectLength=rejectLength,
                mergeContentTypes=tcD["mergeContentTypes"])

    def testFullSchemaDefDataPrep(self):
        for tcD in self.__fullTestCaseList:
            rejectLength = 0 if self.__discoveryMode == "remote" else tcD["rejectLength"]
            mockLength = self.__fileLimit if self.__discoveryMode == "remote" else tcD["mockLength"]
            if tcD["contentType"] == "bird_chem_comp_core" and self.__discoveryMode == "remote":
                logger.info("Skipping %r in discovery mode %r",
                            tcD["contentType"], self.__discoveryMode)
                continue
            self.__fullSchemaDataPrep(
                tcD["contentType"],
                tcD["filterType"],
                tcD["styleType"],
                mockLength,
                rejectLength=rejectLength,
                mergeContentTypes=tcD["mergeContentTypes"],
                excludeExtras=tcD["excludeExtras"],
            )

    def __simpleSchemaDataPrep(self,
                               contentType,
                               filterType,
                               styleType,
                               mockLength,
                               rejectLength=0,
                               dataSelectors=None,
                               mergeContentTypes=None):
        """Internal method for preparing file-based data NOT requiring dynamic methods, slicing, or key injection.

        Args:
            contentType (str): Content type name
            filterType (str): List of data processing options (separated by '|') (e.g. "drop-empty-attributes|drop-empty-tables|skip-max-width|...")
            styleType (str): organization of output document (e.g. rowwise_by_name)
            mockLength (int): Expected length of the test data for the input content type
            rejectLength (int, optional): number of input data sets rejected by the data selection criteria. Defaults to 0.
            dataSelectors (list of str, optional): data selection criteria. Defaults to None.
            mergeContentTypes (list of str, optional): list of content types to merge with the input data set (e.g. ["vrpt"]). Defaults to None.
        """
        try:
            dataSelectors = dataSelectors if dataSelectors else [
                "PUBLIC_RELEASE"
            ]
            dD = self.__schP.makeSchemaDef(contentType,
                                           dataTyping="ANY",
                                           saveSchema=True)
            _ = SchemaDefAccess(dD)
            inputPathList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContentTypes)
            sd, _, _, _ = self.__schP.getSchemaInfo(databaseName=contentType,
                                                    dataTyping="ANY")
            dtf = DataTransformFactory(schemaDefAccessObj=sd,
                                       filterType=filterType)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd,
                                    dtObj=dtf,
                                    workPath=self.__cachePath,
                                    verbose=self.__verbose)
            #

            logger.debug("For %s mock length %d length of path list %d\n",
                         contentType, mockLength, len(inputPathList))
            self.assertGreaterEqual(len(inputPathList), mockLength)
            tableDataDictList, containerNameList, rejectList = sdp.fetchDocuments(
                inputPathList,
                styleType=styleType,
                filterType=filterType,
                dataSelectors=dataSelectors)
            logger.debug(
                "For %s mock length %d reject length %d length of tddl list %d\n",
                contentType, mockLength, rejectLength, len(tableDataDictList))
            self.assertGreaterEqual(len(tableDataDictList),
                                    mockLength - rejectLength)
            self.assertGreaterEqual(len(containerNameList),
                                    mockLength - rejectLength)

            if rejectList:
                logger.debug("For %s rejecting components %r", contentType,
                             rejectList)
            #
            self.assertEqual(len(rejectList), rejectLength)
            fName = "simple-prep-%s-%s.json" % (contentType, styleType)
            if self.__exportFlag:
                fPath = os.path.join(self.__outputPath, fName)
                self.__mU.doExport(fPath,
                                   tableDataDictList,
                                   fmt="json",
                                   indent=3)
            if self.__diffFlag:
                fPath = os.path.join(self.__savedOutputPath, fName)
                refDocList = self.__mU.doImport(fPath, fmt="json")
                self.assertEqual(len(refDocList), len(tableDataDictList))
                #
                jD = diff(refDocList,
                          tableDataDictList,
                          syntax="explicit",
                          marshal=True)
                if jD:
                    _, fn = os.path.split(fPath)
                    bn, _ = os.path.splitext(fn)
                    fPath = os.path.join(self.__outputPath, bn + "-diff.json")
                    logger.debug("jsondiff for %s %s = \n%s", contentType,
                                 styleType,
                                 pprint.pformat(jD, indent=3, width=100))
                    self.__mU.doExport(fPath, jD, fmt="json", indent=3)
                self.assertEqual(len(jD), 0)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
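
    # The diff/export logic above relies on jsondiff semantics: with
    # syntax="explicit" the result enumerates insert/update/delete operations,
    # and marshal=True renders them with plain JSON-serializable keys, so an
    # empty result dict means the prepared documents match the saved reference
    # exactly (hence assertEqual(len(jD), 0)).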

    def __logDocumentOrder(self, docList):
        for doc in docList:
            logger.debug("keys %r", list(doc.keys()))

    def __filterDocuments(self, docList, excludeList=None):
        excludeList = excludeList if excludeList else []
        for doc in docList:
            for excl in excludeList:
                if excl in doc:
                    del doc[excl]

    def __fullSchemaDataPrep(self,
                             contentType,
                             filterType,
                             styleType,
                             mockLength,
                             rejectLength=0,
                             dataSelectors=None,
                             mergeContentTypes=None,
                             excludeExtras=None):
        """Internal method for preparing file-based data requiring dynamic methods, slicing, or key injection.

        Args:
            contentType (str): Content type name
            filterType (str): data processing options separated by '|' (e.g. "drop-empty-attributes|drop-empty-tables|skip-max-width|...")
            styleType (str): organization of the output document (e.g. "rowwise_by_name")
            mockLength (int): expected length of the test data for the input content type
            rejectLength (int, optional): number of input data sets rejected by the data selection criteria. Defaults to 0.
            dataSelectors (list of str, optional): data selection criteria. Defaults to None.
            mergeContentTypes (list of str, optional): list of content types to merge with the input data set (e.g. ["vrpt"]). Defaults to None.
            excludeExtras (list of str, optional): document keys removed before comparison (e.g. timestamped items). Defaults to None.
        """
        try:
            excludeExtras = excludeExtras if excludeExtras else []
            _ = mockLength
            _ = rejectLength
            dD = self.__schP.makeSchemaDef(contentType,
                                           dataTyping="ANY",
                                           saveSchema=True)
            _ = SchemaDefAccess(dD)
            inputPathList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContentTypes)
            sd, _, collectionNameList, _ = self.__schP.getSchemaInfo(
                databaseName=contentType, dataTyping="ANY")
            #
            dP = DictionaryApiProviderWrapper(self.__cachePath,
                                              cfgOb=self.__cfgOb,
                                              configName=self.__configName,
                                              useCache=True)
            dictApi = dP.getApiByName(contentType)
            #
            rP = DictMethodResourceProvider(
                self.__cfgOb,
                configName=self.__configName,
                cachePath=self.__cachePath,
                restoreUseStash=False,
                restoreUseGit=True,
                providerTypeExclude=self.__excludeType,
            )
            dmh = DictMethodRunner(dictApi,
                                   modulePathMap=self.__modulePathMap,
                                   resourceProvider=rP)
            #
            dtf = DataTransformFactory(schemaDefAccessObj=sd,
                                       filterType=filterType)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd,
                                    dtObj=dtf,
                                    workPath=self.__cachePath,
                                    verbose=self.__verbose)
            containerList = self.__rpP.getContainerList(inputPathList)
            for container in containerList:
                cName = container.getName()
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
            #
            for collectionName in collectionNameList:
                tableIdExcludeList = sd.getCollectionExcluded(collectionName)
                tableIdIncludeList = sd.getCollectionSelected(collectionName)
                sliceFilter = sd.getCollectionSliceFilter(collectionName)
                sdp.setSchemaIdExcludeList(tableIdExcludeList)
                sdp.setSchemaIdIncludeList(tableIdIncludeList)
                #
                docList, _, _ = sdp.processDocuments(
                    containerList,
                    styleType=styleType,
                    sliceFilter=sliceFilter,
                    filterType=filterType,
                    dataSelectors=dataSelectors,
                    collectionName=collectionName)

                docList = sdp.addDocumentPrivateAttributes(
                    docList, collectionName)
                docList = sdp.addDocumentSubCategoryAggregates(
                    docList, collectionName)

                # Special exclusions for the test harness. (removes timestamped data items to allow diffs.)
                self.__filterDocuments(docList, excludeExtras)
                mergeS = "-".join(
                    mergeContentTypes) if mergeContentTypes else ""
                fName = "full-prep-%s-%s-%s-%s.json" % (
                    contentType, collectionName, mergeS, styleType)
                if self.__exportFlag:
                    self.__logDocumentOrder(docList)
                    fPath = os.path.join(self.__outputPath, fName)
                    self.__mU.doExport(fPath, docList, fmt="json", indent=3)
                    logger.debug("Exported %r", fPath)
                #
                if self.__diffFlag:
                    fPath = os.path.join(self.__savedOutputPath, fName)
                    refDocList = self.__mU.doImport(fPath, fmt="json")
                    self.assertEqual(len(refDocList), len(docList))
                    logger.debug("For %s %s len refDocList %d", contentType,
                                 collectionName, len(refDocList))
                    logger.debug("For %s %s len docList %d", contentType,
                                 collectionName, len(docList))
                    jD = diff(refDocList,
                              docList,
                              syntax="explicit",
                              marshal=True)
                    if jD:
                        _, fn = os.path.split(fPath)
                        bn, _ = os.path.splitext(fn)
                        fPath = os.path.join(self.__outputPath,
                                             bn + "-diff.json")
                        logger.debug("jsondiff for %s %s = \n%s", contentType,
                                     collectionName,
                                     pprint.pformat(jD, indent=3, width=100))
                        self.__mU.doExport(fPath, jD, fmt="json", indent=3)
                    self.assertEqual(len(jD), 0)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
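

# Condensed sketch of the full-prep pipeline exercised above, assuming cfgOb,
# configName, cachePath, schP (SchemaProvider), rpP (RepositoryProvider),
# dictApi, rP, and modulePathMap are prepared as in the test fixtures:
#
#   sd, _, collectionNameList, _ = schP.getSchemaInfo(databaseName="pdbx_core", dataTyping="ANY")
#   dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=filterType)
#   sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=cachePath, verbose=True)
#   dmh = DictMethodRunner(dictApi, modulePathMap=modulePathMap, resourceProvider=rP)
#   containerList = rpP.getContainerList(rpP.getLocatorObjList(contentType="pdbx_core"))
#   for container in containerList:
#       dmh.apply(container)                      # dictionary methods run in place
#   for collectionName in collectionNameList:
#       docList, _, _ = sdp.processDocuments(containerList, styleType="rowwise_by_name",
#                                            collectionName=collectionName)
#       docList = sdp.addDocumentPrivateAttributes(docList, collectionName)
#       docList = sdp.addDocumentSubCategoryAggregates(docList, collectionName)
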
class RepositoryProviderTests(unittest.TestCase):
    def setUp(self):
        #
        #
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(mockTopPath, "config",
                                  "dbload-setup-example.yml")
        self.__configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=self.__configName,
                                  mockTopPath=mockTopPath)
        self.__cachePath = os.path.join(TOPDIR, "CACHE")

        self.__numProc = 2
        self.__chunkSize = 20
        self.__fileLimit = None
        #
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)\n", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testRepoUtils(self):
        """Test case - repository locator path utilities"""
        for contentType in ["bird_chem_comp_core", "pdbx_core", "ihm_dev"]:
            mergeContentTypes = None
            if contentType in ["pdbx_core"]:
                mergeContentTypes = ["vrpt"]
            #
            locatorObjList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContentTypes)
            pathList = self.__rpP.getLocatorPaths(locatorObjList)
            locatorObjList2 = self.__rpP.getLocatorsFromPaths(
                locatorObjList, pathList)
            logger.info("%s pathList length %d", contentType, len(pathList))
            self.assertEqual(len(locatorObjList), len(pathList))
            self.assertEqual(len(locatorObjList), len(locatorObjList2))
            #
        for contentType in ["bird_chem_comp_core", "pdbx_core", "ihm_dev"]:
            mergeContentTypes = None
            if contentType in ["pdbx_core"]:
                mergeContentTypes = ["vrpt"]
            #
            locatorObjList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContentTypes)
            pathList = self.__rpP.getLocatorPaths(locatorObjList)
            self.assertEqual(len(locatorObjList), len(pathList))
            #
            lCount = len(pathList)
            idCodes = self.__rpP.getLocatorIdcodes(contentType, locatorObjList)
            self.assertEqual(len(locatorObjList), len(idCodes))
            excludeList = idCodes[:int(len(idCodes) / 2)]
            logger.debug("excludeList (%d) %r", len(excludeList), excludeList)
            fL = self.__rpP.getLocatorObjList(
                contentType=contentType,
                mergeContentTypes=mergeContentTypes,
                excludeIds=excludeList)
            logger.debug("fL (%d)", len(fL))
            self.assertEqual(lCount, len(fL) + len(excludeList))


class DictMethodRunnerTests(unittest.TestCase):
    def setUp(self):
        self.__export = True
        self.__numProc = 2
        self.__fileLimit = 200
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        configPath = os.path.join(mockTopPath, "config",
                                  "dbload-setup-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        #
        self.__testCaseList = [
            {
                "contentType": "pdbx_core",
                "mockLength": 50,
                "mergeContent": ["vrpt"]
            },
            {
                "contentType": "bird_chem_comp_core",
                "mockLength": 17,
                "mergeContent": None
            },
        ]
        #
        self.__modulePathMap = self.__cfgOb.get(
            "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def __runContentType(self, contentType, mockLength, mergeContent):
        """Read and process test fixture data files from the input content type."""
        try:
            dP = DictionaryApiProviderWrapper(self.__cfgOb,
                                              self.__cachePath,
                                              useCache=True)
            dictApi = dP.getApiByName(contentType)
            rP = DictMethodResourceProvider(self.__cfgOb,
                                            configName=self.__configName,
                                            cachePath=self.__cachePath,
                                            siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi,
                                   modulePathMap=self.__modulePathMap,
                                   resourceProvider=rP)
            locatorObjList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContent)
            containerList = self.__rpP.getContainerList(locatorObjList)
            #
            logger.debug("Length of locator list %d\n", len(locatorObjList))
            self.assertGreaterEqual(len(locatorObjList), mockLength)
            for container in containerList:
                cName = container.getName()
                #
                # if cName not in ["1B5F"]:
                #    continue
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output",
                                            cName + "-with-method.cif")
                    self.__mU.doExport(savePath, [container], fmt="mmcif")

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testMethodRunner(self):
        """Test method runner for multiple content types."""
        for tD in self.__testCaseList:
            self.__runContentType(tD["contentType"], tD["mockLength"],
                                  tD["mergeContent"])

    def testMethodRunnerSetup(self):
        """Test the setup methods for method runner class"""
        try:
            dP = DictionaryApiProviderWrapper(self.__cfgOb,
                                              self.__cachePath,
                                              useCache=True)
            dictApi = dP.getApiByName("pdbx")
            rP = DictMethodResourceProvider(self.__cfgOb,
                                            configName=self.__configName,
                                            cachePath=self.__cachePath,
                                            siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi,
                                   modulePathMap=self.__modulePathMap,
                                   resourceProvider=rP)
            ok = dmh is not None
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Example #8
0
class NeighborInteractionProvider(object):
    """Generators and accessors for non-polymer instance target interactions."""
    def __init__(self, cfgOb, configName, cachePath, **kwargs):
        #
        self.__version = __version__
        self.__cfgOb = cfgOb
        self.__configName = configName
        self.__cachePath = cachePath
        self.__fileLimit = kwargs.get("fileLimit", None)
        self.__dirPath = os.path.join(cachePath, "neighbor-interactions")
        self.__numProc = kwargs.get("numProc", 2)
        self.__chunkSize = kwargs.get("chunkSize", 10)
        useCache = kwargs.get("useCache", True)
        #
        #  - Configuration for stash services -
        #    Local target directory name to be stashed.  (subdir of dirPath)
        #
        self.__stashDir = "ligand-target-neighbors"
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        self.__neighborD = self.__reload(fmt="pickle", useCache=useCache)
        #

    def testCache(self, minCount=0):
        try:
            if minCount == 0:
                return True
            if self.__neighborD and minCount and len(
                    self.__neighborD["entries"]) >= minCount:
                logger.info(
                    "Target neighbor data for (%d) entries created %r version %r",
                    len(self.__neighborD["entries"]),
                    self.__neighborD["created"], self.__neighborD["version"])
                return True
        except Exception:
            pass
        return False

    def getLigandNeighborIndex(self, entryId):
        """Return the target neighbors for the non-polymer instances for the input entry.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {ligandAsymId: {(targetAsymId, targetAuthSeqId): nnIndex1, ...}, ...}
        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["ligandNeighborIndexD"]
        except Exception:
            pass
        return {}
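
    # Illustrative layout of the cached payload read by the accessors in this
    # class; the keys follow from generate() and the accessors below, but the
    # entry id and values are hypothetical:
    #   self.__neighborD = {
    #       "version": __version__,
    #       "created": "2021 06 01 12:00:00",
    #       "entries": {
    #           "1ABC": {"ligandNeighborIndexD": {...},
    #                    "targetNeighborIndexD": {...},
    #                    "ligandIsBoundD": {...},
    #                    "ligandAtomCountD": {...},
    #                    "ligandHydrogenAtomCountD": {...},
    #                    "nearestNeighbors": [LigandTargetInstance(...), ...]},
    #       },
    #   }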

    def getTargetNeighborIndex(self, entryId):
        """Return the ligand neighbors for the polymer or branched entity instances in the input entry.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {(targetAsymId, targetAuthSeqId): {ligandAsymId: nnIndex1, ...}, ...}

        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["targetNeighborIndexD"]
        except Exception:
            pass
        return {}

    def getNearestNeighborList(self, entryId):
        """Return the list of neares neighbors for the entry.

        Args:
            entryId (str): entry identifier

        Returns:
            list: [LigandTargetInstance(), ...]

        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["nearestNeighbors"]
        except Exception:
            pass
        return []

    def getLigandNeighborBoundState(self, entryId):
        """Return the dicitonary of ligand instances with isBound boolean status.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {ligandAsymId: True if the ligand is bound, ...}
        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["ligandIsBoundD"]
        except Exception:
            pass
        return {}

    def getAtomCounts(self, entryId):
        """Return the non-polymer instance atom counts for the input entry (all reported atoms).

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {asymId: {'FL': count, 'altA': count, 'altB': count, ... }}
        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["ligandAtomCountD"]
        except Exception:
            pass
        return {}

    def getHydrogenAtomCounts(self, entryId):
        """Return the non-polymer instance hydrogen atom counts for the input entry.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {asymId: {'FL': count, 'altA': count, 'altB': count, ... }}
        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["ligandHydrogenAtomCountD"]
        except Exception:
            pass
        return {}

    def hasEntry(self, entryId):
        """Return if the input entry is stored in the cache of non-polymer instance target interactions.

        Args:
            entryId (str): entry identifier

        Returns:
            (bool): True if entry is in the cache or False otherwise
        """
        try:
            return entryId in self.__neighborD["entries"]
        except Exception:
            pass
        return False

    def getEntries(self):
        """Return a list of entry identifier for which non-polymer instance target interactions are stored.

        Returns:
            (list): [entryId, entryId, ... ]
        """
        try:
            return list(self.__neighborD["entries"].keys())
        except Exception:
            pass
        return []

    def generate(self,
                 distLimit=5.0,
                 updateOnly=False,
                 fmt="pickle",
                 indent=0):
        """Generate and export non-polymer target interactions for all of the structures in the repository.

        Args:
            distLimit (float, optional): interaction distance. Defaults to 5.0.
            updateOnly (bool):  only calculate interactions for new entries.  Defaults to False.
            fmt (str, optional): export file format. Defaults to "pickle".
            indent (int, optional): json format indent. Defaults to 0.

        Returns:
            bool: True for success or False otherwise
        """
        ok = False
        try:
            tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
            tD = self.__calculateNeighbors(distLimit=distLimit,
                                           numProc=self.__numProc,
                                           chunkSize=self.__chunkSize,
                                           updateOnly=updateOnly)
            self.__neighborD = {
                "version": self.__version,
                "created": tS,
                "entries": tD
            }
            kwargs = {
                "indent": indent
            } if fmt == "json" else {
                "pickleProtocol": 4
            }
            targetFilePath = self.__getTargetFilePath(fmt=fmt)
            ok = self.__mU.doExport(targetFilePath,
                                    self.__neighborD,
                                    fmt=fmt,
                                    **kwargs)
            logger.info("Wrote %r status %r", targetFilePath, ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok
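
    # Hedged usage sketch (constructor arguments assumed to be prepared by
    # the caller, as elsewhere in this document):
    #   niP = NeighborInteractionProvider(cfgOb, configName, cachePath, numProc=4)
    #   if niP.generate(distLimit=5.0, updateOnly=True, fmt="pickle"):
    #       niP.toStash()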

    def reload(self, fmt="pickle"):
        self.__neighborD = self.__reload(fmt=fmt, useCache=True)
        return self.__neighborD is not None

    def __reload(self, fmt="pickle", useCache=True):
        """Reload from the current cache file."""
        tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        # Default payload returned when no cache file is available or the
        # import fails; assigned before the try block so the return below
        # cannot raise UnboundLocalError.
        neighborD = {
            "version": self.__version,
            "created": tS,
            "entries": {}
        }
        try:
            targetFilePath = self.__getTargetFilePath(fmt=fmt)
            logger.debug("useCache %r targetFilePath %r", useCache,
                         targetFilePath)
            #
            if useCache and self.__mU.exists(targetFilePath):
                neighborD = self.__mU.doImport(targetFilePath, fmt=fmt)
                if fmt != "pickle":
                    for _, nD in neighborD["entries"].items():
                        nD["nearestNeighbors"] = [
                            LigandTargetInstance(*neighbor)
                            for neighbor in nD["nearestNeighbors"]
                        ]
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return neighborD

    def __getTargetFilePath(self, fmt="pickle"):
        ext = "pic" if fmt == "pickle" else "json"
        pth = os.path.join(self.__dirPath, "ligand-target-neighbors",
                           "neighbor-data." + ext)
        return pth

    def __calculateNeighbors(self,
                             distLimit=5.0,
                             numProc=2,
                             chunkSize=10,
                             updateOnly=False):
        """Calculate non-polymer target interactions for all repository structure files.

        Args:
            distLimit (float, optional): interaction distance limit. Defaults to 5.0.
            numProc (int, optional): number of processes to use. Defaults to 2.
            chunkSize (int, optional): chunk size used to distribute work among processes. Defaults to 10.
            updateOnly (bool, optional): only calculate interactions for entries not already cached. Defaults to False.

        Returns:
            (dict): {entryId: {asymId: [TargetLigandInteraction()], ...}, ...}
        """
        contentType = "pdbx"
        mergeContent = None
        rD = {}
        exD = {}
        #
        # updateOnly - will reuse any existing data loaded when this is instantiated
        #              otherwise the cache context is cleared before the calculation.
        if updateOnly:
            exD = {k: True for k in self.getEntries()}
            rD = self.__neighborD[
                "entries"] if "entries" in self.__neighborD else {}
        #
        locatorObjList = self.__rpP.getLocatorObjList(
            contentType=contentType,
            mergeContentTypes=mergeContent,
            excludeIds=exD)
        logger.info("Starting with %d numProc %d updateOnly (%r)",
                    len(locatorObjList), self.__numProc, updateOnly)
        #
        rWorker = TargetInteractionWorker(self.__rpP)
        mpu = MultiProcUtil(verbose=True)
        optD = {"distLimit": distLimit}
        mpu.setOptions(optD)
        mpu.set(workerObj=rWorker, workerMethod="build")
        ok, failList, resultList, _ = mpu.runMulti(dataList=locatorObjList,
                                                   numProc=numProc,
                                                   numResults=1,
                                                   chunkSize=chunkSize)
        if failList:
            logger.info("Target interaction build failures (%d): %r",
                        len(failList), failList)
        #
        for (entryId, nD) in resultList[0]:
            rD[entryId] = nD
        #
        logger.info(
            "Completed with multi-proc status %r failures %r total entries with data (%d)",
            ok, len(failList), len(rD))
        return rD
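
    # Note on the MultiProcUtil contract assumed above: the worker method
    # ("build" on TargetInteractionWorker) returns a tuple of (successList,
    # one result list per numResults, diagList); with numResults=1,
    # resultList[0] therefore carries the (entryId, nD) pairs merged into rD.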

    def toStash(self):
        ok = False
        try:
            userName = self.__cfgOb.get("_STASH_AUTH_USERNAME",
                                        sectionName=self.__configName)
            password = self.__cfgOb.get("_STASH_AUTH_PASSWORD",
                                        sectionName=self.__configName)
            basePath = self.__cfgOb.get("_STASH_SERVER_BASE_PATH",
                                        sectionName=self.__configName)
            url = self.__cfgOb.get("STASH_SERVER_URL",
                                   sectionName=self.__configName)
            urlFallBack = self.__cfgOb.get("STASH_SERVER_FALLBACK_URL",
                                           sectionName=self.__configName)
            ok1 = self.__toStash(url,
                                 basePath,
                                 userName=userName,
                                 password=password)
            ok2 = self.__toStash(urlFallBack,
                                 basePath,
                                 userName=userName,
                                 password=password)
            ok = ok1 and ok2
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def __toStash(self,
                  url,
                  stashRemoteDirPath,
                  userName=None,
                  password=None,
                  remoteStashPrefix=None):
        """Copy tar and gzipped bundled cache data to remote server/location.

        Args:
            url (str): server URL (e.g. sftp://hostname.domain) None for local host
            stashRemoteDirPath (str): path to target directory on remote server
            userName (str, optional): server username. Defaults to None.
            password (str, optional): server password. Defaults to None.
            remoteStashPrefix (str, optional): channel prefix. Defaults to None.

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            stU = StashUtil(os.path.join(self.__dirPath, "stash"),
                            "ligand-target-neighbors")
            ok = stU.makeBundle(self.__dirPath, [self.__stashDir])
            if ok:
                ok = stU.storeBundle(url,
                                     stashRemoteDirPath,
                                     remoteStashPrefix=remoteStashPrefix,
                                     userName=userName,
                                     password=password)
        except Exception as e:
            logger.error("Failing with url %r stashDirPath %r: %s", url,
                         stashRemoteDirPath, str(e))
        return ok

    def fromStash(self):
        ok = False
        try:
            minCount = 10
            userName = self.__cfgOb.get("_STASH_AUTH_USERNAME",
                                        sectionName=self.__configName)
            password = self.__cfgOb.get("_STASH_AUTH_PASSWORD",
                                        sectionName=self.__configName)
            basePath = self.__cfgOb.get("_STASH_SERVER_BASE_PATH",
                                        sectionName=self.__configName)
            url = self.__cfgOb.get("STASH_SERVER_URL",
                                   sectionName=self.__configName)
            #
            ok = self.__fromStash(url,
                                  basePath,
                                  userName=userName,
                                  password=password)
            ok = self.reload()
            ok = self.testCache(minCount=minCount)
            if not ok:
                urlFallBack = self.__cfgOb.get("STASH_SERVER_FALLBACK_URL",
                                               sectionName=self.__configName)
                ok = self.__fromStash(urlFallBack,
                                      basePath,
                                      userName=userName,
                                      password=password)
                ok = self.reload()
                ok = self.testCache(minCount=minCount)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return ok

    def __fromStash(self,
                    url,
                    stashRemoteDirPath,
                    userName=None,
                    password=None,
                    remoteStashPrefix=None):
        """Restore local cache from a tar and gzipped bundle to fetched from a remote server/location.

        Args:
            url (str): server URL (e.g. sftp://hostname.domain) None for local host
            stashRemoteDirPath (str): path to target directory on remote server
            userName (str, optional): server username. Defaults to None.
            password (str, optional): server password. Defaults to None.
            remoteStashPrefix (str, optional): channel prefix. Defaults to None.

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            stU = StashUtil(os.path.join(self.__dirPath, "stash"),
                            "ligand-target-neighbors")
            ok = stU.fetchBundle(self.__dirPath,
                                 url,
                                 stashRemoteDirPath,
                                 remoteStashPrefix=remoteStashPrefix,
                                 userName=userName,
                                 password=password)
        except Exception as e:
            logger.error("Failing with url %r stashDirPath %r: %s", url,
                         stashRemoteDirPath, str(e))
        return ok

    def convert(self, fmt1="json", fmt2="pickle"):
        #
        targetFilePath = self.__getTargetFilePath(fmt=fmt1)
        self.__neighborD = self.__mU.doImport(targetFilePath, fmt=fmt1)
        #
        targetFilePath = self.__getTargetFilePath(fmt=fmt2)
        ok = self.__mU.doExport(targetFilePath,
                                self.__neighborD,
                                fmt=fmt2,
                                pickleProtocol=4)
        return ok
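
    # Format-migration sketch (assumes a previously generated JSON cache file
    # exists under the cache path):
    #   NeighborInteractionProvider(cfgOb, configName, cachePath).convert(fmt1="json", fmt2="pickle")
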
class ScanRepoUtil(object):
    """Tools for scanning repositories and collecting data type and coverage information."""
    def __init__(self,
                 cfgOb,
                 attributeDataTypeD=None,
                 numProc=4,
                 chunkSize=15,
                 fileLimit=None,
                 maxStepLength=2000,
                 workPath=None):
        """
        Args:
            cfgOb (object): Configuration object (rcsb.utils.config.ConfigUtil)

            attributeDataTypeD
            dictPath (str): Path to supporting data dictionary

            numProc (int, optional): Number of parallel worker processes used.
            chunkSize (int, optional): Size of files processed in a single multi-proc process
            fileLimit (int, optional): maximum file scanned or None for no limit
            mockTopPath (str, optional): Path to directory containing mock repositories or None
            maxStepLength (int, optional): maximum number of multi-proc runs to perform
        """
        #
        self.__attributeDataTypeD = attributeDataTypeD if attributeDataTypeD else {}
        # Limit the load length of each file type for testing  -  Set to None to remove -
        self.__fileLimit = fileLimit
        self.__maxStepLength = maxStepLength
        #
        # Controls for multiprocessing execution -
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        #
        self.__cfgOb = cfgOb
        #
        self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s"

        self.__workPath = workPath
        self.__mU = MarshalUtil(workPath=self.__workPath)
        self.__rpP = RepositoryProvider(self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__workPath)

    def scanContentType(self,
                        contentType,
                        mergeContentTypes=None,
                        scanType="full",
                        inputPathList=None,
                        scanDataFilePath=None,
                        failedFilePath=None,
                        saveInputFileListPath=None):
        """Driver method for repository scan operation

        Args:
            contentType (str): one of 'bird', 'bird_family', 'bird_chem_comp', 'chem_comp', 'pdbx'
            mergeContentTypes (list, optional): content types to merge with the input content type (e.g. ['vrpt'])
            scanType (str, optional): 'full' (rescan all) or 'incr' (merge with existing scan data)
            inputPathList (list, optional): list of input file paths to scan
            scanDataFilePath (str, optional): file path for serialized scan data (Pickle format)
            failedFilePath (str, optional): file path for the list of files that fail the scanning operation
            saveInputFileListPath (str, optional): path to store the file path list that is scanned

        Returns:
            bool: True for success or False otherwise

        """
        try:
            startTime = self.__begin(message="scanning operation")
            #
            locatorObjList = self.__rpP.getLocatorObjList(
                contentType=contentType,
                inputPathList=inputPathList,
                mergeContentTypes=mergeContentTypes)
            #
            if saveInputFileListPath:
                self.__mU.doExport(saveInputFileListPath,
                                   self.__rpP.getLocatorPaths(locatorObjList),
                                   fmt="list")
                logger.debug("Saving %d paths in %s", len(locatorObjList),
                             saveInputFileListPath)
            #
            optD = {}
            optD["contentType"] = contentType
            optD["logSize"] = True
            optD["scanType"] = scanType
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            #
            numProc = self.__numProc
            chunkSize = self.__chunkSize if locatorObjList and self.__chunkSize < len(
                locatorObjList) else 0
            #
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            numPaths = len(locatorObjList)
            logger.debug("Processing %d total paths", numPaths)
            numProc = min(numProc, numPaths)
            maxStepLength = self.__maxStepLength
            if numPaths > maxStepLength:
                numLists = int(numPaths / maxStepLength)
                subLists = [
                    locatorObjList[i::numLists] for i in range(numLists)
                ]
            else:
                subLists = [locatorObjList]
            #
            if subLists:
                logger.debug(
                    "Starting with numProc %d outer subtask count %d subtask length ~ %d",
                    numProc, len(subLists), len(subLists[0]))
            #
            numResults = 1
            failList = []
            retLists = [[] for ii in range(numResults)]
            diagList = []
            for ii, subList in enumerate(subLists):
                logger.info("Running outer subtask %d of %d length %d", ii + 1,
                            len(subLists), len(subList))
                #
                mpu = MultiProcUtil(verbose=True)
                mpu.setOptions(optionsD=optD)
                mpu.set(workerObj=self, workerMethod="scanWorker")
                ok, failListT, retListsT, diagListT = mpu.runMulti(
                    dataList=subList,
                    numProc=numProc,
                    numResults=numResults,
                    chunkSize=chunkSize)
                failList.extend(failListT)
                # retLists is a list of lists -
                logger.debug("status %r fail len %r ret len %r", ok,
                             len(failListT), len(retListsT))
                for jj in range(numResults):
                    retLists[jj].extend(retListsT[jj])
                diagList.extend(diagListT)
            logger.debug("Scan failed path list %r", failList)
            logger.debug(
                "Scan path list success length %d load list failed length %d",
                len(locatorObjList), len(failList))
            logger.debug("Returned metadata length %r", len(retLists[0]))
            #
            if failedFilePath and failList:
                wOk = self.__mU.doExport(failedFilePath,
                                         self.__rpP.getLocatorPaths(failList),
                                         fmt="list")
                logger.debug("Writing scan failure path list to %s status %r",
                             failedFilePath, wOk)
            #
            if scanType == "incr":
                scanDataD = self.__mU.doImport(scanDataFilePath,
                                               fmt="pickle",
                                               default=None) or {}
                logger.debug("Imported scan data with keys %r",
                             list(scanDataD.keys()))
            else:
                scanDataD = {}
            #
            if scanDataFilePath and retLists[0]:
                for ssTup in retLists[0]:
                    cId = ssTup.containerId
                    if scanType == "full" and cId in scanDataD:
                        logger.error("Duplicate container id %s in %r and %r",
                                     cId, ssTup.fromPath,
                                     scanDataD[cId].fromPath)
                    #
                    scanDataD[cId] = ssTup

                ok = self.__mU.doExport(scanDataFilePath,
                                        scanDataD,
                                        fmt="pickle")
                tscanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
                ok = tscanDataD == scanDataD

            self.__end(startTime, "scanning operation with status " + str(ok))

            #
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return False

    def evalScan(self,
                 scanDataFilePath,
                 evalJsonFilePath,
                 evalType="data_type"):

        scanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
        rD = {}
        if evalType in ["data_type"]:
            rD = self.__evalScanDataType(scanDataD)
        elif evalType in ["data_coverage"]:
            rD, _ = self.__evalScanDataCoverage(scanDataD)
        else:
            logger.debug("Unknown evalType %r", evalType)
        ok = self.__mU.doExport(evalJsonFilePath, rD, fmt="json")

        return ok

    def evalScanItem(self, scanDataFilePath, evalFilePath):
        scanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
        _, cL = self.__evalScanDataCoverage(scanDataD)
        ok = self.__mU.doExport(evalFilePath, cL, fmt="list")
        return ok

    def __evalScanDataType(self, scanDataD):
        """
        ScanValue = collections.namedtuple('ScanValue', 'containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec')
        ScanSummary = collections.namedtuple('ScanSummary', 'containerId, fromPath, scanDate, scanCategoryDict')

        """
        # for populated sD[category] -> d[atName]->{minWidth: , maxWidth:, minPrec:, maxPrec: , count}
        sD = {}
        for cId in scanDataD:
            ssTup = scanDataD[cId]
            dD = ssTup.scanCategoryDict
            for catName in dD:
                if catName not in sD:
                    sD[catName] = {}
                for svTup in dD[catName]:
                    if svTup.atName not in sD[catName]:
                        sD[catName][svTup.atName] = {
                            "minWidth": svTup.minWidth,
                            "maxWidth": svTup.maxWidth,
                            "minPrec": svTup.minPrec,
                            "maxPrec": svTup.maxPrec,
                            "count": 1
                        }
                        continue
                    sD[catName][svTup.atName]["minWidth"] = min(
                        sD[catName][svTup.atName]["minWidth"], svTup.minWidth)
                    sD[catName][svTup.atName]["maxWidth"] = max(
                        sD[catName][svTup.atName]["maxWidth"], svTup.maxWidth)
                    sD[catName][svTup.atName]["minPrec"] = min(
                        sD[catName][svTup.atName]["minPrec"], svTup.minPrec)
                    sD[catName][svTup.atName]["maxPrec"] = max(
                        sD[catName][svTup.atName]["maxPrec"], svTup.maxPrec)
                    sD[catName][svTup.atName]["count"] += 1
        return sD
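
    # Example of one summarized attribute produced above (numbers are
    # hypothetical):
    #   sD["atom_site"]["Cartn_x"] = {"minWidth": 5, "maxWidth": 8,
    #                                 "minPrec": 3, "maxPrec": 3, "count": 41120}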

    def __evalScanDataCoverage(self, scanDataD):
        """
        ScanValue = collections.namedtuple('ScanValue', 'containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec')
        ScanSummary = collections.namedtuple('ScanSummary', 'containerId, fromPath, scanDate, scanCategoryDict')

        """

        # for populated sD[category] -> d[atName]->{count: #, instances: [id,id,id]}
        sD = {}
        for cId in scanDataD:
            ssTup = scanDataD[cId]
            dD = ssTup.scanCategoryDict
            for catName in dD:
                if catName not in sD:
                    sD[catName] = {}
                for svTup in dD[catName]:
                    if svTup.atName not in sD[catName]:
                        sD[catName][svTup.atName] = {
                            "count": 0,
                            "instances": []
                        }
                    sD[catName][svTup.atName]["instances"].append(
                        svTup.containerId)
                    sD[catName][svTup.atName]["count"] += 1
        cL = []
        for catName, aD in sD.items():
            for atName, tD in aD.items():
                cL.append("%s\t%s" %
                          ("_" + catName + "." + atName, tD["count"]))
        return sD, cL
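
    # Each line of the coverage list cL pairs an mmCIF item name with its
    # instance count, tab separated (the count here is hypothetical):
    #   "_atom_site.Cartn_x\t41120"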

    def scanWorker(self, dataList, procName, optionsD, workingDir):
        """Multi-proc worker method for scanning repository data files-"""
        try:
            _ = workingDir
            startTime = self.__begin(message=procName)
            # Recover common options

            scanType = optionsD["scanType"]
            contentType = optionsD["contentType"]
            #
            successList = []
            retList = []

            containerList = self.__getContainerList(dataList)
            for container in containerList:
                ret = self.__scanContainer(container)
                successList.append(ret.fromPath)
                retList.append(ret)
            #

            logger.debug(
                "%s scanType %s contentType %s pathlist length %d containerList length %d",
                procName, scanType, contentType, len(dataList),
                len(containerList))

            ok = len(successList) == len(dataList)
            #
            self.__end(startTime, procName + " with status " + str(ok))
            return successList, retList, []

        except Exception as e:
            logger.error("Failing with dataList %r", dataList)
            logger.exception("Failing with %s", str(e))

        return [], [], []

    def __getContainerList(self, locatorObjList):
        """"""
        utcnow = datetime.datetime.utcnow()
        ts = utcnow.strftime("%Y-%m-%d:%H:%M:%S")
        cL = []
        myContainerList = self.__rpP.getContainerList(locatorObjList)
        for loc in locatorObjList:
            myContainerList = self.__rpP.getContainerList([loc])
            lPathL = self.__rpP.getLocatorPaths([loc])
            for cA in myContainerList:
                dc = DataCategory("rcsb_load_status",
                                  ["name", "load_date", "locator"],
                                  [[cA.getName(), ts, lPathL[0]]])
                logger.debug("data category %r", dc)
                cA.append(dc)
                cL.append(cA)
        return cL

    def __scanContainer(self, container):
        """Scan the input container for

        Get the file name -
        """
        cName = container.getName()
        loadStatusObj = container.getObj("rcsb_load_status")
        lName = loadStatusObj.getValue(attributeName="name", rowIndex=0)
        lFilePath = loadStatusObj.getValue(attributeName="locator", rowIndex=0)
        lDate = loadStatusObj.getValue(attributeName="load_date", rowIndex=0)
        #
        oD = {}
        for objName in container.getObjNameList():
            if objName == "rcsb_load_status":
                continue
            obj = container.getObj(objName)
            afD = self.__attributeDataTypeD[
                objName] if objName in self.__attributeDataTypeD else {}
            atNameList = obj.getAttributeList()
            wMin = {atName: 100000 for atName in atNameList}
            wMax = {atName: -1 for atName in atNameList}
            pMin = {atName: 100000 for atName in atNameList}
            pMax = {atName: -1 for atName in atNameList}
            for row in obj.getRowList():
                for ii, val in enumerate(row):
                    valLen = len(val)
                    if (valLen == 0) or (val == "?") or (val == "."):
                        continue
                    atName = atNameList[ii]
                    wMin[atName] = min(wMin[atName], valLen)
                    wMax[atName] = max(wMax[atName], valLen)
                    if atName in afD and afD[atName] == "float":
                        vPrec = 0
                        try:
                            fields = val.split(".")
                            vPrec = len(fields[1])
                            pMin[atName] = min(pMin[atName], vPrec)
                            pMax[atName] = max(pMax[atName], vPrec)
                        except Exception as e:
                            logger.debug("Failed to process float %s %r %r %s",
                                         atName, val, vPrec, str(e))
                            pMin[atName] = 0
                            pMax[atName] = 0
                        logger.debug("Got float for %s %r %r", atName, val,
                                     vPrec)
                    else:
                        pMin[atName] = 0
                        pMax[atName] = 0

            # ScanValue - containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec
            oD[objName] = [
                ScanValue(cName, objName, atN, wMin[atN], wMax[atN], pMin[atN],
                          pMax[atN]) for atN in wMax if wMax[atN] != -1
            ]
        # ScanSummary - containerId, fromPath, scanDate, scanCategoryDict
        #
        ret = ScanSummary(lName, lFilePath, lDate, oD)
        #
        return ret
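
    # Worked example of the width/precision bookkeeping above: a value
    # "-12.3450" contributes valLen 8 to wMin/wMax and, for attributes typed
    # "float", a precision of 4 (digits after the '.') to pMin/pMax.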

    def __begin(self, message=""):
        startTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", message, ts)
        return startTime

    def __end(self, startTime, message=""):
        endTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        delta = endTime - startTime
        logger.debug("Completed %s at %s (%.4f seconds)", message, ts, delta)
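
# Hedged end-to-end sketch (paths, config object, and file names are
# assumptions, not from the source):
#   scanner = ScanRepoUtil(cfgOb, numProc=4, chunkSize=15, workPath=cachePath)
#   scanDataFilePath = os.path.join(cachePath, "scan-data.pic")
#   ok = scanner.scanContentType("pdbx", scanType="full",
#                                scanDataFilePath=scanDataFilePath,
#                                failedFilePath=os.path.join(cachePath, "scan-failures.list"))
#   if ok:
#       scanner.evalScan(scanDataFilePath, os.path.join(cachePath, "scan-eval.json"),
#                        evalType="data_type")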