class SchemaDefLoaderDbTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(SchemaDefLoaderDbTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        self.__verbose = True
        #
        fileLimit = 100
        numProc = 2
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__workPath = os.path.join(HERE, "test-output")
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath)
        self.__resourceName = "MYSQL_DB"
        #
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=numProc, fileLimit=fileLimit, cachePath=self.__cachePath)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def __schemaCreate(self, schemaDefObj):
        """Create table schema using schema definition"""
        try:
            tableIdList = schemaDefObj.getSchemaIdList()
            sqlGen = SqlGenAdmin(self.__verbose)
            sqlL = sqlGen.createDatabaseSQL(schemaDefObj.getDatabaseName())
            for tableId in tableIdList:
                tableDefObj = schemaDefObj.getSchemaObject(tableId)
                sqlL.extend(sqlGen.createTableSQL(databaseName=schemaDefObj.getDatabaseName(), tableDefObj=tableDefObj))
            logger.debug("Schema creation SQL string\n %s\n\n", "\n".join(sqlL))
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                myQ = MyDbQuery(dbcon=client, verbose=self.__verbose)
                #
                # Permit warnings to support "drop table if exists" for missing tables.
                # myQ.setWarning("ignore")
                ret = myQ.sqlCommand(sqlCommandList=sqlL)
                logger.debug("\n\n+INFO mysql server returns %r\n", ret)
                self.assertTrue(ret)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    # -------------------------------------------------------------------------------------------------------
    def testSchemaCreate(self):
        """Create table schema for BIRD, chemical component, and PDBx data."""
        cD = self.__schP.makeSchemaDef("bird", dataTyping="SQL", saveSchema=True)
        sd = SchemaDefAccess(cD)
        self.__schemaCreate(sd)
        #
        cD = self.__schP.makeSchemaDef("chem_comp", dataTyping="SQL", saveSchema=True)
        sd = SchemaDefAccess(cD)
        self.__schemaCreate(sd)
        #
        # The PDBx schema is not optimized for MySQL limitations (see the skipped testLoadPdbxFiles below).
        # cD = self.__schP.makeSchemaDef("pdbx", dataTyping="SQL", saveSchema=True)
        # sd = SchemaDefAccess(cD)
        # self.__schemaCreate(sd)

    def testLoadBirdReference(self):
        try:
            cD = self.__schP.makeSchemaDef("bird", dataTyping="SQL", saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)
            inputPathList = self.__rpP.getLocatorObjList(contentType="bird")
            inputPathList.extend(self.__rpP.getLocatorObjList(contentType="bird_family"))
            #
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                ok = sdl.load(inputPathList=inputPathList, loadType="batch-file")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReLoadBirdReference(self):
        try:
            cD = self.__schP.makeSchemaDef("bird", dataTyping="SQL", saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)
            inputPathList = self.__rpP.getLocatorObjList(contentType="bird")
            inputPathList.extend(self.__rpP.getLocatorObjList(contentType="bird_family"))
            #
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                sdl.load(inputPathList=inputPathList, loadType="batch-file")
                #
                logger.debug("INFO BATCH FILE RELOAD TEST --------------------------------------------\n")
                ok = sdl.load(inputPathList=inputPathList, loadType="batch-file", deleteOpt="all")
                self.assertTrue(ok)
                #
                logger.debug("\n\n\n+INFO BATCH INSERT RELOAD TEST --------------------------------------------\n")
                ok = sdl.load(inputPathList=inputPathList, loadType="batch-file", deleteOpt="selected")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadChemCompReference(self):
        try:
            cD = self.__schP.makeSchemaDef("chem_comp", dataTyping="SQL", saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)
            inputPathList = self.__rpP.getLocatorObjList(contentType="chem_comp")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                ok = sdl.load(inputPathList=inputPathList, loadType="batch-file")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skip("Disable test - schema not optimized for mysql limitations")
    def testLoadPdbxFiles(self):
        try:
            cD = self.__schP.makeSchemaDef("pdbx", dataTyping="SQL", saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)
            inputPathList = self.__rpP.getLocatorObjList(contentType="pdbx")
            logger.debug("Input path list %r", inputPathList)
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                ok = sdl.load(inputPathList=inputPathList, loadType="batch-insert", deleteOpt="all")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
class SchemaDefLoadercrateDbMultiTests(unittest.TestCase): def __init__(self, methodName="runTest"): super(SchemaDefLoadercrateDbMultiTests, self).__init__(methodName) self.__verbose = True self.__createFlag = True def setUp(self): self.__verbose = True self.__numProc = 2 self.__fileLimit = 100 self.__chunkSize = 0 self.__workPath = os.path.join(HERE, "test-output") self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml") configName = "site_info_configuration" self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName) self.__resourceName = "CRATE_DB" self.__schP = SchemaProvider(self.__cfgOb, self.__workPath, useCache=True) self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__workPath) # # self.__tableIdSkipD = { "ATOM_SITE": True, "ATOM_SITE_ANISOTROP": True, "__LOAD_STATUS__": True } self.__ioObj = IoAdapter(verbose=self.__verbose) # self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): endTime = time.time() logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def testConnection(self): try: with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: self.assertNotEqual(client, None) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testSchemaCreate(self): """Create table schema (live) for BIRD, chemical component, and PDBx data.""" try: sd, _, _, _ = self.__schP.getSchemaInfo("bird") ret = self.__schemaCreate(schemaDefObj=sd) self.assertEqual(ret, True) # sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp") ret = self.__schemaCreate(schemaDefObj=sd) self.assertEqual(ret, True) # sd, _, _, _ = self.__schP.getSchemaInfo("pdbx") ret = self.__schemaCreate(schemaDefObj=sd) self.assertEqual(ret, True) # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testSchemaRemove(self): """Remove table schema (live) for BIRD, chemical component, and PDBx data.""" try: sd, _, _, _ = self.__schP.getSchemaInfo("bird") ret = self.__schemaRemove(schemaDefObj=sd) self.assertEqual(ret, True) # sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp") ret = self.__schemaRemove(schemaDefObj=sd) self.assertEqual(ret, True) # sd, _, _, _ = self.__schP.getSchemaInfo("pdbx") ret = self.__schemaRemove(schemaDefObj=sd) self.assertEqual(ret, True) # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testLoadChemCompMulti(self): self.__testLoadFilesMulti("chem_comp") def testLoadBirdMulti(self): self.__testLoadFilesMulti("bird") def testLoadPdbxMulti(self): self.__testLoadFilesMulti("pdbx") def __getPathList(self, fType): pathList = [] if fType == "chem_comp": pathList = self.__rpP.getLocatorObjList("chem_comp") elif fType == "bird": pathList = self.__rpP.getLocatorObjList("bird") pathList.extend(self.__rpP.getLocatorObjList("bird_family")) elif fType == "pdbx": pathList = self.__rpP.getLocatorObjList("pdbx") return pathList def loadInsertMany(self, dataList, procName, optionsD, workingDir): try: _ = workingDir ret = None sd = optionsD["sd"] skipD = optionsD["skip"] ioObj = IoAdapter(verbose=self.__verbose) logger.debug("%s pathlist %r", procName, dataList) with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: sdl = 
CrateDbLoader(schemaDefObj=sd, ioObj=ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose) ret = sdl.load(inputPathList=dataList, loadType="crate-insert-many", deleteOpt="selected", tableIdSkipD=skipD) # all or nothing here if ret: return dataList, dataList, [] else: return [], [], [] except Exception as e: logger.info("Failing with dataList %r", dataList) logger.exception("Failing with %s", str(e)) return [], [], [] def __testLoadFilesMulti(self, contentType): """Test case - create load w/insert-many all chemical component definition data files - (multiproc test)""" numProc = self.__numProc chunkSize = self.__chunkSize try: # sd, _, _, _ = self.__schP.getSchemaInfo(contentType) if self.__createFlag: self.__schemaCreate(schemaDefObj=sd) optD = {} optD["sd"] = sd if contentType == "pdbx": optD["skip"] = self.__tableIdSkipD else: optD["skip"] = {} # pathList = self.__getPathList(fType=contentType) logger.debug("Input path list %r", pathList) mpu = MultiProcUtil(verbose=True) mpu.setOptions(optionsD=optD) mpu.set(workerObj=self, workerMethod="loadInsertMany") ok, _, _, _ = mpu.runMulti(dataList=pathList, numProc=numProc, numResults=1, chunkSize=chunkSize) self.assertEqual(ok, True) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def __schemaCreate(self, schemaDefObj): """Test case - create table schema using schema definition""" ret = 0 try: tableIdList = schemaDefObj.getTableIdList() sqlGen = SqlGenAdmin(self.__verbose, serverType="cratedb") sqlL = [] for tableId in tableIdList: if tableId in self.__tableIdSkipD: continue tableDefObj = schemaDefObj.getTable(tableId) sqlL.extend( sqlGen.createTableSQL( databaseName=schemaDefObj.getVersionedDatabaseName(), tableDefObj=tableDefObj)) logger.debug("Schema creation SQL string\n %s\n\n", "\n".join(sqlL)) logger.info("Creating schema using database %s", schemaDefObj.getVersionedDatabaseName()) # with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: crQ = CrateDbQuery(dbcon=client, verbose=self.__verbose) ret = crQ.sqlCommandList(sqlCommandList=sqlL) logger.debug("Schema create command returns %r\n", ret) return ret # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def __schemaRemove(self, schemaDefObj): """Test case - remove table schema using schema definition""" ret = 0 try: tableIdList = schemaDefObj.getTableIdList() sqlGen = SqlGenAdmin(self.__verbose, serverType="cratedb") sqlL = [] for tableId in tableIdList: if tableId in self.__tableIdSkipD: continue tableDefObj = schemaDefObj.getTable(tableId) sqlL.extend( sqlGen.dropTableSQL( databaseName=schemaDefObj.getVersionedDatabaseName(), tableDefObj=tableDefObj)) sqlL.extend( sqlGen.dropTableSQL( databaseName=schemaDefObj.getDatabaseName(), tableDefObj=tableDefObj)) logger.debug("Schema Remove SQL string\n %s", "\n".join(sqlL)) with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: crQ = CrateDbQuery(dbcon=client, verbose=self.__verbose) ret = crQ.sqlCommandList(sqlCommandList=sqlL) logger.debug("Schema remove command returns %r\n", ret) return ret # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
class SchemaDataPrepValidateTests(unittest.TestCase): def setUp(self): self.__isMac = platform.system() == "Darwin" self.__excludeType = None if self.__isMac else "optional" self.__numProc = 2 # self.__fileLimit = None self.__fileLimit = 20 # self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") self.__cachePath = os.path.join(TOPDIR, "CACHE") self.__configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml") configName = "site_info_configuration" self.__configName = configName self.__cfgOb = ConfigUtil(configPath=self.__configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath) self.__mU = MarshalUtil(workPath=self.__cachePath) self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True) self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath) # self.__birdRepoPath = self.__cfgOb.getPath("BIRD_REPO_PATH", sectionName=configName) # self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs" self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs" self.__verbose = False # self.__modulePathMap = self.__cfgOb.get( "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName) self.__testDirPath = os.path.join(HERE, "test-output", "pdbx-fails") self.__testIhmDirPath = os.path.join(HERE, "test-output", "ihm-files") self.__export = True # self.__extraOpts = None # The following for extended parent/child info - # self.__extraOpts = 'addParentRefs|addPrimaryKey' # self.__alldatabaseNameD = { "ihm_dev": ["ihm_dev"], "pdbx": ["pdbx", "pdbx_ext"], "pdbx_core": [ "pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_polymer_entity_instance", "pdbx_core_nonpolymer_entity_instance", "pdbx_core_branched_entity_instance", "pdbx_core_polymer_entity_instance", "pdbx_core_nonpolymer_entity_instance", "pdbx_core_branched_entity_instance", ], "bird": ["bird"], "bird_family": ["family"], "chem_comp": ["chem_comp"], "bird_chem_comp": ["bird_chem_comp"], "bird_chem_comp_core": ["bird_chem_comp_core"], } self.__databaseNameD = { "bird_chem_comp_core": ["bird_chem_comp_core"], "pdbx_core": [ "pdbx_core_polymer_entity_instance", "pdbx_core_polymer_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_nonpolymer_entity", "pdbx_core_branched_entity", "pdbx_core_polymer_entity_instance", "pdbx_core_branched_entity_instance", ], } self.__databaseNameModelD = { "pdbx_comp_model_core": [ "pdbx_comp_model_core_entry", "pdbx_comp_model_core_assembly", "pdbx_comp_model_core_polymer_entity", "pdbx_comp_model_core_polymer_entity_instance", "pdbx_comp_model_core_nonpolymer_entity", "pdbx_comp_model_core_branched_entity", "pdbx_comp_model_core_branched_entity_instance", ], } self.__mergeContentTypeD = {"pdbx_core": ["vrpt"]} # self.__databaseNameD = {"chem_comp_core": ["chem_comp_core"], "bird_chem_comp_core": ["bird_chem_comp_core"]} # self.__databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]} # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_instance_validation"]} # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_monomer"]} self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): endTime = time.time() logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) 
def __modelFixture(self): fU = FileUtil() modelSourcePath = os.path.join(self.__mockTopPath, "AF") for iPath in glob.iglob(os.path.join(modelSourcePath, "*.cif.gz")): fn = os.path.basename(iPath) uId = fn.split("-")[1] h3 = uId[-2:] h2 = uId[-4:-2] h1 = uId[-6:-4] oPath = os.path.join(self.__cachePath, "computed-models", h1, h2, h3, fn) fU.put(iPath, oPath) def testValidateOptsRepo(self): # schemaLevel = "min" schemaLevel = "full" inputPathList = None eCount = self.__testValidateOpts( databaseNameD=self.__databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD) logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount) # expected errors # pdbx_core_entry (3JWB) path deque(['reflns_shell', 0, 'Rmerge_I_obs']) error: 33.9 is greater than or equal to the maximum of 10.0 self.assertLessEqual(eCount, 2) def testValidateModels(self): self.__modelFixture() schemaLevel = "full" inputPathList = None eCount = self.__testValidateOpts( databaseNameD=self.__databaseNameModelD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD) logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount) self.assertLessEqual(eCount, 0) @unittest.skip("Disable troubleshooting test") def testValidateOptsList(self): schemaLevel = "min" inputPathList = self.__mU.doImport( os.path.join(HERE, "test-output", "failed-path.list"), "list") # inputPathList = glob.glob(self.__testDirPath + "/*.cif") if not inputPathList: return True databaseNameD = { "pdbx_core": [ "pdbx_core_entity", "pdbx_core_entry", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation" ] } for ii, subList in enumerate(chunkList(inputPathList[::-1], 40)): if ii < 5: continue eCount = self.__testValidateOpts( databaseNameD=databaseNameD, inputPathList=subList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD) logger.info( "Chunk %d total validation errors schema level %s : %d", ii, schemaLevel, eCount) # self.assertGreaterEqual(eCount, 20) @unittest.skip("Disable IHM troubleshooting test") def testValidateOptsIhmRepo(self): schemaLevel = "min" inputPathList = None self.__export = False databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]} databaseNameD = {"ihm_dev": ["ihm_dev"]} eCount = self.__testValidateOpts( databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD) logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount) # self.assertGreaterEqual(eCount, 20) # @unittest.skip("Disable IHM troubleshooting test") def testValidateOptsIhmList(self): schemaLevel = "full" inputPathList = glob.glob(self.__testIhmDirPath + "/*.cif") if not inputPathList: return True databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]} eCount = self.__testValidateOpts( databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD) logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount) # self.assertGreaterEqual(eCount, 20) # def __testValidateOpts(self, databaseNameD, inputPathList=None, schemaLevel="full", mergeContentTypeD=None): # eCount = 0 for databaseName in databaseNameD: mergeContentTypes = mergeContentTypeD[ databaseName] if databaseName in mergeContentTypeD else None _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True) pthList = inputPathList if inputPathList else self.__rpP.getLocatorObjList( 
databaseName, mergeContentTypes=mergeContentTypes) for collectionName in databaseNameD[databaseName]: cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True, extraOpts=None) # dL, cnL = self.__testPrepDocumentsFromContainers( pthList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=mergeContentTypes) # Raises exceptions for schema compliance. try: Draft4Validator.check_schema(cD) except Exception as e: logger.error("%s %s schema validation fails with %s", databaseName, collectionName, str(e)) # valInfo = Draft4Validator(cD, format_checker=FormatChecker()) logger.info("Validating %d documents from %s %s", len(dL), databaseName, collectionName) for ii, dD in enumerate(dL): logger.debug("Schema %s collection %s document %d", databaseName, collectionName, ii) try: cCount = 0 for error in sorted(valInfo.iter_errors(dD), key=str): logger.info( "schema %s collection %s (%s) path %s error: %s", databaseName, collectionName, cnL[ii], error.path, error.message) logger.debug("Failing document %d : %r", ii, list(dD.items())) eCount += 1 cCount += 1 if cCount > 0: logger.info( "schema %s collection %s container %s error count %d", databaseName, collectionName, cnL[ii], cCount) except Exception as e: logger.exception("Validation processing error %s", str(e)) return eCount def __testPrepDocumentsFromContainers( self, inputPathList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=None): """Test case - create loadable PDBx data from repository files""" try: sd, _, _, _ = self.__schP.getSchemaInfo(databaseName) # dP = DictionaryApiProviderWrapper(self.__cachePath, cfgOb=self.__cfgOb, configName=self.__configName, useCache=True) dictApi = dP.getApiByName(databaseName) rP = DictMethodResourceProvider( self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, restoreUseStash=False, restoreUseGit=True, providerTypeExclude=self.__excludeType, ) dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP) # dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=self.__fTypeRow) sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose) containerList = self.__rpP.getContainerList(inputPathList) for container in containerList: cName = container.getName() logger.debug("Processing container %s", cName) dmh.apply(container) if self.__export: savePath = os.path.join(HERE, "test-output", cName + "-with-method.cif") self.__mU.doExport(savePath, [container], fmt="mmcif") # tableIdExcludeList = sd.getCollectionExcluded(collectionName) tableIdIncludeList = sd.getCollectionSelected(collectionName) sliceFilter = sd.getCollectionSliceFilter(collectionName) sdp.setSchemaIdExcludeList(tableIdExcludeList) sdp.setSchemaIdIncludeList(tableIdIncludeList) # logger.debug("%s (%r) exclude list %r", collectionName, sliceFilter, tableIdExcludeList) logger.debug("%s (%r) include list %r", collectionName, sliceFilter, tableIdIncludeList) docList, containerNameList, _ = sdp.processDocuments( containerList, styleType=styleType, filterType=self.__fTypeRow, dataSelectors=["PUBLIC_RELEASE"], sliceFilter=sliceFilter, collectionName=collectionName) docList = sdp.addDocumentPrivateAttributes(docList, collectionName) docList = sdp.addDocumentSubCategoryAggregates( docList, collectionName) # mergeS = "-".join(mergeContentTypes) if mergeContentTypes else "" if self.__export and docList: fp = os.path.join( 
HERE, "test-output", "prep-%s-%s-%s.json" % (databaseName, collectionName, mergeS)) self.__mU.doExport(fp, docList, fmt="json", indent=3) logger.debug("Exported %r", fp) # return docList, containerNameList except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
class CockroachDbLoaderCockroachDbTests(unittest.TestCase): def __init__(self, methodName="runTest"): super(CockroachDbLoaderCockroachDbTests, self).__init__(methodName) self.__verbose = True self.__createFlag = False def setUp(self): self.__verbose = True self.__numProc = 2 self.__fileLimit = 100 self.__workPath = os.path.join(HERE, "test-output") self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml") configName = "site_info_configuration" self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName) self.__resourceName = "COCKROACH_DB" self.__schP = SchemaProvider(self.__cfgOb, self.__workPath, useCache=True) self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__workPath) # self.__tableIdSkipD = {"ATOM_SITE": True, "ATOM_SITE_ANISOTROP": True} self.__ioObj = IoAdapter(verbose=self.__verbose) # self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): endTime = time.time() logger.debug("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def testConnection(self): try: with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: self.assertNotEqual(client, None) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testSchemaCreate(self): """Create table schema (live) for BIRD, chemical component, and PDBx data.""" try: sd, _, _, _ = self.__schP.getSchemaInfo("bird") ret = self.__schemaCreate(schemaDefObj=sd) self.assertEqual(ret, True) # sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp") ret = self.__schemaCreate(schemaDefObj=sd) self.assertEqual(ret, True) # sd, _, _, _ = self.__schP.getSchemaInfo("pdbx") ret = self.__schemaCreate(schemaDefObj=sd) self.assertEqual(ret, True) # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testSchemaRemove(self): """Remove table schema (live) for BIRD, chemical component, and PDBx data.""" try: sd, _, _, _ = self.__schP.getSchemaInfo("bird") ret = self.__schemaRemove(schemaDefObj=sd) self.assertEqual(ret, True) # sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp") ret = self.__schemaRemove(schemaDefObj=sd) self.assertEqual(ret, True) # sd, _, _, _ = self.__schP.getSchemaInfo("pdbx") ret = self.__schemaRemove(schemaDefObj=sd) self.assertEqual(ret, True) # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testLoadInsertBirdReference(self): try: sd, _, _, _ = self.__schP.getSchemaInfo("bird") if self.__createFlag: self.__schemaCreate(schemaDefObj=sd) inputPathList = self.__rpP.getLocatorObjList("bird") inputPathList.extend(self.__rpP.getLocatorObjList("bird_family")) with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose) ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert", deleteOpt="selected") self.assertEqual(ret, True) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testLoadInsertManyBirdReference(self): try: sd, _, _, _ = self.__schP.getSchemaInfo("bird") if self.__createFlag: self.__schemaCreate(schemaDefObj=sd) inputPathList = 
self.__rpP.getLocatorObjList("bird") inputPathList.extend(self.__rpP.getLocatorObjList("bird_family")) with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose) ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert-many", deleteOpt="selected") self.assertEqual(ret, True) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testLoadInsertChemCompReference(self): try: sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp") if self.__createFlag: self.__schemaCreate(schemaDefObj=sd) inputPathList = self.__rpP.getLocatorObjList("chem_comp") with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose) ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert", deleteOpt="selected") self.assertEqual(ret, True) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testLoadInsertManyChemCompReference(self): try: sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp") if self.__createFlag: self.__schemaCreate(schemaDefObj=sd) inputPathList = self.__rpP.getLocatorObjList("chem_comp") with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose) ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert-many", deleteOpt="selected") self.assertEqual(ret, True) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testLoadInsertPdbxExampleFiles(self): try: sd, _, _, _ = self.__schP.getSchemaInfo("pdbx") if self.__createFlag: self.__schemaCreate(schemaDefObj=sd) inputPathList = self.__rpP.getLocatorObjList("pdbx") with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose) ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert", deleteOpt="selected", tableIdSkipD=self.__tableIdSkipD) self.assertEqual(ret, True) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testLoadInsertManyPdbxExampleFiles(self): try: sd, _, _, _ = self.__schP.getSchemaInfo("pdbx") if self.__createFlag: self.__schemaCreate(schemaDefObj=sd) inputPathList = self.__rpP.getLocatorObjList("pdbx") with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose) ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert-many", deleteOpt="selected", tableIdSkipD=self.__tableIdSkipD) self.assertEqual(ret, True) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def __schemaCreateSQL(self, schemaDefObj): """Test case - create table schema using schema definition""" sqlL = [] try: tableIdList = schemaDefObj.getTableIdList() sqlGen = SqlGenAdmin(self.__verbose, serverType="CockroachDb") dbName = schemaDefObj.getVersionedDatabaseName() sqlL = 
sqlGen.createDatabaseSQL(dbName) for tableId in tableIdList: tableDefObj = schemaDefObj.getTable(tableId) sqlL.extend(sqlGen.createTableSQL(databaseName=schemaDefObj.getVersionedDatabaseName(), tableDefObj=tableDefObj)) logger.debug("\nSchema creation SQL string\n %s\n\n", "\n".join(sqlL)) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() return sqlL def __schemaCreate(self, schemaDefObj): """Test case - create table schema using schema definition""" ret = 0 try: tableIdList = schemaDefObj.getTableIdList() sqlGen = SqlGenAdmin(self.__verbose, serverType="CockroachDb") dbName = schemaDefObj.getVersionedDatabaseName() sqlL = sqlGen.createDatabaseSQL(dbName) for tableId in tableIdList: tableDefObj = schemaDefObj.getTable(tableId) sqlL.extend(sqlGen.createTableSQL(databaseName=schemaDefObj.getVersionedDatabaseName(), tableDefObj=tableDefObj)) logger.debug("\nSchema creation SQL string\n %s\n\n", "\n".join(sqlL)) logger.info("Creating schema using database %s", schemaDefObj.getVersionedDatabaseName()) # with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: crQ = CockroachDbQuery(dbcon=client, verbose=self.__verbose) ret = crQ.sqlCommandList(sqlCommandList=sqlL) # ret = crQ.sqlCommand(' '.join(sqlL)) logger.info("Schema create command returns %r\n", ret) return ret # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def __schemaRemove(self, schemaDefObj): """Test case - remove table schema using schema definition""" ret = 0 try: dbName = schemaDefObj.getVersionedDatabaseName() sqlGen = SqlGenAdmin(self.__verbose, serverType="CockroachDb") sqlL = sqlGen.removeDatabaseSQL(dbName) logger.debug("Schema Remove SQL string\n %s", "\n".join(sqlL)) with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: crQ = CockroachDbQuery(dbcon=client, verbose=self.__verbose) ret = crQ.sqlCommandList(sqlCommandList=sqlL) # ret = crQ.sqlCommand(' '.join(sqlL)) logger.debug("Schema remove command returns %r\n", ret) return ret # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
class SchemaDefDataPrepTests(unittest.TestCase): def __init__(self, methodName="runTest"): super(SchemaDefDataPrepTests, self).__init__(methodName) self.__loadPathList = [] self.__verbose = True def setUp(self): self.__isMac = platform.system() == "Darwin" self.__excludeType = None if self.__isMac else "optional" self.__numProc = 2 mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") self.__cachePath = os.path.join(TOPDIR, "CACHE") self.__outputPath = os.path.join(HERE, "test-output") self.__savedOutputPath = os.path.join(HERE, "test-saved-output") configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml") configName = "site_info_configuration" self.__configName = configName self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath) self.__mU = MarshalUtil(workPath=self.__cachePath) self.__discoveryMode = self.__cfgOb.get("DISCOVERY_MODE", sectionName=configName, default="local") self.__fileLimit = 100 if self.__discoveryMode == "local" else 10 self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True) self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath) # # self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs" self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs" self.__chemCompMockLen = 24 self.__pdbxMockLen = 30 # removes timestamped data items to allow diffs.) excludeExtras = ["rcsb_load_status"] # excludeExtras = [] # self.__verbose = True self.__modulePathMap = self.__cfgOb.get( "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName) # self.__exportFlag = True self.__diffFlag = False # self.__simpleTestCaseList = [ { "contentType": "chem_comp", "mockLength": self.__chemCompMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_by_name", "mergeContentTypes": None, "rejectLength": 2, }, { "contentType": "chem_comp", "mockLength": self.__chemCompMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_no_name", "mergeContentTypes": None, "rejectLength": 2, }, { "contentType": "chem_comp", "mockLength": self.__chemCompMockLen, "filterType": self.__fTypeCol, "styleType": "columnwise_by_name", "mergeContentTypes": None, "rejectLength": 2, }, { "contentType": "chem_comp", "mockLength": self.__chemCompMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_by_name", "mergeContentTypes": None, "rejectLength": 2, }, { "contentType": "pdbx_core", "mockLength": self.__pdbxMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_by_name", "mergeContentTypes": None, "rejectLength": 0, }, ] # self.__fullTestCaseList = [ { "contentType": "pdbx_core", "mockLength": self.__pdbxMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_by_name_with_cardinality", "mergeContentTypes": ["vrpt"], "rejectLength": 0, "excludeExtras": excludeExtras, }, { "contentType": "bird_chem_comp_core", "mockLength": self.__chemCompMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_by_name_with_cardinality", "mergeContentTypes": None, "rejectLength": 2, "excludeExtras": excludeExtras, }, ] # self.__fullTestCaseListA = [ { "contentType": "pdbx_core", "mockLength": self.__pdbxMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_by_name_with_cardinality", "mergeContentTypes": ["vrpt"], "rejectLength": 0, "excludeExtras": excludeExtras, }, ] # self.__startTime = time.time() 
logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): unitS = "MB" if platform.system() == "Darwin" else "GB" rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss logger.info("Maximum resident memory size %.4f %s", rusageMax / 1.0e6, unitS) endTime = time.time() logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def __timeStep(self, msg): endTime = time.time() logger.info("Completed %s at %s (%.4f seconds)", msg, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def testSimpleSchemaDefDataPrep(self): for tcD in self.__simpleTestCaseList: rejectLength = 0 if self.__discoveryMode == "remote" else tcD[ "rejectLength"] mockLength = self.__fileLimit if self.__discoveryMode == "remote" else tcD[ "mockLength"] if tcD["contentType"] == "bird_chem_comp_core" and self.__discoveryMode == "remote": logger.info("Skipping %r in discovery mode %r", tcD["contentType"], self.__discoveryMode) continue self.__simpleSchemaDataPrep( tcD["contentType"], tcD["filterType"], tcD["styleType"], mockLength, rejectLength=rejectLength, mergeContentTypes=tcD["mergeContentTypes"]) def testFullSchemaDefDataPrep(self): for tcD in self.__fullTestCaseList: rejectLength = 0 if self.__discoveryMode == "remote" else tcD[ "rejectLength"] mockLength = self.__fileLimit if self.__discoveryMode == "remote" else tcD[ "mockLength"] if tcD["contentType"] == "bird_chem_comp_core" and self.__discoveryMode == "remote": logger.info("Skipping %r in discovery mode %r", tcD["contentType"], self.__discoveryMode) continue self.__fullSchemaDataPrep( tcD["contentType"], tcD["filterType"], tcD["styleType"], mockLength, rejectLength=rejectLength, mergeContentTypes=tcD["mergeContentTypes"], excludeExtras=tcD["excludeExtras"], ) def __simpleSchemaDataPrep(self, contentType, filterType, styleType, mockLength, rejectLength=0, dataSelectors=None, mergeContentTypes=None): """Internal method for preparing file-based data NOT requiring dynamic methods, slicing, or key injection. Args: contentType (str): Content type name filterType (str): List of data processing options (separated by '|') (e.g. "drop-empty-attributes|drop-empty-tables|skip-max-width|...) styleType (str): organization of output document (e.g. rowise-by-name) mockLength (int): Expected length of the test data for the input content type rejectLength (int, optional): number of input data sets rejected by the dataselection criteria. Defaults to 0. dataSelectors (list of str, optional): data selection criteria. Defaults to None. mergeContentTypes (list of str, optional): list content types to merge with the input data set. Defaults to None. (e.g. 
['vrpt']) """ try: dataSelectors = dataSelectors if dataSelectors else [ "PUBLIC_RELEASE" ] dD = self.__schP.makeSchemaDef(contentType, dataTyping="ANY", saveSchema=True) _ = SchemaDefAccess(dD) inputPathList = self.__rpP.getLocatorObjList( contentType=contentType, mergeContentTypes=mergeContentTypes) sd, _, _, _ = self.__schP.getSchemaInfo(databaseName=contentType, dataTyping="ANY") dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=filterType) sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose) # logger.debug("For %s mock length %d length of path list %d\n", contentType, mockLength, len(inputPathList)) self.assertGreaterEqual(len(inputPathList), mockLength) tableDataDictList, containerNameList, rejectList = sdp.fetchDocuments( inputPathList, styleType=styleType, filterType=filterType, dataSelectors=dataSelectors) logger.debug( "For %s mock length %d reject length %d length of tddl list %d\n", contentType, mockLength, rejectLength, len(tableDataDictList)) self.assertGreaterEqual(len(tableDataDictList), mockLength - rejectLength) self.assertGreaterEqual(len(containerNameList), mockLength - rejectLength) if rejectList: logger.debug("For %s rejecting components %r", contentType, rejectList) # self.assertEqual(len(rejectList), rejectLength) fName = "simple-prep-%s-%s.json" % (contentType, styleType) if self.__exportFlag: fPath = os.path.join(self.__outputPath, fName) self.__mU.doExport(fPath, tableDataDictList, fmt="json", indent=3) if self.__diffFlag: fPath = os.path.join(self.__savedOutputPath, fName) refDocList = self.__mU.doImport(fPath, fmt="json") self.assertEqual(len(refDocList), len(tableDataDictList)) # jD = diff(refDocList, tableDataDictList, syntax="explicit", marshal=True) if jD: _, fn = os.path.split(fPath) bn, _ = os.path.splitext(fn) fPath = os.path.join(self.__outputPath, bn + "-diff.json") logger.debug("jsondiff for %s %s = \n%s", contentType, styleType, pprint.pformat(jD, indent=3, width=100)) self.__mU.doExport(fPath, jD, fmt="json", indent=3) self.assertEqual(len(jD), 0) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def __logDocumentOrder(self, docList): for doc in docList: logger.debug("keys %r", list(doc.keys())) def __filterDocuments(self, docList, excludeList=None): excludeList = excludeList if excludeList else [] for doc in docList: for excl in excludeList: if excl in doc: del doc[excl] def __fullSchemaDataPrep(self, contentType, filterType, styleType, mockLength, rejectLength=0, dataSelectors=None, mergeContentTypes=None, excludeExtras=None): """Internal method for preparing file-based data requiring dynamic methods, slicing, or key injection. Args: contentType (str): Content type name filterType (str): List of data processing options (separated by '|') (e.g. "drop-empty-attributes|drop-empty-tables|skip-max-width|...) styleType (str): organization of output document (e.g. rowise-by-name) mockLength (int): Expected length of the test data for the input content type rejectLength (int, optional): number of input data sets rejected by the dataselection criteria. Defaults to 0. dataSelectors (list of str, optional): data selection criteria. Defaults to None. mergeContentTypes (list of str, optional): list content types to merge with the input data set. Defaults to None. (e.g. 
['vrpt']) """ try: excludeExtras = excludeExtras if excludeExtras else [] _ = mockLength _ = rejectLength dD = self.__schP.makeSchemaDef(contentType, dataTyping="ANY", saveSchema=True) _ = SchemaDefAccess(dD) inputPathList = self.__rpP.getLocatorObjList( contentType=contentType, mergeContentTypes=mergeContentTypes) sd, _, collectionNameList, _ = self.__schP.getSchemaInfo( databaseName=contentType, dataTyping="ANY") # dP = DictionaryApiProviderWrapper(self.__cachePath, cfgOb=self.__cfgOb, configName=self.__configName, useCache=True) dictApi = dP.getApiByName(contentType) # rP = DictMethodResourceProvider( self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, restoreUseStash=False, restoreUseGit=True, providerTypeExclude=self.__excludeType, ) dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP) # dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=filterType) sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose) containerList = self.__rpP.getContainerList(inputPathList) for container in containerList: cName = container.getName() logger.debug("Processing container %s", cName) dmh.apply(container) # for collectionName in collectionNameList: tableIdExcludeList = sd.getCollectionExcluded(collectionName) tableIdIncludeList = sd.getCollectionSelected(collectionName) sliceFilter = sd.getCollectionSliceFilter(collectionName) sdp.setSchemaIdExcludeList(tableIdExcludeList) sdp.setSchemaIdIncludeList(tableIdIncludeList) # docList, _, _ = sdp.processDocuments( containerList, styleType=styleType, sliceFilter=sliceFilter, filterType=filterType, dataSelectors=dataSelectors, collectionName=collectionName) docList = sdp.addDocumentPrivateAttributes( docList, collectionName) docList = sdp.addDocumentSubCategoryAggregates( docList, collectionName) # Special exclusions for the test harness. (removes timestamped data items to allow diffs.) self.__filterDocuments(docList, excludeExtras) mergeS = "-".join( mergeContentTypes) if mergeContentTypes else "" fName = "full-prep-%s-%s-%s-%s.json" % ( contentType, collectionName, mergeS, styleType) if self.__exportFlag: self.__logDocumentOrder(docList) fPath = os.path.join(self.__outputPath, fName) self.__mU.doExport(fPath, docList, fmt="json", indent=3) logger.debug("Exported %r", fPath) # if self.__diffFlag: fPath = os.path.join(self.__savedOutputPath, fName) refDocList = self.__mU.doImport(fPath, fmt="json") self.assertEqual(len(refDocList), len(docList)) logger.debug("For %s %s len refDocList %d", contentType, collectionName, len(refDocList)) logger.debug("For %s %s len docList %d", contentType, collectionName, len(docList)) jD = diff(refDocList, docList, syntax="explicit", marshal=True) if jD: _, fn = os.path.split(fPath) bn, _ = os.path.splitext(fn) fPath = os.path.join(self.__outputPath, bn + "-diff.json") logger.debug("jsondiff for %s %s = \n%s", contentType, collectionName, pprint.pformat(jD, indent=3, width=100)) self.__mU.doExport(fPath, jD, fmt="json", indent=3) self.assertEqual(len(jD), 0) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
class RepositoryProviderTests(unittest.TestCase):
    def setUp(self):
        #
        #
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(mockTopPath, "config", "dbload-setup-example.yml")
        self.__configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=self.__configName, mockTopPath=mockTopPath)
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__numProc = 2
        self.__chunkSize = 20
        self.__fileLimit = None
        #
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testRepoUtils(self):
        """Test case - repository locator path utilities"""
        for contentType in ["bird_chem_comp_core", "pdbx_core", "ihm_dev"]:
            mergeContentTypes = None
            if contentType in ["pdbx_core"]:
                mergeContentTypes = ["vrpt"]
            #
            locatorObjList = self.__rpP.getLocatorObjList(contentType=contentType, mergeContentTypes=mergeContentTypes)
            pathList = self.__rpP.getLocatorPaths(locatorObjList)
            locatorObjList2 = self.__rpP.getLocatorsFromPaths(locatorObjList, pathList)
            logger.info("%s pathList length %d", contentType, len(pathList))
            self.assertEqual(len(locatorObjList), len(pathList))
            self.assertEqual(len(locatorObjList), len(locatorObjList2))
        #
        for contentType in ["bird_chem_comp_core", "pdbx_core", "ihm_dev"]:
            mergeContentTypes = None
            if contentType in ["pdbx_core"]:
                mergeContentTypes = ["vrpt"]
            #
            locatorObjList = self.__rpP.getLocatorObjList(contentType=contentType, mergeContentTypes=mergeContentTypes)
            pathList = self.__rpP.getLocatorPaths(locatorObjList)
            self.assertEqual(len(locatorObjList), len(pathList))
            #
            lCount = len(pathList)
            idCodes = self.__rpP.getLocatorIdcodes(contentType, locatorObjList)
            self.assertEqual(len(locatorObjList), len(idCodes))
            excludeList = idCodes[:int(len(idCodes) / 2)]
            logger.debug("excludeList (%d) %r", len(excludeList), excludeList)
            fL = self.__rpP.getLocatorObjList(contentType=contentType, mergeContentTypes=mergeContentTypes, excludeIds=excludeList)
            logger.debug("fL (%d)", len(fL))
            self.assertEqual(lCount, len(fL) + len(excludeList))
class DictMethodRunnerTests(unittest.TestCase):
    def setUp(self):
        self.__export = True
        self.__numProc = 2
        self.__fileLimit = 200
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        configPath = os.path.join(mockTopPath, "config", "dbload-setup-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath)
        #
        self.__testCaseList = [
            {"contentType": "pdbx_core", "mockLength": 50, "mergeContent": ["vrpt"]},
            {"contentType": "bird_chem_comp_core", "mockLength": 17, "mergeContent": None},
        ]
        #
        self.__modulePathMap = self.__cfgOb.get("DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def __runContentType(self, contentType, mockLength, mergeContent):
        """Read and process test fixture data files from the input content type."""
        try:
            dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=True)
            dictApi = dP.getApiByName(contentType)
            rP = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP)
            locatorObjList = self.__rpP.getLocatorObjList(contentType=contentType, mergeContentTypes=mergeContent)
            containerList = self.__rpP.getContainerList(locatorObjList)
            #
            logger.debug("Length of locator list %d\n", len(locatorObjList))
            self.assertGreaterEqual(len(locatorObjList), mockLength)
            for container in containerList:
                cName = container.getName()
                #
                # if cName not in ["1B5F"]:
                #    continue
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output", cName + "-with-method.cif")
                    self.__mU.doExport(savePath, [container], fmt="mmcif")
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testMethodRunner(self):
        """Test method runner for multiple content types."""
        for tD in self.__testCaseList:
            self.__runContentType(tD["contentType"], tD["mockLength"], tD["mergeContent"])

    def testMethodRunnerSetup(self):
        """Test the setup methods for method runner class"""
        try:
            dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=True)
            dictApi = dP.getApiByName("pdbx")
            rP = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP)
            ok = dmh is not None
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
class NeighborInteractionProvider(object): """Generators and accessors for non-polymer instance target interactions.""" def __init__(self, cfgOb, configName, cachePath, **kwargs): # self.__version = __version__ self.__cfgOb = cfgOb self.__configName = configName self.__cachePath = cachePath self.__fileLimit = kwargs.get("fileLimit", None) self.__dirPath = os.path.join(cachePath, "neighbor-interactions") self.__numProc = kwargs.get("numProc", 2) self.__chunkSize = kwargs.get("chunkSize", 10) useCache = kwargs.get("useCache", True) # # - Configuration for stash services - # Local target directory name to be stashed. (subdir of dirPath) # self.__stashDir = "ligand-target-neighbors" # self.__mU = MarshalUtil(workPath=self.__dirPath) self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath) self.__neighborD = self.__reload(fmt="pickle", useCache=useCache) # def testCache(self, minCount=0): try: if minCount == 0: return True if self.__neighborD and minCount and len( self.__neighborD["entries"]) >= minCount: logger.info( "Target neighbor data for (%d) entries created %r version %r", len(self.__neighborD["entries"]), self.__neighborD["created"], self.__neighborD["version"]) return True except Exception: pass return False def getLigandNeighborIndex(self, entryId): """Return the target neighbors for the non-polymer instances for the input entry. Args: entryId (str): entry identifier Returns: (dict): {ligandAsymId: {(targetAsymId, targetAuthSeqId): nnIndex1, (): nnIndex2} """ try: return self.__neighborD["entries"][ entryId.upper()]["ligandNeighborIndexD"] except Exception: pass return {} def getTargetNeighborIndex(self, entryId): """Return the ligand neighbors for the polymer or branched entity instances in the input entry. Args: entryId (str): entry identifier Returns: (dict): {(targetAsymId, targetAuthSeqId): {(ligandAsymId): nnIndex1, (): nnIndex2} """ try: return self.__neighborD["entries"][ entryId.upper()]["targetNeighborIndexD"] except Exception: pass return {} def getNearestNeighborList(self, entryId): """Return the list of neares neighbors for the entry. Args: entryId (str): entry identifier Returns: list: [LigandTargetInstance(), ...] """ try: return self.__neighborD["entries"][ entryId.upper()]["nearestNeighbors"] except Exception: pass return [] def getLigandNeighborBoundState(self, entryId): """Return the dicitonary of ligand instances with isBound boolean status. Args: entryId (str): entry identifier Returns: (dict): {ligandAsymId: True if isBound, ... } """ try: return self.__neighborD["entries"][ entryId.upper()]["ligandIsBoundD"] except Exception: pass return {} def getAtomCounts(self, entryId): """Return the non-polymer instance atom counts for the input entry (all reported atoms). Args: entryId (str): entry identifier Returns: (dict): {asymId: {'FL': count, 'altA': count, 'altB': count, ... }} """ try: return self.__neighborD["entries"][ entryId.upper()]["ligandAtomCountD"] except Exception: pass return {} def getHydrogenAtomCounts(self, entryId): """Return the non-polymer instance hydrogen atom counts for the input entry. Args: entryId (str): entry identifier Returns: (dict): {asymId: {'FL': count, 'altA': count, 'altB': count, ... }} """ try: return self.__neighborD["entries"][ entryId.upper()]["ligandHydrogenAtomCountD"] except Exception: pass return {} def hasEntry(self, entryId): """Return if the input entry is stored in the cache of non-polymer instance target interactions. 
Args: entryId (str): entry identifier Returns: (bool): True if entry is in the cache or False otherwise """ try: return entryId in self.__neighborD["entries"] except Exception: pass return False def getEntries(self): """Return a list of entry identifier for which non-polymer instance target interactions are stored. Returns: (list): [entryId, entryId, ... ] """ try: return list(self.__neighborD["entries"].keys()) except Exception: pass return [] def generate(self, distLimit=5.0, updateOnly=False, fmt="pickle", indent=0): """Generate and export non-polymer target interactions for all of the structures in the repository. Args: distLimit (float, optional): interaction distance. Defaults to 5.0. updateOnly (bool): only calculate interactions for new entries. Defaults to False. fmt (str, optional): export file format. Defaults to "pickle". indent (int, optional): json format indent. Defaults to 0. Returns: bool: True for success or False otherwise """ ok = False try: tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime()) tD = self.__calculateNeighbors(distLimit=distLimit, numProc=self.__numProc, chunkSize=self.__chunkSize, updateOnly=updateOnly) self.__neighborD = { "version": self.__version, "created": tS, "entries": tD } kwargs = { "indent": indent } if fmt == "json" else { "pickleProtocol": 4 } targetFilePath = self.__getTargetFilePath(fmt=fmt) ok = self.__mU.doExport(targetFilePath, self.__neighborD, fmt=fmt, **kwargs) logger.info("Wrote %r status %r", targetFilePath, ok) except Exception as e: logger.exception("Failing with %s", str(e)) return ok def reload(self, fmt="pickle"): self.__neighborD = self.__reload(fmt=fmt, useCache=True) return self.__neighborD is not None def __reload(self, fmt="pickle", useCache=True): """Reload from the current cache file.""" try: targetFilePath = self.__getTargetFilePath(fmt=fmt) tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime()) neighborD = { "version": self.__version, "created": tS, "entries": {} } logger.debug("useCache %r targetFilePath %r", useCache, targetFilePath) # if useCache and self.__mU.exists(targetFilePath): neighborD = self.__mU.doImport(targetFilePath, fmt=fmt) if fmt != "pickle": for _, nD in neighborD["entries"].items(): nD["nearestNeighbors"] = [ LigandTargetInstance(*neighbor) for neighbor in nD["nearestNeighbors"] ] except Exception as e: logger.exception("Failing with %s", str(e)) # return neighborD def __getTargetFilePath(self, fmt="pickle"): ext = "pic" if fmt == "pickle" else "json" pth = os.path.join(self.__dirPath, "ligand-target-neighbors", "neighbor-data." + ext) return pth def __calculateNeighbors(self, distLimit=5.0, numProc=2, chunkSize=10, updateOnly=False): """Calculate non-polymer target interactions for all repository structure files. Args: distLimit (float, optional): interaction distance limit. Defaults to 5.0. numProc (int, optional): number of processes to use. Defaults to 2. chunkSize (int, optional): incremental chunk size used for distribute work processes. Defaults to 10. Returns: (dict): {entryId: {asymId: [TargetLigandInteraction()], ...}, ...} """ contentType = "pdbx" mergeContent = None rD = {} exD = {} # # updateOnly - will reuse any existing data loaded when this is instantiated # otherwise the cache context is cleared before the calculation. 
if updateOnly: exD = {k: True for k in self.getEntries()} rD = self.__neighborD[ "entries"] if "entries" in self.__neighborD else {} # locatorObjList = self.__rpP.getLocatorObjList( contentType=contentType, mergeContentTypes=mergeContent, excludeIds=exD) logger.info("Starting with %d numProc %d updateOnly (%r)", len(locatorObjList), self.__numProc, updateOnly) # rWorker = TargetInteractionWorker(self.__rpP) mpu = MultiProcUtil(verbose=True) optD = {"distLimit": distLimit} mpu.setOptions(optD) mpu.set(workerObj=rWorker, workerMethod="build") ok, failList, resultList, _ = mpu.runMulti(dataList=locatorObjList, numProc=numProc, numResults=1, chunkSize=chunkSize) if failList: logger.info("Target interaction build failures (%d): %r", len(failList), failList) # for (entryId, nD) in resultList[0]: rD[entryId] = nD # logger.info( "Completed with multi-proc status %r failures %r total entries with data (%d)", ok, len(failList), len(rD)) return rD def toStash(self): ok = False try: userName = self.__cfgOb.get("_STASH_AUTH_USERNAME", sectionName=self.__configName) password = self.__cfgOb.get("_STASH_AUTH_PASSWORD", sectionName=self.__configName) basePath = self.__cfgOb.get("_STASH_SERVER_BASE_PATH", sectionName=self.__configName) url = self.__cfgOb.get("STASH_SERVER_URL", sectionName=self.__configName) urlFallBack = self.__cfgOb.get("STASH_SERVER_FALLBACK_URL", sectionName=self.__configName) ok = self.__toStash(url, basePath, userName=userName, password=password) ok = self.__toStash(urlFallBack, basePath, userName=userName, password=password) except Exception as e: logger.exception("Failing with %s", str(e)) return ok def __toStash(self, url, stashRemoteDirPath, userName=None, password=None, remoteStashPrefix=None): """Copy tar and gzipped bundled cache data to remote server/location. Args: url (str): server URL (e.g. sftp://hostname.domain) None for local host stashRemoteDirPath (str): path to target directory on remote server userName (str, optional): server username. Defaults to None. password (str, optional): server password. Defaults to None. remoteStashPrefix (str, optional): channel prefix. Defaults to None. 
Returns: (bool): True for success or False otherwise """ ok = False try: stU = StashUtil(os.path.join(self.__dirPath, "stash"), "ligand-target-neighbors") ok = stU.makeBundle(self.__dirPath, [self.__stashDir]) if ok: ok = stU.storeBundle(url, stashRemoteDirPath, remoteStashPrefix=remoteStashPrefix, userName=userName, password=password) except Exception as e: logger.error("Failing with url %r stashDirPath %r: %s", url, stashRemoteDirPath, str(e)) return ok def fromStash(self): try: minCount = 10 userName = self.__cfgOb.get("_STASH_AUTH_USERNAME", sectionName=self.__configName) password = self.__cfgOb.get("_STASH_AUTH_PASSWORD", sectionName=self.__configName) basePath = self.__cfgOb.get("_STASH_SERVER_BASE_PATH", sectionName=self.__configName) url = self.__cfgOb.get("STASH_SERVER_URL", sectionName=self.__configName) # ok = self.__fromStash(url, basePath, userName=userName, password=password) ok = self.reload() ok = self.testCache(minCount=minCount) if not ok: urlFallBack = self.__cfgOb.get("STASH_SERVER_FALLBACK_URL", sectionName=self.__configName) ok = self.__fromStash(urlFallBack, basePath, userName=userName, password=password) ok = self.testCache(minCount=minCount) ok = self.reload() except Exception as e: logger.exception("Failing with %s", str(e)) return ok def __fromStash(self, url, stashRemoteDirPath, userName=None, password=None, remoteStashPrefix=None): """Restore the local cache from a tar and gzipped bundle fetched from a remote server/location. Args: url (str): server URL (e.g. sftp://hostname.domain) None for local host stashRemoteDirPath (str): path to target directory on remote server userName (str, optional): server username. Defaults to None. password (str, optional): server password. Defaults to None. remoteStashPrefix (str, optional): channel prefix. Defaults to None. Returns: (bool): True for success or False otherwise """ ok = False try: stU = StashUtil(os.path.join(self.__dirPath, "stash"), "ligand-target-neighbors") ok = stU.fetchBundle(self.__dirPath, url, stashRemoteDirPath, remoteStashPrefix=remoteStashPrefix, userName=userName, password=password) except Exception as e: logger.error("Failing with url %r stashDirPath %r: %s", url, stashRemoteDirPath, str(e)) return ok def convert(self, fmt1="json", fmt2="pickle"): # targetFilePath = self.__getTargetFilePath(fmt=fmt1) self.__neighborD = self.__mU.doImport(targetFilePath, fmt=fmt1) # targetFilePath = self.__getTargetFilePath(fmt=fmt2) ok = self.__mU.doExport(targetFilePath, self.__neighborD, fmt=fmt2, pickleProtocol=4) return ok
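# --- Illustrative usage (not part of the original module) -------------------------------------
# A minimal sketch of how NeighborInteractionProvider might be driven, assuming the caller
# supplies a populated ConfigUtil instance (cfgOb), a configuration section name (configName),
# and a writable cache directory (cachePath). The helper name below is hypothetical.
def _exampleNeighborInteractionUsage(cfgOb, configName, cachePath):
    """Hypothetical driver sketch for NeighborInteractionProvider."""
    niP = NeighborInteractionProvider(cfgOb, configName, cachePath, useCache=True, numProc=2, chunkSize=10)
    if not niP.testCache(minCount=10):
        # (Re)generate interactions within 5.0 Angstroms and export the pickle cache file.
        niP.generate(distLimit=5.0, updateOnly=False, fmt="pickle")
    for entryId in niP.getEntries():
        boundD = niP.getLigandNeighborBoundState(entryId)
        logger.debug("Entry %s bound ligand instances %r", entryId, [k for k, v in boundD.items() if v])
    return niP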
class ScanRepoUtil(object): """Tools for scanning repositories and collecting coverage and data type information.""" def __init__(self, cfgOb, attributeDataTypeD=None, numProc=4, chunkSize=15, fileLimit=None, maxStepLength=2000, workPath=None): """ Args: cfgOb (object): Configuration object (rcsb.utils.config.ConfigUtil) attributeDataTypeD (dict, optional): supporting data type details {categoryName: {attributeName: dataType, ...}, ...} numProc (int, optional): Number of parallel worker processes used. chunkSize (int, optional): Size of files processed in a single multi-proc process fileLimit (int, optional): maximum number of files scanned or None for no limit maxStepLength (int, optional): maximum number of file paths processed in a single multi-proc invocation workPath (str, optional): Path to working/cache directory or None """ # self.__attributeDataTypeD = attributeDataTypeD if attributeDataTypeD else {} # Limit the load length of each file type for testing - Set to None to remove - self.__fileLimit = fileLimit self.__maxStepLength = maxStepLength # # Controls for multiprocessing execution - self.__numProc = numProc self.__chunkSize = chunkSize # self.__cfgOb = cfgOb # self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s" self.__workPath = workPath self.__mU = MarshalUtil(workPath=self.__workPath) self.__rpP = RepositoryProvider(self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__workPath) def scanContentType(self, contentType, mergeContentTypes=None, scanType="full", inputPathList=None, scanDataFilePath=None, failedFilePath=None, saveInputFileListPath=None): """Driver method for the repository scan operation Args: contentType (str): one of 'bird', 'bird_family', 'bird_chem_comp', 'chem_comp', 'pdbx' mergeContentTypes (list, optional): repository content types merged with the primary content type scanType (str, optional): 'full' [or 'incr' to be supported] inputPathList (list, optional): list of input file paths to scan scanDataFilePath (str, optional): file path for serialized scan data (Pickle format) failedFilePath (str, optional): file path for the list of files that fail the scanning operation saveInputFileListPath (str, optional): Path to store the file path list that is scanned Returns: bool: True for success or False otherwise """ try: startTime = self.__begin(message="scanning operation") # locatorObjList = self.__rpP.getLocatorObjList( contentType=contentType, inputPathList=inputPathList, mergeContentTypes=mergeContentTypes) # if saveInputFileListPath: self.__mU.doExport(saveInputFileListPath, self.__rpP.getLocatorPaths(locatorObjList), fmt="list") logger.debug("Saving %d paths in %s", len(locatorObjList), saveInputFileListPath) # optD = {} optD["contentType"] = contentType optD["logSize"] = True optD["scanType"] = scanType # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- - # numProc = self.__numProc chunkSize = self.__chunkSize if locatorObjList and self.__chunkSize < len( locatorObjList) else 0 # # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- - numPaths = len(locatorObjList) logger.debug("Processing %d total paths", numPaths) numProc = min(numProc, numPaths) maxStepLength = self.__maxStepLength if numPaths > maxStepLength: numLists = int(numPaths / maxStepLength) subLists = [ locatorObjList[i::numLists] for i in range(numLists) ] else: subLists = [locatorObjList] # if subLists: logger.debug( "Starting with numProc %d outer subtask count %d subtask length ~ %d", numProc, len(subLists), len(subLists[0])) # numResults = 1 failList = [] retLists = [[] for ii in range(numResults)] diagList
= [] for ii, subList in enumerate(subLists): logger.info("Running outer subtask %d of %d length %d", ii + 1, len(subLists), len(subList)) # mpu = MultiProcUtil(verbose=True) mpu.setOptions(optionsD=optD) mpu.set(workerObj=self, workerMethod="scanWorker") ok, failListT, retListsT, diagListT = mpu.runMulti( dataList=subList, numProc=numProc, numResults=numResults, chunkSize=chunkSize) failList.extend(failListT) # retLists is a list of lists - logger.debug("status %r fail len %r ret len %r", ok, len(failListT), len(retListsT)) for jj in range(numResults): retLists[jj].extend(retListsT[jj]) diagList.extend(diagListT) logger.debug("Scan failed path list %r", failList) logger.debug( "Scan path list length %d failed path list length %d", len(locatorObjList), len(failList)) logger.debug("Returned metadata length %r", len(retLists[0])) # if failedFilePath and failList: wOk = self.__mU.doExport(failedFilePath, self.__rpP.getLocatorPaths(failList), fmt="list") logger.debug("Writing scan failure path list to %s status %r", failedFilePath, wOk) # if scanType == "incr": scanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle", default=None) logger.debug("Imported scan data with keys %r", list(scanDataD.keys())) else: scanDataD = {} # if scanDataFilePath and retLists[0]: for ssTup in retLists[0]: cId = ssTup.containerId if scanType == "full" and cId in scanDataD: logger.error("Duplicate container id %s in %r and %r", cId, ssTup.fromPath, scanDataD[cId].fromPath) # scanDataD[cId] = ssTup ok = self.__mU.doExport(scanDataFilePath, scanDataD, fmt="pickle") tscanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle") ok = tscanDataD == scanDataD self.__end(startTime, "scanning operation with status " + str(ok)) # return ok except Exception as e: logger.exception("Failing with %s", str(e)) return False def evalScan(self, scanDataFilePath, evalJsonFilePath, evalType="data_type"): scanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle") rD = {} if evalType in ["data_type"]: rD = self.__evalScanDataType(scanDataD) elif evalType in ["data_coverage"]: rD, _ = self.__evalScanDataCoverage(scanDataD) else: logger.debug("Unknown evalType %r", evalType) ok = self.__mU.doExport(evalJsonFilePath, rD, fmt="json") return ok def evalScanItem(self, scanDataFilePath, evalFilePath): scanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle") _, cL = self.__evalScanDataCoverage(scanDataD) ok = self.__mU.doExport(evalFilePath, cL, fmt="list") return ok def __evalScanDataType(self, scanDataD): """ ScanValue = collections.namedtuple('ScanValue', 'containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec') ScanSummary = collections.namedtuple('ScanSummary', 'containerId, fromPath, scanDate, scanCategoryDict') """ # for populated sD[category] -> d[atName]->{minWidth: , maxWidth:, minPrec:, maxPrec: , count} sD = {} for cId in scanDataD: ssTup = scanDataD[cId] dD = ssTup.scanCategoryDict for catName in dD: if catName not in sD: sD[catName] = {} for svTup in dD[catName]: if svTup.atName not in sD[catName]: sD[catName][svTup.atName] = { "minWidth": svTup.minWidth, "maxWidth": svTup.maxWidth, "minPrec": svTup.minPrec, "maxPrec": svTup.maxPrec, "count": 1 } continue sD[catName][svTup.atName]["minWidth"] = min( sD[catName][svTup.atName]["minWidth"], svTup.minWidth) sD[catName][svTup.atName]["maxWidth"] = max( sD[catName][svTup.atName]["maxWidth"], svTup.maxWidth) sD[catName][svTup.atName]["minPrec"] = min( sD[catName][svTup.atName]["minPrec"], svTup.minPrec) sD[catName][svTup.atName]["maxPrec"] = max(
sD[catName][svTup.atName]["maxPrec"], svTup.maxPrec) sD[catName][svTup.atName]["count"] += 1 return sD def __evalScanDataCoverage(self, scanDataD): """ ScanValue = collections.namedtuple('ScanValue', 'containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec') ScanSummary = collections.namedtuple('ScanSummary', 'containerId, fromPath, scanDate, scanCategoryDict') """ # for populated sD[category] -> d[atName]->{count: #, instances: [id,id,id]} sD = {} for cId in scanDataD: ssTup = scanDataD[cId] dD = ssTup.scanCategoryDict for catName in dD: if catName not in sD: sD[catName] = {} for svTup in dD[catName]: if svTup.atName not in sD[catName]: sD[catName][svTup.atName] = { "count": 0, "instances": [] } sD[catName][svTup.atName]["instances"].append( svTup.containerId) sD[catName][svTup.atName]["count"] += 1 cL = [] for catName, aD in sD.items(): for atName, tD in aD.items(): cL.append("%s\t%s" % ("_" + catName + "." + atName, tD["count"])) return sD, cL def scanWorker(self, dataList, procName, optionsD, workingDir): """Multi-proc worker method for scanning repository data files.""" try: _ = workingDir startTime = self.__begin(message=procName) # Recover common options scanType = optionsD["scanType"] contentType = optionsD["contentType"] # successList = [] retList = [] containerList = self.__getContainerList(dataList) for container in containerList: ret = self.__scanContainer(container) successList.append(ret.fromPath) retList.append(ret) # logger.debug( "%s scanType %s contentType %s pathlist length %d containerList length %d", procName, scanType, contentType, len(dataList), len(containerList)) ok = len(successList) == len(dataList) # self.__end(startTime, procName + " with status " + str(ok)) return successList, retList, [] except Exception as e: logger.error("Failing with dataList %r", dataList) logger.exception("Failing with %s", str(e)) return [], [], [] def __getContainerList(self, locatorObjList): """Return containers for the input locators, each annotated with an rcsb_load_status category.""" utcnow = datetime.datetime.utcnow() ts = utcnow.strftime("%Y-%m-%d:%H:%M:%S") cL = [] for loc in locatorObjList: myContainerList = self.__rpP.getContainerList([loc]) lPathL = self.__rpP.getLocatorPaths([loc]) for cA in myContainerList: dc = DataCategory("rcsb_load_status", ["name", "load_date", "locator"], [[cA.getName(), ts, lPathL[0]]]) logger.debug("data category %r", dc) cA.append(dc) cL.append(cA) return cL def __scanContainer(self, container): """Scan the categories in the input container and collect value width and precision statistics for each attribute.""" cName = container.getName() loadStatusObj = container.getObj("rcsb_load_status") lName = loadStatusObj.getValue(attributeName="name", rowIndex=0) lFilePath = loadStatusObj.getValue(attributeName="locator", rowIndex=0) lDate = loadStatusObj.getValue(attributeName="load_date", rowIndex=0) # oD = {} for objName in container.getObjNameList(): if objName == "rcsb_load_status": continue obj = container.getObj(objName) afD = self.__attributeDataTypeD[ objName] if objName in self.__attributeDataTypeD else {} atNameList = obj.getAttributeList() wMin = {atName: 100000 for atName in atNameList} wMax = {atName: -1 for atName in atNameList} pMin = {atName: 100000 for atName in atNameList} pMax = {atName: -1 for atName in atNameList} for row in obj.getRowList(): for ii, val in enumerate(row): valLen = len(val) if (valLen == 0) or (val == "?") or (val == "."): continue atName = atNameList[ii] wMin[atName] = min(wMin[atName], valLen) wMax[atName] = max(wMax[atName], valLen) if atName in afD and afD[atName] == "float": vPrec = 0 try:
fields = val.split(".") vPrec = len(fields[1]) pMin[atName] = min(pMin[atName], vPrec) pMax[atName] = max(pMax[atName], vPrec) except Exception as e: logger.debug("Failed to process float %s %r %r %s", atName, val, vPrec, str(e)) pMin[atName] = 0 pMax[atName] = 0 logger.debug("Got float for %s %r %r", atName, val, vPrec) else: pMin[atName] = 0 pMax[atName] = 0 # ScanValue - containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec oD[objName] = [ ScanValue(cName, objName, atN, wMin[atN], wMax[atN], pMin[atN], pMax[atN]) for atN in wMax if wMax[atN] != -1 ] # ScanSummary - containerId, fromPath, scanDate, scanCategoryDict # ret = ScanSummary(lName, lFilePath, lDate, oD) # return ret def __begin(self, message=""): startTime = time.time() ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime()) logger.debug("Starting %s at %s", message, ts) return startTime def __end(self, startTime, message=""): endTime = time.time() ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime()) delta = endTime - startTime logger.debug("Completed %s at %s (%.4f seconds)", message, ts, delta)
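# --- Illustrative usage (not part of the original module) -------------------------------------
# A minimal sketch of how ScanRepoUtil might be driven end to end, assuming the caller supplies
# a populated ConfigUtil instance (cfgOb) and a writable work directory (workPath). The helper
# name and the output file names below are hypothetical.
def _exampleScanRepoUsage(cfgOb, workPath):
    """Hypothetical driver sketch: scan a content type, then evaluate attribute data types."""
    scanUtil = ScanRepoUtil(cfgOb, attributeDataTypeD={}, numProc=4, chunkSize=15, workPath=workPath)
    scanDataFilePath = os.path.join(workPath, "scan-data.pic")
    ok = scanUtil.scanContentType(
        "chem_comp",
        scanType="full",
        scanDataFilePath=scanDataFilePath,
        failedFilePath=os.path.join(workPath, "scan-failures.list"),
    )
    if ok:
        # Summarize observed value widths and precisions per category attribute as JSON.
        ok = scanUtil.evalScan(scanDataFilePath, os.path.join(workPath, "scan-data-type.json"), evalType="data_type")
    return ok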