    def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__updateId = "2018_25"
        self.__export = False
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
        #
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
    def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, readBackCheck=False, documentLimit=None, doValidate=False, verbose=False):
        self.__cfgOb = cfgOb
        self.__cachePath = cachePath
        self.__useCache = useCache
        self.__readBackCheck = readBackCheck
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        self.__documentLimit = documentLimit
        #
        self.__resourceName = "MONGO_DB"
        self.__verbose = verbose
        self.__statusList = []
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=self.__useCache)
        self.__docHelper = DocumentDefinitionHelper(cfgOb=self.__cfgOb)
        self.__valInst = None
        self.__doValidate = doValidate
    def setUp(self):
        self.__verbose = True
        self.__numProc = 2
        self.__fileLimit = 100
        self.__chunkSize = 0
        self.__workPath = os.path.join(HERE, "test-output")
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName)
        self.__resourceName = "CRATE_DB"
        self.__schP = SchemaProvider(self.__cfgOb, self.__workPath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__workPath)
        #
        self.__tableIdSkipD = {"ATOM_SITE": True, "ATOM_SITE_ANISOTROP": True, "__LOAD_STATUS__": True}
        self.__ioObj = IoAdapter(verbose=self.__verbose)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
    def __init__(
        self,
        cfgOb,
        cachePath,
        resourceName="MONGO_DB",
        numProc=4,
        chunkSize=15,
        documentLimit=None,
        verbose=False,
        readBackCheck=False,
        maxStepLength=2000,
        schemaRebuildFlag=False,
    ):
        self.__verbose = verbose
        #
        # Limit the load length of each file type for testing - set to None to remove -
        self.__documentLimit = documentLimit
        self.__maxStepLength = maxStepLength
        #
        # Controls for multiprocessing execution -
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        #
        self.__cfgOb = cfgOb
        self.__resourceName = resourceName
        #
        self.__cachePath = cachePath if cachePath else "."
        self.__schP = SchemaProvider(cfgOb, cachePath, useCache=True, rebuildFlag=schemaRebuildFlag)
        #
        self.__readBackCheck = readBackCheck
        self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s"
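    # Usage sketch (an assumption, not part of the original file): this constructor
    # follows the MongoDB loader pattern used elsewhere in rcsb.db, so a caller would
    # typically wire it up from a ConfigUtil instance. The class name "PdbxLoader"
    # below is a hypothetical stand-in for the enclosing class, which is not visible
    # in this fragment:
    #
    #   cfgOb = ConfigUtil(configPath=configPath, defaultSectionName="site_info_configuration")
    #   ldr = PdbxLoader(cfgOb, cachePath="CACHE", numProc=2, chunkSize=10, documentLimit=100)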
    def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__updateId = "2018_25"
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        #
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
        #
        self.__dataSetId = "2018_23"
        self.__pathClusterData = self.__cfgOb.getPath("RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=configName)
        # self.__levels = ['100', '95', '90', '70', '50', '30']
        self.__levels = ["100"]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
class SchemaDefLoaderDbTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(SchemaDefLoaderDbTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        self.__verbose = True
        #
        fileLimit = 100
        numProc = 2
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__workPath = os.path.join(HERE, "test-output")
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath)
        self.__resourceName = "MYSQL_DB"
        #
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=numProc, fileLimit=fileLimit, cachePath=self.__cachePath)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def __schemaCreate(self, schemaDefObj):
        """Create table schema using schema definition."""
        try:
            tableIdList = schemaDefObj.getSchemaIdList()
            sqlGen = SqlGenAdmin(self.__verbose)
            sqlL = sqlGen.createDatabaseSQL(schemaDefObj.getDatabaseName())
            for tableId in tableIdList:
                tableDefObj = schemaDefObj.getSchemaObject(tableId)
                sqlL.extend(sqlGen.createTableSQL(databaseName=schemaDefObj.getDatabaseName(), tableDefObj=tableDefObj))
            logger.debug("Schema creation SQL string\n %s\n\n", "\n".join(sqlL))
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                myQ = MyDbQuery(dbcon=client, verbose=self.__verbose)
                #
                # Permit warnings to support "drop table if exists" for missing tables.
                #
                myQ.setWarning("ignore")
                ret = myQ.sqlCommand(sqlCommandList=sqlL)
                logger.debug("\n\n+INFO mysql server returns %r\n", ret)
                self.assertTrue(ret)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    # ---------------------------------------------------------------------------------------------------------
    def testSchemaCreate(self):
        """Create table schema for BIRD, chemical component, and PDBx data."""
        cD = self.__schP.makeSchemaDef("bird", dataTyping="SQL", saveSchema=True)
        sd = SchemaDefAccess(cD)
        self.__schemaCreate(sd)
        #
        cD = self.__schP.makeSchemaDef("chem_comp", dataTyping="SQL", saveSchema=True)
        sd = SchemaDefAccess(cD)
        self.__schemaCreate(sd)
        #
        # cD = self.__schP.makeSchemaDef("pdbx", dataTyping="SQL", saveSchema=True)
        # sd = SchemaDefAccess(cD)
        # self.__schemaCreate(sd)

    def testLoadBirdReference(self):
        try:
            cD = self.__schP.makeSchemaDef("bird", dataTyping="SQL", saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)
            inputPathList = self.__rpP.getLocatorObjList(contentType="bird")
            inputPathList.extend(self.__rpP.getLocatorObjList(contentType="bird_family"))
            #
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                ok = sdl.load(inputPathList=inputPathList, loadType="batch-file")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReLoadBirdReference(self):
        try:
            cD = self.__schP.makeSchemaDef("bird", dataTyping="SQL", saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)
            inputPathList = self.__rpP.getLocatorObjList(contentType="bird")
            inputPathList.extend(self.__rpP.getLocatorObjList(contentType="bird_family"))
            #
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                sdl.load(inputPathList=inputPathList, loadType="batch-file")
                #
                logger.debug("INFO BATCH FILE RELOAD TEST --------------------------------------------\n")
                ok = sdl.load(inputPathList=inputPathList, loadType="batch-file", deleteOpt="all")
                self.assertTrue(ok)
                #
                logger.debug("\n\n\n+INFO BATCH INSERT RELOAD TEST --------------------------------------------\n")
                ok = sdl.load(inputPathList=inputPathList, loadType="batch-file", deleteOpt="selected")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadChemCompReference(self):
        try:
            cD = self.__schP.makeSchemaDef("chem_comp", dataTyping="SQL", saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)
            inputPathList = self.__rpP.getLocatorObjList(contentType="chem_comp")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                ok = sdl.load(inputPathList=inputPathList, loadType="batch-file")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skip("Disable test - schema not optimized for mysql limitations")
    def testLoadPdbxFiles(self):
        try:
            cD = self.__schP.makeSchemaDef("pdbx", dataTyping="SQL", saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)
            inputPathList = self.__rpP.getLocatorObjList(contentType="pdbx")
            logger.debug("Input path list %r", inputPathList)
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                ok = sdl.load(inputPathList=inputPathList, loadType="batch-insert", deleteOpt="all")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
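# A named-suite helper in the style used across rcsb.db test modules (a sketch;
# the original module may group these tests differently):
def schemaLoadSuite():
    suiteSelect = unittest.TestSuite()
    suiteSelect.addTest(SchemaDefLoaderDbTests("testSchemaCreate"))
    suiteSelect.addTest(SchemaDefLoaderDbTests("testLoadBirdReference"))
    suiteSelect.addTest(SchemaDefLoaderDbTests("testReLoadBirdReference"))
    suiteSelect.addTest(SchemaDefLoaderDbTests("testLoadChemCompReference"))
    return suiteSelect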
class ObjectValidator(object):
    """Utilities to extract and update objects from the document object server with validation."""

    def __init__(self, cfgOb, objectAdapter=None, cachePath=".", useCache=True, **kwargs):
        self.__cfgOb = cfgOb
        self.__oAdapt = objectAdapter
        self.__resourceName = "MONGO_DB"
        _ = kwargs
        self.__statusList = []
        self.__schP = SchemaProvider(self.__cfgOb, cachePath, useCache=useCache)
        self.__valInst = None

    def __getValidator(self, databaseName, collectionName, schemaLevel="full"):
        _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
        cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True)
        # Raises exceptions for schema compliance.
        Draft4Validator.check_schema(cD)
        valInst = Draft4Validator(cD, format_checker=FormatChecker())
        return valInst

    def __validateObj(self, databaseName, collectionName, rObj, label=""):
        eCount = 0
        try:
            tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
            for error in sorted(self.__valInst.iter_errors(rObj), key=str):
                logger.info("Database %s collection %s (%s %r) path %s error: %s", databaseName, collectionName, label, tId, error.path, error.message)
                logger.debug(">>> Failing object is %r", rObj)
                eCount += 1
        except Exception as e:
            logger.exception("Validation failing %s", str(e))
        return eCount

    def doTransform(self, **kwargs):
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        #
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("selectionQuery", {})
        fetchLimit = kwargs.get("fetchLimit", None)
        #
        tU = TimeUtil()
        updateId = kwargs.get("updateId", tU.getCurrentWeekSignature())
        #
        docSelectList = self.__selectObjectIds(databaseName, collectionName, selectionQueryD)
        docSelectList = docSelectList[:fetchLimit] if fetchLimit else docSelectList
        ok = self.__transform(databaseName, collectionName, docSelectList)
        #
        okS = True
        if updateId:
            okS = self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
        return ok and okS

    def __selectObjectIds(self, databaseName, collectionName, selectionQueryD):
        """Return a list of object identifiers for the input selection query."""
        dL = []
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName, collectionName, mg.count(databaseName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    selectL = ["_id"]
                    dL = mg.fetch(databaseName, collectionName, selectL, queryD=qD)
                    logger.info("Selection %r fetch result count %d", selectL, len(dL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dL

    def __transform(self, databaseName, collectionName, docSelectList, logIncrement=100):
        """Fetch, filter, validate, and replace each document in the input selection list."""
        ok = True
        try:
            self.__valInst = self.__getValidator(databaseName, collectionName, schemaLevel="full")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    numDoc = len(docSelectList)
                    for ii, dD in enumerate(docSelectList, 1):
                        if "_id" not in dD:
                            continue
                        rObj = mg.fetchOne(databaseName, collectionName, "_id", dD["_id"])
                        del rObj["_id"]
                        #
                        fOk = True
                        if self.__oAdapt:
                            self.__validateObj(databaseName, collectionName, rObj, label="Original")
                            fOk, rObj = self.__oAdapt.filter(rObj)
                            self.__validateObj(databaseName, collectionName, rObj, label="Updated")
                        if fOk:
                            rOk = mg.replace(databaseName, collectionName, rObj, dD)
                            if rOk is None:
                                tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
                                logger.error("%r %r (%r) failing", databaseName, collectionName, tId)
                                # logger.info("rObj.keys() %r", list(rObj.keys()))
                                # logger.info("rObj.items() %s", rObj.items())
                                rOk = False
                            ok = ok and rOk
                        #
                        if ii % logIncrement == 0 or ii == numDoc:
                            logger.info("Replace status %r object (%d of %d)", ok, ii, numDoc)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def getLoadStatus(self):
        return self.__statusList

    def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            return False
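# Usage sketch (an assumption based on the signatures above; the adapter object
# and configuration wiring are illustrative, not taken from this file):
#
#   ov = ObjectValidator(cfgOb, objectAdapter=myAdapter, cachePath="CACHE")
#   ok = ov.doTransform(databaseName="pdbx_core", collectionName="pdbx_core_entry", fetchLimit=10)
#   logger.info("Transform status %r details %r", ok, ov.getLoadStatus())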
class SqlGenTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        #
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=pathConfig, defaultSectionName=configName, mockTopPath=mockTopPath)
        self.__sdu = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def testSQLMethods(self):
        schemaNames = ["pdbx_core"]
        dataTyping = "SQL"
        for schemaName in schemaNames:
            dD = self.__sdu.makeSchemaDef(schemaName, dataTyping=dataTyping, saveSchema=False)
            sD = SchemaDefAccess(dD)
            self.__testSchemaCreate(sD)
            self.__testImportExport(sD)
            self.__testSelectionAndConditions(sD)

    def __getHelper(self, modulePath, **kwargs):
        aMod = __import__(modulePath, globals(), locals(), [""])
        sys.modules[modulePath] = aMod
        #
        # Strip off any leading path to the module before we instantiate the object.
        mpL = modulePath.split(".")
        moduleName = mpL[-1]
        #
        aObj = getattr(aMod, moduleName)(**kwargs)
        return aObj

    def __testSchemaCreate(self, sD):
        """Test case - create table schema using the input schema definition as an example."""
        try:
            tableIdList = sD.getSchemaIdList()
            myAd = SqlGenAdmin(self.__verbose)
            sqlL = []
            for tableId in tableIdList:
                tableDefObj = sD.getSchemaObject(tableId)
                sqlL.extend(myAd.createTableSQL(databaseName=sD.getDatabaseName(), tableDefObj=tableDefObj))
            logger.debug("\n\n+SqlGenTests table creation SQL string\n %s\n\n", "\n".join(sqlL))
            self.assertGreaterEqual(len(sqlL), 10)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __testImportExport(self, sD):
        """Test case - import and export commands."""
        try:
            databaseName = sD.getDatabaseName()
            tableIdList = sD.getSchemaIdList()
            myAd = SqlGenAdmin(self.__verbose)
            for tableId in tableIdList:
                tableDefObj = sD.getSchemaObject(tableId)
                exportPath = os.path.join(HERE, "test-output", tableDefObj.getName() + ".tdd")
                sqlExport = myAd.exportTable(databaseName, tableDefObj, exportPath=exportPath)
                logger.debug("\n\n+SqlGenTests table export SQL string\n %s\n\n", sqlExport)
                sqlImport = myAd.importTable(databaseName, tableDefObj, importPath=exportPath)
                logger.debug("\n\n+SqlGenTests table import SQL string\n %s\n\n", sqlImport)
                self.assertGreaterEqual(len(sqlImport), 100)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __testSelectionAndConditions(self, sD):
        """Test case - select everything for a simple condition."""
        try:
            # get delete attribute -
            tableIdList = sD.getSchemaIdList()
            logger.debug("TableIdList %r", tableIdList)
            sqlGen = SqlGenQuery(schemaDefObj=sD, verbose=self.__verbose)
            for tableId in tableIdList:
                tableDefObj = sD.getSchemaObject(tableId)
                dAtId = tableDefObj.getDeleteAttributeId()
                if dAtId:
                    sqlCondition = SqlGenCondition(schemaDefObj=sD, verbose=self.__verbose)
                    sqlCondition.addValueCondition((tableId, dAtId), "EQ", ("D000001", "CHAR"))
                    aIdList = sD.getAttributeIdList(tableId)
                    for aId in aIdList:
                        sqlGen.addSelectAttributeId(attributeTuple=(tableId, aId))
                    sqlGen.setCondition(sqlCondition)
                    sqlGen.addOrderByAttributeId(attributeTuple=(tableId, dAtId))
                    sqlS = sqlGen.getSql()
                    logger.debug("\n\n+SqlGenTests selection SQL string\n %s\n\n", sqlS)
                    self.assertGreaterEqual(len(sqlS), 50)
                    sqlGen.clear()
                else:
                    logger.debug("Missing delete attribute for table %r", tableId)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
class CockroachDbLoaderCockroachDbTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(CockroachDbLoaderCockroachDbTests, self).__init__(methodName)
        self.__verbose = True
        self.__createFlag = False

    def setUp(self):
        self.__verbose = True
        self.__numProc = 2
        self.__fileLimit = 100
        self.__workPath = os.path.join(HERE, "test-output")
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName)
        self.__resourceName = "COCKROACH_DB"
        self.__schP = SchemaProvider(self.__cfgOb, self.__workPath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__workPath)
        #
        self.__tableIdSkipD = {"ATOM_SITE": True, "ATOM_SITE_ANISOTROP": True}
        self.__ioObj = IoAdapter(verbose=self.__verbose)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testConnection(self):
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                self.assertNotEqual(client, None)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testSchemaCreate(self):
        """Create table schema (live) for BIRD, chemical component, and PDBx data."""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testSchemaRemove(self):
        """Remove table schema (live) for BIRD, chemical component, and PDBx data."""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertBirdReference(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("bird")
            inputPathList.extend(self.__rpP.getLocatorObjList("bird_family"))
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert", deleteOpt="selected")
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertManyBirdReference(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("bird")
            inputPathList.extend(self.__rpP.getLocatorObjList("bird_family"))
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert-many", deleteOpt="selected")
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertChemCompReference(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("chem_comp")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert", deleteOpt="selected")
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertManyChemCompReference(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("chem_comp")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert-many", deleteOpt="selected")
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertPdbxExampleFiles(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("pdbx")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert", deleteOpt="selected", tableIdSkipD=self.__tableIdSkipD)
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertManyPdbxExampleFiles(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("pdbx")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert-many", deleteOpt="selected", tableIdSkipD=self.__tableIdSkipD)
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __schemaCreateSQL(self, schemaDefObj):
        """Test case - create table schema using schema definition."""
        sqlL = []
        try:
            tableIdList = schemaDefObj.getTableIdList()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="CockroachDb")
            dbName = schemaDefObj.getVersionedDatabaseName()
            sqlL = sqlGen.createDatabaseSQL(dbName)
            for tableId in tableIdList:
                tableDefObj = schemaDefObj.getTable(tableId)
                sqlL.extend(sqlGen.createTableSQL(databaseName=schemaDefObj.getVersionedDatabaseName(), tableDefObj=tableDefObj))
            logger.debug("\nSchema creation SQL string\n %s\n\n", "\n".join(sqlL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
        return sqlL

    def __schemaCreate(self, schemaDefObj):
        """Test case - create table schema using schema definition."""
        ret = 0
        try:
            tableIdList = schemaDefObj.getTableIdList()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="CockroachDb")
            dbName = schemaDefObj.getVersionedDatabaseName()
            sqlL = sqlGen.createDatabaseSQL(dbName)
            for tableId in tableIdList:
                tableDefObj = schemaDefObj.getTable(tableId)
                sqlL.extend(sqlGen.createTableSQL(databaseName=schemaDefObj.getVersionedDatabaseName(), tableDefObj=tableDefObj))
            logger.debug("\nSchema creation SQL string\n %s\n\n", "\n".join(sqlL))
            logger.info("Creating schema using database %s", schemaDefObj.getVersionedDatabaseName())
            #
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                crQ = CockroachDbQuery(dbcon=client, verbose=self.__verbose)
                ret = crQ.sqlCommandList(sqlCommandList=sqlL)
                # ret = crQ.sqlCommand(' '.join(sqlL))
                logger.info("Schema create command returns %r\n", ret)
            return ret
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __schemaRemove(self, schemaDefObj):
        """Test case - remove table schema using schema definition."""
        ret = 0
        try:
            dbName = schemaDefObj.getVersionedDatabaseName()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="CockroachDb")
            sqlL = sqlGen.removeDatabaseSQL(dbName)
            logger.debug("Schema Remove SQL string\n %s", "\n".join(sqlL))
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                crQ = CockroachDbQuery(dbcon=client, verbose=self.__verbose)
                ret = crQ.sqlCommandList(sqlCommandList=sqlL)
                # ret = crQ.sqlCommand(' '.join(sqlL))
                logger.debug("Schema remove command returns %r\n", ret)
            return ret
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
def main():
    parser = argparse.ArgumentParser()
    #
    defaultConfigName = "site_info_configuration"
    #
    parser.add_argument("--update_chem_comp_ref", default=False, action="store_true", help="Update schema for Chemical Component reference definitions")
    parser.add_argument("--update_chem_comp_core_ref", default=False, action="store_true", help="Update core schema for Chemical Component reference definitions")
    parser.add_argument("--update_bird_chem_comp_ref", default=False, action="store_true", help="Update schema for Bird Chemical Component reference definitions")
    parser.add_argument("--update_bird_chem_comp_core_ref", default=False, action="store_true", help="Update core schema for Bird Chemical Component reference definitions")
    parser.add_argument("--update_bird_ref", default=False, action="store_true", help="Update schema for Bird reference definitions")
    parser.add_argument("--update_bird_family_ref", default=False, action="store_true", help="Update schema for Bird Family reference definitions")
    parser.add_argument("--update_pdbx", default=False, action="store_true", help="Update schema for PDBx entry data")
    parser.add_argument("--update_pdbx_core", default=False, action="store_true", help="Update schema for PDBx core entry/entity data")
    parser.add_argument("--update_pdbx_comp_model_core", default=False, action="store_true", help="Update schema for PDBx computational model core entry/entity data")
    #
    parser.add_argument("--update_repository_holdings", default=False, action="store_true", help="Update schema for repository holdings")
    parser.add_argument("--update_entity_sequence_clusters", default=False, action="store_true", help="Update schema for entity sequence clusters")
    parser.add_argument("--update_data_exchange", default=False, action="store_true", help="Update schema for data exchange status")
    parser.add_argument("--update_ihm_dev", default=False, action="store_true", help="Update schema for I/HM dev entry data")
    parser.add_argument("--update_drugbank_core", default=False, action="store_true", help="Update DrugBank schema")
    #
    parser.add_argument("--update_config_all", default=False, action="store_true", help="Update using configuration settings (e.g. DATABASE_NAMES_ALL)")
    parser.add_argument("--update_config_deployed", default=False, action="store_true", help="Update using configuration settings (e.g. DATABASE_NAMES_DEPLOYED)")
    parser.add_argument("--update_config_test", default=False, action="store_true", help="Update using configuration settings (e.g. DATABASE_NAMES_TEST)")
    #
    parser.add_argument("--config_path", default=None, help="Path to configuration options file")
    parser.add_argument("--config_name", default=defaultConfigName, help="Configuration section name")
    #
    parser.add_argument("--cache_path", default=None, help="Schema cache directory path")
    parser.add_argument("--encoding_types", default=None, help="Schema encoding (rcsb|json|bson) (comma separated)")
    parser.add_argument("--validation_levels", default=None, help="Schema validation level (full|min) (comma separated)")
    parser.add_argument("--compare_only", default=False, action="store_true", help="Perform comparison with cached schema")
    #
    parser.add_argument("--debug", default=False, action="store_true", help="Turn on verbose logging")
    parser.add_argument("--mock", default=False, action="store_true", help="Use MOCK repository configuration for dependencies and testing")
    #
    parser.add_argument("--working_path", default=None, help="Working/alternative path for temporary and schema files")
    args = parser.parse_args()
    #
    debugFlag = args.debug
    if debugFlag:
        logger.setLevel(logging.DEBUG)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    # Configuration Details
    configPath = args.config_path
    configName = args.config_name
    cachePath = args.cache_path
    compareOnly = args.compare_only
    #
    encodingTypes = args.encoding_types.split(",") if args.encoding_types else []
    validationLevels = args.validation_levels.split(",") if args.validation_levels else []
    dataTypingList = ["ANY", "SQL"]

    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuration path %s (%s)", configPath, configName)
        else:
            logger.error("Missing or access issue with config file %r", configPath)
            exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=defaultConfigName, mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)
    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s", configPath, str(e))
        exit(1)
    #
    databaseNameList = []
    if args.update_chem_comp_ref:
        databaseNameList.append("chem_comp")
    if args.update_bird_chem_comp_ref:
        databaseNameList.append("bird_chem_comp")
    if args.update_chem_comp_core_ref:
        databaseNameList.append("chem_comp_core")
    if args.update_bird_chem_comp_core_ref:
        databaseNameList.append("bird_chem_comp_core")
    if args.update_bird_ref:
        databaseNameList.append("bird")
    if args.update_bird_family_ref:
        databaseNameList.append("bird_family")
    if args.update_pdbx:
        databaseNameList.append("pdbx")
    if args.update_pdbx_core:
        databaseNameList.append("pdbx_core")
    if args.update_pdbx_comp_model_core:
        databaseNameList.append("pdbx_comp_model_core")
    if args.update_repository_holdings:
        databaseNameList.append("repository_holdings")
    if args.update_entity_sequence_clusters:
        databaseNameList.append("sequence_clusters")
    if args.update_data_exchange:
        databaseNameList.append("data_exchange")
    if args.update_ihm_dev:
        databaseNameList.append("ihm_dev")
    if args.update_drugbank_core:
        databaseNameList.append("drugbank_core")

    if args.update_config_deployed:
        databaseNameList = cfgOb.getList("DATABASE_NAMES_DEPLOYED", sectionName="database_catalog_configuration")
        dataTypingList = cfgOb.getList("DATATYPING_DEPLOYED", sectionName="database_catalog_configuration")
        validationLevels = cfgOb.getList("VALIDATION_LEVELS_DEPLOYED", sectionName="database_catalog_configuration")
        encodingTypes = cfgOb.getList("ENCODING_TYPES_DEPLOYED", sectionName="database_catalog_configuration")

    if args.update_config_all:
        databaseNameList = cfgOb.getList("DATABASE_NAMES_ALL", sectionName="database_catalog_configuration")
        dataTypingList = cfgOb.getList("DATATYPING_ALL", sectionName="database_catalog_configuration")
        validationLevels = cfgOb.getList("VALIDATION_LEVELS_ALL", sectionName="database_catalog_configuration")
        encodingTypes = cfgOb.getList("ENCODING_TYPES_ALL", sectionName="database_catalog_configuration")

    if args.update_config_test:
        databaseNameList = cfgOb.getList("DATABASE_NAMES_TEST", sectionName="database_catalog_configuration")
        dataTypingList = cfgOb.getList("DATATYPING_TEST", sectionName="database_catalog_configuration")
        validationLevels = cfgOb.getList("VALIDATION_LEVELS_TEST", sectionName="database_catalog_configuration")
        encodingTypes = cfgOb.getList("ENCODING_TYPES_TEST", sectionName="database_catalog_configuration")
    #
    scnD = cfgOb.get("document_collection_names", sectionName="document_helper_configuration")
    #
    databaseNameList = list(set(databaseNameList))
    logger.debug("Collections %s", list(scnD.items()))
    logger.debug("databaseNameList %s", databaseNameList)

    if compareOnly:
        schP = SchemaProvider(cfgOb, cachePath, useCache=True)
        difPathList = []
        for databaseName in databaseNameList:
            for dataTyping in dataTypingList:
                logger.debug("Building schema %s with types %s", databaseName, dataTyping)
                pth = schP.schemaDefCompare(databaseName, dataTyping)
                if pth:
                    difPathList.append(pth)
        if difPathList:
            logger.info("Schema definition difference path list %r", difPathList)
        difPathList = []
        for databaseName in databaseNameList:
            dD = schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=False)
            sD = SchemaDefAccess(dD)
            for cd in sD.getCollectionInfo():
                collectionName = cd["NAME"]
                for encodingType in encodingTypes:
                    if encodingType.lower() != "json":
                        continue
                    for level in validationLevels:
                        pth = schP.jsonSchemaCompare(databaseName, collectionName, encodingType, level)
                        if pth:
                            difPathList.append(pth)
        if difPathList:
            logger.info("JSON schema difference path list %r", difPathList)
    else:
        schP = SchemaProvider(cfgOb, cachePath, useCache=False)
        for databaseName in databaseNameList:
            for encodingType in encodingTypes:
                if encodingType == "rcsb":
                    for dataTyping in dataTypingList:
                        logger.info("Creating schema definition for content type %s data typing %s", databaseName, dataTyping)
                        schP.makeSchemaDef(databaseName, dataTyping=dataTyping, saveSchema=True)
                else:
                    if databaseName in scnD:
                        for dD in scnD[databaseName]:
                            collectionName = dD["NAME"]
                            for validationLevel in validationLevels:
                                logger.info("Creating %r schema for content type %s collection %s", encodingType, databaseName, collectionName)
                                schP.makeSchema(databaseName, collectionName, encodingType=encodingType, level=validationLevel, saveSchema=True)
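# Conventional module entry point (assumed; not shown in this excerpt):
if __name__ == "__main__":
    main()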
class SchemaDefAccessTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=pathConfig, defaultSectionName=configName, mockTopPath=mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True, clearPath=False)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testAccess(self):
        databaseNames = ["pdbx_core", "bird_chem_comp_core"]
        dataTypingList = ["ANY", "SQL"]
        for databaseName in databaseNames:
            for dataTyping in dataTypingList:
                self.__testAccess(databaseName, dataTyping)

    def __testAccess(self, databaseName, dataTyping):
        try:
            sD = self.__schP.makeSchemaDef(databaseName, dataTyping=dataTyping, saveSchema=False)
            ok = self.__testAccessors(sD)
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
        return {}

    def __testAccessors(self, schemaDef):
        """Verify data and accessor mapping."""
        sd = SchemaDefAccess(schemaDef)
        logger.debug("Schema name %s", sd.getName())
        logger.debug("Schema application name %s", sd.getAppName())
        logger.debug("Database name %s", sd.getDatabaseName())
        logger.debug("Versioned database name %s", sd.getVersionedDatabaseName())
        logger.debug("Collection info %r", sd.getCollectionInfo())
        for dS in sd.getDataSelectorNames():
            logger.debug("Selector %s %r", dS, sd.getDataSelectors(dS))
        collectionInfoL = sd.getCollectionInfo()
        for dD in collectionInfoL:
            collectionName = dD["NAME"]
            logger.debug("Collection excluded %r", sd.getCollectionExcluded(collectionName))
            logger.debug("Collection included %r", sd.getCollectionSelected(collectionName))
            logger.debug("Collection document key attribute names %r", sd.getDocumentKeyAttributeNames(collectionName))
        schemaIdList = sd.getSchemaIdList()
        for schemaId in schemaIdList:
            #
            aIdL = sd.getAttributeIdList(schemaId)
            tObj = sd.getSchemaObject(schemaId)
            attributeIdList = tObj.getAttributeIdList()
            self.assertEqual(len(aIdL), len(attributeIdList))
            attributeNameList = tObj.getAttributeNameList()
            logger.debug("Ordered attribute Id list %s", str(attributeIdList))
            logger.debug("Ordered attribute name list %s", str(attributeNameList))
            #
            mAL = tObj.getMapAttributeNameList()
            logger.debug("Ordered mapped attribute name list %s", str(mAL))
            mAL = tObj.getMapAttributeIdList()
            logger.debug("Ordered mapped attribute id list %s", str(mAL))
            cL = tObj.getMapInstanceCategoryList()
            logger.debug("Mapped category list %s", str(cL))
            for cV in cL:
                aL = tObj.getMapInstanceAttributeList(cV)
                logger.debug("Mapped attribute list in %s : %s", cV, str(aL))
        return True
class ChemRefEtlWorker(object):
    """Prepare and load chemical reference data collections."""

    def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, readBackCheck=False, documentLimit=None, verbose=False):
        self.__cfgOb = cfgOb
        self.__cachePath = cachePath
        self.__useCache = useCache
        self.__readBackCheck = readBackCheck
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        self.__documentLimit = documentLimit
        #
        self.__resourceName = "MONGO_DB"
        self.__verbose = verbose
        self.__statusList = []
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=self.__useCache)

    def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            return False

    def load(self, updateId, extResource, loadType="full"):
        """Load chemical reference integrated data for the input external resource."""
        try:
            self.__statusList = []
            desp = DataExchangeStatus()
            statusStartTimestamp = desp.setStartTime()
            #
            if extResource == "DrugBank":
                databaseName = "drugbank_core"
                configName = self.__cfgOb.getDefaultSectionName()
                user = self.__cfgOb.get("_DRUGBANK_AUTH_USERNAME", sectionName=configName)
                pw = self.__cfgOb.get("_DRUGBANK_AUTH_PASSWORD", sectionName=configName)
                #
                dbP = DrugBankProvider(cachePath=self.__cachePath, useCache=self.__useCache, username=user, password=pw)
                #
                crExt = ChemRefExtractor(self.__cfgOb)
                idD = crExt.getChemCompAccessionMapping(extResource)
                dList = dbP.getDocuments(mapD=idD)
                #
                logger.info("Resource %r extracted mapped document length %d", extResource, len(dList))
                logger.debug("Objects %r", dList[:2])
                sD, _, collectionList, _ = self.__schP.getSchemaInfo(databaseName)
                collectionName = collectionList[0] if collectionList else "unassigned"
                indexL = sD.getDocumentIndex(collectionName, "primary")
                logger.info("Database %r collection %r index attributes %r", databaseName, collectionName, indexL)
                #
                collectionVersion = sD.getCollectionVersion(collectionName)
                addValues = {"_schema_version": collectionVersion}
                # addValues = {}
                #
                dl = DocumentLoader(
                    self.__cfgOb,
                    self.__cachePath,
                    self.__resourceName,
                    numProc=self.__numProc,
                    chunkSize=self.__chunkSize,
                    documentLimit=self.__documentLimit,
                    verbose=self.__verbose,
                    readBackCheck=self.__readBackCheck,
                )
                #
                ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=dList, indexAttributeList=indexL, keyNames=None, addValues=addValues)
                self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            return False

    def getLoadStatus(self):
        return self.__statusList
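# Usage sketch (an assumption; the configuration object and update identifier are
# illustrative, not taken from this file):
#
#   worker = ChemRefEtlWorker(cfgOb, cachePath="CACHE", useCache=True)
#   ok = worker.load(updateId="2018_25", extResource="DrugBank", loadType="full")
#   logger.info("DrugBank load status %r: %r", ok, worker.getLoadStatus())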
class SchemaDataPrepValidateTests(unittest.TestCase):
    def setUp(self):
        self.__numProc = 2
        # self.__fileLimit = 200
        self.__fileLimit = None
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example-ihm.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=self.__configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        # self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=False, rebuildFlag=True)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath)
        #
        self.__birdRepoPath = self.__cfgOb.getPath("BIRD_REPO_PATH", sectionName=configName)
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__verbose = True
        #
        self.__modulePathMap = self.__cfgOb.get("DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        self.__testDirPath = os.path.join(HERE, "test-output", "pdbx-files")
        self.__testIhmDirPath = os.path.join(HERE, "test-output", "ihm-files")
        self.__export = True
        #
        # self.__extraOpts = None
        # The following for extended parent/child info -
        self.__extraOpts = "addParentRefs|addPrimaryKey"
        #
        self.__alldatabaseNameD = {
            "ihm_dev": ["ihm_dev"],
            "pdbx": ["pdbx", "pdbx_ext"],
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird": ["bird"],
            "bird_family": ["family"],
            "chem_comp": ["chem_comp"],
            "bird_chem_comp": ["bird_chem_comp"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }
        self.__databaseNameD = {
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }
        self.__mergeContentTypeD = {"pdbx_core": ["vrpt"]}
        # self.__databaseNameD = {"chem_comp_core": ["chem_comp_core"], "bird_chem_comp_core": ["bird_chem_comp_core"]}
        # self.__databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_instance_validation"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_monomer"]}
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testValidateOptsRepo(self):
        # schemaLevel = "min"
        schemaLevel = "full"
        inputPathList = None
        eCount = self.__testValidateOpts(databaseNameD=self.__databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        self.assertLessEqual(eCount, 1)

    @unittest.skip("Disable troubleshooting test")
    def testValidateOptsList(self):
        schemaLevel = "min"
        inputPathList = self.__mU.doImport(os.path.join(HERE, "test-output", "failed-path.list"), "list")
        # inputPathList = glob.glob(self.__testDirPath + "/*.cif")
        if not inputPathList:
            return True
        databaseNameD = {"pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"]}
        for ii, subList in enumerate(chunkList(inputPathList[::-1], 40)):
            if ii < 5:
                continue
            eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=subList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
            logger.info("Chunk %d total validation errors schema level %s : %d", ii, schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)

    # @unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmRepo(self):
        schemaLevel = "min"
        inputPathList = None
        self.__export = True
        databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)

    # @unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmList(self):
        # schemaLevel = "full"
        schemaLevel = "min"
        inputPathList = glob.glob(self.__testIhmDirPath + "/*.cif")
        if not inputPathList:
            return True
        # databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)

    def __testValidateOpts(self, databaseNameD, inputPathList=None, schemaLevel="full", mergeContentTypeD=None):
        eCount = 0
        for databaseName in databaseNameD:
            mergeContentTypes = mergeContentTypeD[databaseName] if databaseName in mergeContentTypeD else None
            _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
            pthList = inputPathList if inputPathList else self.__rpP.getLocatorObjList(databaseName, mergeContentTypes=mergeContentTypes)
            for collectionName in databaseNameD[databaseName]:
                cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True, extraOpts=self.__extraOpts)
                #
                dL, cnL = self.__testPrepDocumentsFromContainers(
                    pthList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=mergeContentTypes
                )
                # Raises exceptions for schema compliance.
                try:
                    Draft4Validator.check_schema(cD)
                except Exception as e:
                    logger.error("%s %s schema validation fails with %s", databaseName, collectionName, str(e))
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                logger.info("Validating %d documents from %s %s", len(dL), databaseName, collectionName)
                for ii, dD in enumerate(dL):
                    logger.debug("Schema %s collection %s document %d", databaseName, collectionName, ii)
                    try:
                        cCount = 0
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info("schema %s collection %s (%s) path %s error: %s", databaseName, collectionName, cnL[ii], error.path, error.message)
                            logger.debug("Failing document %d : %r", ii, list(dD.items()))
                            eCount += 1
                            cCount += 1
                        if cCount > 0:
                            logger.info("schema %s collection %s container %s error count %d", databaseName, collectionName, cnL[ii], cCount)
                    except Exception as e:
                        logger.exception("Validation processing error %s", str(e))
        return eCount

    def __testPrepDocumentsFromContainers(self, inputPathList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=None):
        """Test case - create loadable PDBx data from repository files."""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo(databaseName)
            #
            dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=False)
            dictApi = dP.getApiByName(databaseName)
            rP = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP)
            #
            dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=self.__fTypeRow)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose)
            containerList = self.__rpP.getContainerList(inputPathList)
            for container in containerList:
                cName = container.getName()
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output", cName + "-with-method.cif")
                    self.__mU.doExport(savePath, [container], fmt="mmcif")
            #
            tableIdExcludeList = sd.getCollectionExcluded(collectionName)
            tableIdIncludeList = sd.getCollectionSelected(collectionName)
            sliceFilter = sd.getCollectionSliceFilter(collectionName)
            sdp.setSchemaIdExcludeList(tableIdExcludeList)
            sdp.setSchemaIdIncludeList(tableIdIncludeList)
            #
            docList, containerNameList, _ = sdp.processDocuments(
                containerList, styleType=styleType, filterType=self.__fTypeRow, dataSelectors=["PUBLIC_RELEASE"], sliceFilter=sliceFilter, collectionName=collectionName
            )
            docList = sdp.addDocumentPrivateAttributes(docList, collectionName)
            docList = sdp.addDocumentSubCategoryAggregates(docList, collectionName)
            #
            mergeS = "-".join(mergeContentTypes) if mergeContentTypes else ""
            if self.__export and docList:
                # for ii, doc in enumerate(docList[:1]):
                for ii, doc in enumerate(docList):
                    cn = containerNameList[ii]
                    fp = os.path.join(HERE, "test-output", "prep-%s-%s-%s-%s.json" % (cn, databaseName, collectionName, mergeS))
                    self.__mU.doExport(fp, [doc], fmt="json", indent=3)
                    logger.debug("Exported %r", fp)
            #
            return docList, containerNameList
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
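# chunkList (used in testValidateOptsList above) is not defined in this excerpt;
# a minimal sketch of one plausible implementation, assuming it simply yields
# fixed-size slices of the input list:
def chunkList(seq, chunkSize):
    """Yield successive chunkSize-length slices of seq."""
    for i in range(0, len(seq), chunkSize):
        yield seq[i : i + chunkSize]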
class SchemaDefBuildTests(unittest.TestCase): def setUp(self): self.__verbose = True mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml") configName = "site_info_configuration" self.__cachePath = os.path.join(TOPDIR, "CACHE") self.__cfgOb = ConfigUtil(configPath=pathConfig, defaultSectionName=configName, mockTopPath=mockTopPath) self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=False) # self.__validationLevels = self.__cfgOb.getList( "VALIDATION_LEVELS_TEST", sectionName="database_catalog_configuration") self.__encodingTypes = self.__cfgOb.getList( "ENCODING_TYPES_TEST", sectionName="database_catalog_configuration") # buildAll = True if buildAll: self.__databaseNameList = self.__cfgOb.getList( "DATABASE_NAMES_DEPLOYED", sectionName="database_catalog_configuration") self.__dataTypingList = self.__cfgOb.getList( "DATATYPING_DEPLOYED", sectionName="database_catalog_configuration") # else: self.__databaseNameList = self.__cfgOb.getList( "DATABASE_NAMES_TEST", sectionName="database_catalog_configuration") # self.__databaseNameList = ["repository_holdings"] self.__dataTypingList = self.__cfgOb.getList( "DATATYPING_TEST", sectionName="database_catalog_configuration") # self.__databaseNameList = ["sequence_clusters"] self.__saveSchema = True self.__compareDefSchema = False self.__compareSchema = False # self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): endTime = time.time() logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def testBuildSchemaDefs(self): try: for databaseName in self.__databaseNameList: for dataTyping in self.__dataTypingList: logger.debug("Building schema %s with types %s", databaseName, dataTyping) self.__schP.makeSchemaDef(databaseName, dataTyping=dataTyping, saveSchema=self.__saveSchema) if self.__compareDefSchema: self.__schP.schemaDefCompare(databaseName, dataTyping) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testBuildCollectionSchema(self): schemaDifPathList = [] for databaseName in self.__databaseNameList: dD = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=False) sD = SchemaDefAccess(dD) for cd in sD.getCollectionInfo(): collectionName = cd["NAME"] for encodingType in self.__encodingTypes: if encodingType.lower() == "rcsb": continue for level in self.__validationLevels: self.__schP.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, saveSchema=self.__saveSchema) if self.__compareSchema and encodingType.lower( ) == "json": pth = self.__schP.jsonSchemaCompare( databaseName, collectionName, encodingType, level) if pth: schemaDifPathList.append(pth) if schemaDifPathList: logger.info("Path dif list %r", schemaDifPathList) def testCompareSchema(self): databaseName = "pdbx_core" collectionName = "pdbx_core_entry" encodingType = "json" level = "full" # oldPath = os.path.join( HERE, "test-saved-output", "json-full-db-pdbx_core-col-pdbx_core_entry.json") mU = MarshalUtil(workPath=os.path.join(HERE, "test-output")) sOld = mU.doImport(oldPath, fmt="json") sNew = self.__schP.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level) numDif, difD = self.__schP.schemaCompare(sOld, sNew) logger.debug("numDiffs %d", numDif) self.assertGreaterEqual(numDif, 141) 
self.assertGreaterEqual(len(difD["changed"]), 160) logger.debug("difD %r", difD) @unittest.skip("Deprecated test") def testCompareSchemaCategories(self): """Compare common categories across schema definitions.""" try: sdCc = SchemaDefAccess( self.__schP.makeSchemaDef("chem_comp_core", dataTyping="ANY", saveSchema=False)) sdBcc = SchemaDefAccess( self.__schP.makeSchemaDef("bird_chem_comp_core", dataTyping="ANY", saveSchema=False)) # logger.info("") for schemaId in ["CHEM_COMP", "PDBX_CHEM_COMP_AUDIT"]: atCcL = sdCc.getAttributeIdList(schemaId) atBCcL = sdBcc.getAttributeIdList(schemaId) logger.debug("%s attributes (%d) %r", schemaId, len(atCcL), atCcL) logger.debug("%s attributes (%d) %r", schemaId, len(atBCcL), atBCcL) sDif = set(atCcL) - set(atBCcL) if sDif: logger.info("For %s attribute differences %r", schemaId, sDif) self.assertEqual(len(sDif), 0) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testBuildColSchemaWithRefs(self): for databaseName in ["ihm_dev_full"]: dD = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=False) sD = SchemaDefAccess(dD) for cd in sD.getCollectionInfo(): collectionName = cd["NAME"] for schemaType in self.__encodingTypes: if schemaType.lower() == "rcsb": continue for level in self.__validationLevels: self.__schP.makeSchema( databaseName, collectionName, encodingType=schemaType, level=level, saveSchema=True, extraOpts="addParentRefs|addPrimaryKey")
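# --- Illustrative sketch (not part of the source): the JSON schemas produced by
# --- SchemaProvider.makeSchema() are consumed with the public jsonschema API,
# --- as in the validation tests in this file; the schema and document below are
# --- made up for the example.
from jsonschema import Draft4Validator, FormatChecker

_exampleSchema = {
    "type": "object",
    "properties": {"rcsb_id": {"type": "string"}},
    "required": ["rcsb_id"],
}
Draft4Validator.check_schema(_exampleSchema)  # raises SchemaError if the schema itself is malformed
_valInst = Draft4Validator(_exampleSchema, format_checker=FormatChecker())
# iter_errors() yields one ValidationError per violation instead of raising;
# sorting on str(error) makes the logged order deterministic across runs.
for _error in sorted(_valInst.iter_errors({"rcsb_id": 100}), key=str):
    print(list(_error.path), _error.message)  # -> ['rcsb_id'] 100 is not of type 'string'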
class ChemRefDataPrepValidateTests(unittest.TestCase): def setUp(self): self.__verbose = True # self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml") self.__cachePath = os.path.join(TOPDIR, "CACHE") # self.__configName = "site_info_configuration" self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=self.__configName, mockTopPath=self.__mockTopPath) self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True) # self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): endTime = time.time() logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def testValidateFull(self): self.__validateChemRef("DrugBank", schemaLevel="full") def __validateChemRef(self, extResource, schemaLevel="full"): eCount = 0 if extResource == "DrugBank": schemaName = "drugbank_core" collectionNames = ["drugbank_core"] user = self.__cfgOb.get("_DRUGBANK_AUTH_USERNAME", sectionName=self.__configName) pw = self.__cfgOb.get("_DRUGBANK_AUTH_PASSWORD", sectionName=self.__configName) # cacheDir = self.__cfgOb.get("DRUGBANK_CACHE_DIR", sectionName=self.__configName) dbP = DrugBankProvider(cachePath=self.__cachePath, useCache=True, username=user, password=pw) # idD = dbP.getMapping() # crExt = ChemRefExtractor(self.__cfgOb) # idD = crExt.getChemCompAccesionMapping(extResource) dList = dbP.getDocuments() logger.info("Validating %d Drugbank documents", len(dList)) eCount = self.__validate(schemaName, collectionNames, dList, schemaLevel=schemaLevel) return eCount def __validate(self, databaseName, collectionNames, dList, schemaLevel="full"): eCount = 0 for collectionName in collectionNames: _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True) cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True) # Raises exceptions for schema compliance. Draft4Validator.check_schema(cD) # valInfo = Draft4Validator(cD, format_checker=FormatChecker()) for ii, dD in enumerate(dList): logger.debug("Database %s collection %s document %d", databaseName, collectionName, ii) try: cCount = 0 for error in sorted(valInfo.iter_errors(dD), key=str): logger.info( "database %s collection %s path %s error: %s", databaseName, collectionName, error.path, error.message) logger.info(">>> failing object is %r", dD) eCount += 1 cCount += 1 # logger.debug("database %s collection %s count %d", databaseName, collectionName, cCount) except Exception as e: logger.exception("Validation error %s", str(e)) return eCount
class SchemaDefDataPrepTests(unittest.TestCase): def __init__(self, methodName="runTest"): super(SchemaDefDataPrepTests, self).__init__(methodName) self.__loadPathList = [] self.__verbose = True def setUp(self): self.__isMac = platform.system() == "Darwin" self.__excludeType = None if self.__isMac else "optional" self.__numProc = 2 mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") self.__cachePath = os.path.join(TOPDIR, "CACHE") self.__outputPath = os.path.join(HERE, "test-output") self.__savedOutputPath = os.path.join(HERE, "test-saved-output") configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml") configName = "site_info_configuration" self.__configName = configName self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath) self.__mU = MarshalUtil(workPath=self.__cachePath) self.__discoveryMode = self.__cfgOb.get("DISCOVERY_MODE", sectionName=configName, default="local") self.__fileLimit = 100 if self.__discoveryMode == "local" else 10 self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True) self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath) # # self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs" self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs" self.__chemCompMockLen = 24 self.__pdbxMockLen = 30 # removes timestamped data items to allow diffs.) excludeExtras = ["rcsb_load_status"] # excludeExtras = [] # self.__verbose = True self.__modulePathMap = self.__cfgOb.get( "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName) # self.__exportFlag = True self.__diffFlag = False # self.__simpleTestCaseList = [ { "contentType": "chem_comp", "mockLength": self.__chemCompMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_by_name", "mergeContentTypes": None, "rejectLength": 2, }, { "contentType": "chem_comp", "mockLength": self.__chemCompMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_no_name", "mergeContentTypes": None, "rejectLength": 2, }, { "contentType": "chem_comp", "mockLength": self.__chemCompMockLen, "filterType": self.__fTypeCol, "styleType": "columnwise_by_name", "mergeContentTypes": None, "rejectLength": 2, }, { "contentType": "chem_comp", "mockLength": self.__chemCompMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_by_name", "mergeContentTypes": None, "rejectLength": 2, }, { "contentType": "pdbx_core", "mockLength": self.__pdbxMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_by_name", "mergeContentTypes": None, "rejectLength": 0, }, ] # self.__fullTestCaseList = [ { "contentType": "pdbx_core", "mockLength": self.__pdbxMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_by_name_with_cardinality", "mergeContentTypes": ["vrpt"], "rejectLength": 0, "excludeExtras": excludeExtras, }, { "contentType": "bird_chem_comp_core", "mockLength": self.__chemCompMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_by_name_with_cardinality", "mergeContentTypes": None, "rejectLength": 2, "excludeExtras": excludeExtras, }, ] # self.__fullTestCaseListA = [ { "contentType": "pdbx_core", "mockLength": self.__pdbxMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_by_name_with_cardinality", "mergeContentTypes": ["vrpt"], "rejectLength": 0, "excludeExtras": excludeExtras, }, ] # self.__startTime = time.time() 
logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 1.0e6, unitS)
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def __timeStep(self, msg):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", msg, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testSimpleSchemaDefDataPrep(self):
        for tcD in self.__simpleTestCaseList:
            rejectLength = 0 if self.__discoveryMode == "remote" else tcD["rejectLength"]
            mockLength = self.__fileLimit if self.__discoveryMode == "remote" else tcD["mockLength"]
            if tcD["contentType"] == "bird_chem_comp_core" and self.__discoveryMode == "remote":
                logger.info("Skipping %r in discovery mode %r", tcD["contentType"], self.__discoveryMode)
                continue
            self.__simpleSchemaDataPrep(tcD["contentType"], tcD["filterType"], tcD["styleType"], mockLength, rejectLength=rejectLength, mergeContentTypes=tcD["mergeContentTypes"])

    def testFullSchemaDefDataPrep(self):
        for tcD in self.__fullTestCaseList:
            rejectLength = 0 if self.__discoveryMode == "remote" else tcD["rejectLength"]
            mockLength = self.__fileLimit if self.__discoveryMode == "remote" else tcD["mockLength"]
            if tcD["contentType"] == "bird_chem_comp_core" and self.__discoveryMode == "remote":
                logger.info("Skipping %r in discovery mode %r", tcD["contentType"], self.__discoveryMode)
                continue
            self.__fullSchemaDataPrep(
                tcD["contentType"],
                tcD["filterType"],
                tcD["styleType"],
                mockLength,
                rejectLength=rejectLength,
                mergeContentTypes=tcD["mergeContentTypes"],
                excludeExtras=tcD["excludeExtras"],
            )

    def __simpleSchemaDataPrep(self, contentType, filterType, styleType, mockLength, rejectLength=0, dataSelectors=None, mergeContentTypes=None):
        """Internal method for preparing file-based data NOT requiring dynamic methods, slicing, or key injection.

        Args:
            contentType (str): Content type name
            filterType (str): List of data processing options (separated by '|') (e.g. "drop-empty-attributes|drop-empty-tables|skip-max-width|...")
            styleType (str): organization of output document (e.g. rowwise_by_name)
            mockLength (int): Expected length of the test data for the input content type
            rejectLength (int, optional): number of input data sets rejected by the data selection criteria. Defaults to 0.
            dataSelectors (list of str, optional): data selection criteria. Defaults to None.
            mergeContentTypes (list of str, optional): list of content types to merge with the input data set (e.g. ['vrpt']). Defaults to None.
        """
        try:
            dataSelectors = dataSelectors if dataSelectors else ["PUBLIC_RELEASE"]
            dD = self.__schP.makeSchemaDef(contentType, dataTyping="ANY", saveSchema=True)
            _ = SchemaDefAccess(dD)
            inputPathList = self.__rpP.getLocatorObjList(contentType=contentType, mergeContentTypes=mergeContentTypes)
            sd, _, _, _ = self.__schP.getSchemaInfo(databaseName=contentType, dataTyping="ANY")
            dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=filterType)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose)
            #
            # logger.debug("For %s mock length %d length of path list %d\n", contentType, mockLength, len(inputPathList))
            self.assertGreaterEqual(len(inputPathList), mockLength)
            tableDataDictList, containerNameList, rejectList = sdp.fetchDocuments(inputPathList, styleType=styleType, filterType=filterType, dataSelectors=dataSelectors)
            logger.debug("For %s mock length %d reject length %d length of tddl list %d\n", contentType, mockLength, rejectLength, len(tableDataDictList))
            self.assertGreaterEqual(len(tableDataDictList), mockLength - rejectLength)
            self.assertGreaterEqual(len(containerNameList), mockLength - rejectLength)
            if rejectList:
                logger.debug("For %s rejecting components %r", contentType, rejectList)
            #
            # self.assertEqual(len(rejectList), rejectLength)
            fName = "simple-prep-%s-%s.json" % (contentType, styleType)
            if self.__exportFlag:
                fPath = os.path.join(self.__outputPath, fName)
                self.__mU.doExport(fPath, tableDataDictList, fmt="json", indent=3)
            if self.__diffFlag:
                fPath = os.path.join(self.__savedOutputPath, fName)
                refDocList = self.__mU.doImport(fPath, fmt="json")
                self.assertEqual(len(refDocList), len(tableDataDictList))
                #
                jD = diff(refDocList, tableDataDictList, syntax="explicit", marshal=True)
                if jD:
                    _, fn = os.path.split(fPath)
                    bn, _ = os.path.splitext(fn)
                    fPath = os.path.join(self.__outputPath, bn + "-diff.json")
                    logger.debug("jsondiff for %s %s = \n%s", contentType, styleType, pprint.pformat(jD, indent=3, width=100))
                    self.__mU.doExport(fPath, jD, fmt="json", indent=3)
                self.assertEqual(len(jD), 0)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __logDocumentOrder(self, docList):
        for doc in docList:
            logger.debug("keys %r", list(doc.keys()))

    def __filterDocuments(self, docList, excludeList=None):
        excludeList = excludeList if excludeList else []
        for doc in docList:
            for excl in excludeList:
                if excl in doc:
                    del doc[excl]

    def __fullSchemaDataPrep(self, contentType, filterType, styleType, mockLength, rejectLength=0, dataSelectors=None, mergeContentTypes=None, excludeExtras=None):
        """Internal method for preparing file-based data requiring dynamic methods, slicing, or key injection.

        Args:
            contentType (str): Content type name
            filterType (str): List of data processing options (separated by '|') (e.g. "drop-empty-attributes|drop-empty-tables|skip-max-width|...")
            styleType (str): organization of output document (e.g. rowwise_by_name)
            mockLength (int): Expected length of the test data for the input content type
            rejectLength (int, optional): number of input data sets rejected by the data selection criteria. Defaults to 0.
            dataSelectors (list of str, optional): data selection criteria. Defaults to None.
            mergeContentTypes (list of str, optional): list of content types to merge with the input data set. Defaults to None. (e.g.
['vrpt']) """ try: excludeExtras = excludeExtras if excludeExtras else [] _ = mockLength _ = rejectLength dD = self.__schP.makeSchemaDef(contentType, dataTyping="ANY", saveSchema=True) _ = SchemaDefAccess(dD) inputPathList = self.__rpP.getLocatorObjList( contentType=contentType, mergeContentTypes=mergeContentTypes) sd, _, collectionNameList, _ = self.__schP.getSchemaInfo( databaseName=contentType, dataTyping="ANY") # dP = DictionaryApiProviderWrapper(self.__cachePath, cfgOb=self.__cfgOb, configName=self.__configName, useCache=True) dictApi = dP.getApiByName(contentType) # rP = DictMethodResourceProvider( self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, restoreUseStash=False, restoreUseGit=True, providerTypeExclude=self.__excludeType, ) dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP) # dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=filterType) sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose) containerList = self.__rpP.getContainerList(inputPathList) for container in containerList: cName = container.getName() logger.debug("Processing container %s", cName) dmh.apply(container) # for collectionName in collectionNameList: tableIdExcludeList = sd.getCollectionExcluded(collectionName) tableIdIncludeList = sd.getCollectionSelected(collectionName) sliceFilter = sd.getCollectionSliceFilter(collectionName) sdp.setSchemaIdExcludeList(tableIdExcludeList) sdp.setSchemaIdIncludeList(tableIdIncludeList) # docList, _, _ = sdp.processDocuments( containerList, styleType=styleType, sliceFilter=sliceFilter, filterType=filterType, dataSelectors=dataSelectors, collectionName=collectionName) docList = sdp.addDocumentPrivateAttributes( docList, collectionName) docList = sdp.addDocumentSubCategoryAggregates( docList, collectionName) # Special exclusions for the test harness. (removes timestamped data items to allow diffs.) self.__filterDocuments(docList, excludeExtras) mergeS = "-".join( mergeContentTypes) if mergeContentTypes else "" fName = "full-prep-%s-%s-%s-%s.json" % ( contentType, collectionName, mergeS, styleType) if self.__exportFlag: self.__logDocumentOrder(docList) fPath = os.path.join(self.__outputPath, fName) self.__mU.doExport(fPath, docList, fmt="json", indent=3) logger.debug("Exported %r", fPath) # if self.__diffFlag: fPath = os.path.join(self.__savedOutputPath, fName) refDocList = self.__mU.doImport(fPath, fmt="json") self.assertEqual(len(refDocList), len(docList)) logger.debug("For %s %s len refDocList %d", contentType, collectionName, len(refDocList)) logger.debug("For %s %s len docList %d", contentType, collectionName, len(docList)) jD = diff(refDocList, docList, syntax="explicit", marshal=True) if jD: _, fn = os.path.split(fPath) bn, _ = os.path.splitext(fn) fPath = os.path.join(self.__outputPath, bn + "-diff.json") logger.debug("jsondiff for %s %s = \n%s", contentType, collectionName, pprint.pformat(jD, indent=3, width=100)) self.__mU.doExport(fPath, jD, fmt="json", indent=3) self.assertEqual(len(jD), 0) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
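# --- Illustrative sketch (not part of the source): the styleType options used
# --- above name different document organizations; the toy category below shows
# --- rowwise vs columnwise renderings (the real transformation is performed by
# --- SchemaDefDataPrep, and this category name is invented).
_rows = [{"id": "1", "name": "A"}, {"id": "2", "name": "B"}]
_rowwise = {"my_category": _rows}
_columnwise = {"my_category": {k: [r[k] for r in _rows] for k in _rows[0]}}
# _rowwise    -> {'my_category': [{'id': '1', 'name': 'A'}, {'id': '2', 'name': 'B'}]}
# _columnwise -> {'my_category': {'id': ['1', '2'], 'name': ['A', 'B']}}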
class ClusterDataPrepValidateTests(unittest.TestCase): def setUp(self): self.__verbose = True # self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml") self.__cachePath = os.path.join(TOPDIR, "CACHE") self.__updateId = "2018_25" # configName = "site_info_configuration" self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=configName, mockTopPath=self.__mockTopPath) self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True) # self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName) # self.__dataSetId = "2018_23" self.__pathClusterData = self.__cfgOb.getPath("RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=configName) # self.__levels = ['100', '95', '90', '70', '50', '30'] self.__levels = ["100"] # self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): endTime = time.time() logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def testValidateOptsStrict(self): updateId = self.__updateId validationLevel = "full" eCount = self.__testValidateOpts(updateId, validationLevel=validationLevel) logger.info("Total validation errors validation level %s : %d", validationLevel, eCount) self.assertTrue(eCount <= 1) def __testValidateOpts(self, updateId, validationLevel="full"): _ = updateId databaseNames = ["sequence_clusters"] collectionNames = {"sequence_clusters": ["cluster_provenance", "cluster_members", "entity_members"]} # eCount = 0 for databaseName in databaseNames: for collectionName in collectionNames[databaseName]: _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True) cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=validationLevel, saveSchema=True) # dL = self.__getSequenceClusterData(collectionName, levels=self.__levels, dataSetId=self.__dataSetId, dataLocator=self.__pathClusterData) # Raises exceptions for schema compliance. 
Draft4Validator.check_schema(cD) # valInfo = Draft4Validator(cD, format_checker=FormatChecker()) for _, dD in enumerate(dL): # logger.debug("Schema %s collection %s document %d" % (schemaName, collectionName, ii)) try: cCount = 0 for error in sorted(valInfo.iter_errors(dD), key=str): logger.info("schema %s collection %s path %s error: %s", databaseName, collectionName, error.path, error.message) logger.info(">>> failing object is %r", dD) eCount += 1 cCount += 1 # logger.debug("schema %s collection %s count %d", databaseName, collectionName, cCount) except Exception as e: logger.exception("Validation error %s", str(e)) return eCount def __fetchProvenance(self): """Test case for fetching a provenance dictionary content.""" try: provKeyName = "rcsb_entity_sequence_cluster_prov" provU = ProvenanceProvider(self.__cfgOb, self.__cachePath, useCache=True) pD = provU.fetch() return pD[provKeyName] if provKeyName in pD else {} except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def __getSequenceClusterData(self, collectionName, dataSetId=None, dataLocator=None, levels=None): """Test extraction on an example sequence cluster data set.""" try: # if collectionName == "cluster_provenance": return [self.__fetchProvenance()] # entitySchemaName = "rcsb_entity_sequence_cluster_list" clusterSchemaName = "rcsb_entity_sequence_cluster_identifer_list" cdp = ClusterDataPrep(workPath=self.__cachePath, entitySchemaName=entitySchemaName, clusterSchemaName=clusterSchemaName) cifD, docBySequenceD, docByClusterD = cdp.extract(dataSetId, clusterSetLocator=dataLocator, levels=levels, clusterType="entity") self.assertEqual(len(cifD), 1) self.assertEqual(len(docBySequenceD), 1) self.assertEqual(len(docByClusterD), 1) if collectionName == "entity_members": return docBySequenceD[entitySchemaName] elif collectionName == "cluster_members": return docByClusterD[clusterSchemaName] except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() return None
class SchemaDefCompareTests(unittest.TestCase): skipFlag = True def setUp(self): self.__verbose = True mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml") configName = "site_info_configuration" self.__cachePath = os.path.join(TOPDIR, "CACHE") self.__cfgOb = ConfigUtil(configPath=pathConfig, defaultSectionName=configName, mockTopPath=mockTopPath) self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True) # self.__validationLevels = self.__cfgOb.getList( "VALIDATION_LEVELS_TEST", sectionName="database_catalog_configuration") self.__encodingTypes = self.__cfgOb.getList( "ENCODING_TYPES_TEST", sectionName="database_catalog_configuration") # buildAll = True if buildAll: self.__databaseNameList = self.__cfgOb.getList( "DATABASE_NAMES_DEPLOYED", sectionName="database_catalog_configuration") self.__dataTypingList = self.__cfgOb.getList( "DATATYPING_DEPLOYED", sectionName="database_catalog_configuration") # else: self.__databaseNameList = self.__cfgOb.getList( "DATABASE_NAMES_TEST", sectionName="database_catalog_configuration") # self.__databaseNameList = ["repository_holdings"] self.__dataTypingList = self.__cfgOb.getList( "DATATYPING_TEST", sectionName="database_catalog_configuration") # self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): endTime = time.time() logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) @unittest.skipIf(skipFlag, "Troubleshooting test") def testCompareSchemaDefs(self): try: difPathList = [] for databaseName in self.__databaseNameList: for dataTyping in self.__dataTypingList: logger.debug("Building schema %s with types %s", databaseName, dataTyping) pth = self.__schP.schemaDefCompare(databaseName, dataTyping) if pth: difPathList.append(pth) if difPathList: logger.info("Schema definition difference path list %r", [os.path.split(pth)[1] for pth in difPathList]) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() @unittest.skipIf(skipFlag, "Troubleshooting test") def testCompareCollectionSchema(self): try: difPathList = [] for databaseName in self.__databaseNameList: dD = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=False) sD = SchemaDefAccess(dD) for cd in sD.getCollectionInfo(): collectionName = cd["NAME"] for encodingType in self.__encodingTypes: if encodingType.lower() != "json": continue for level in self.__validationLevels: pth = self.__schP.jsonSchemaCompare( databaseName, collectionName, encodingType, level) if pth: difPathList.append(pth) if difPathList: logger.info("JSON schema difference path list %r", [os.path.split(pth)[1] for pth in difPathList]) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
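# --- Illustrative sketch (not part of the source): the schema and document
# --- comparisons in these tests reduce to structural diffs; the jsondiff call
# --- below mirrors the diffFlag pattern used elsewhere in this file, with
# --- made-up inputs (the exact output shape varies by jsondiff version).
from jsondiff import diff

_ref = [{"entry": {"id": "1ABC"}, "count": 1}]
_new = [{"entry": {"id": "1ABC"}, "count": 2}]
# syntax="explicit" labels insertions/deletions/updates separately; marshal=True
# converts jsondiff's symbol keys into plain strings so the result can be
# exported as JSON, as the tests above do with doExport().
_jD = diff(_ref, _new, syntax="explicit", marshal=True)
print(_jD)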
class SchemaDefLoadercrateDbMultiTests(unittest.TestCase): def __init__(self, methodName="runTest"): super(SchemaDefLoadercrateDbMultiTests, self).__init__(methodName) self.__verbose = True self.__createFlag = True def setUp(self): self.__verbose = True self.__numProc = 2 self.__fileLimit = 100 self.__chunkSize = 0 self.__workPath = os.path.join(HERE, "test-output") self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml") configName = "site_info_configuration" self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName) self.__resourceName = "CRATE_DB" self.__schP = SchemaProvider(self.__cfgOb, self.__workPath, useCache=True) self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__workPath) # # self.__tableIdSkipD = { "ATOM_SITE": True, "ATOM_SITE_ANISOTROP": True, "__LOAD_STATUS__": True } self.__ioObj = IoAdapter(verbose=self.__verbose) # self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): endTime = time.time() logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def testConnection(self): try: with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: self.assertNotEqual(client, None) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testSchemaCreate(self): """Create table schema (live) for BIRD, chemical component, and PDBx data.""" try: sd, _, _, _ = self.__schP.getSchemaInfo("bird") ret = self.__schemaCreate(schemaDefObj=sd) self.assertEqual(ret, True) # sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp") ret = self.__schemaCreate(schemaDefObj=sd) self.assertEqual(ret, True) # sd, _, _, _ = self.__schP.getSchemaInfo("pdbx") ret = self.__schemaCreate(schemaDefObj=sd) self.assertEqual(ret, True) # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testSchemaRemove(self): """Remove table schema (live) for BIRD, chemical component, and PDBx data.""" try: sd, _, _, _ = self.__schP.getSchemaInfo("bird") ret = self.__schemaRemove(schemaDefObj=sd) self.assertEqual(ret, True) # sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp") ret = self.__schemaRemove(schemaDefObj=sd) self.assertEqual(ret, True) # sd, _, _, _ = self.__schP.getSchemaInfo("pdbx") ret = self.__schemaRemove(schemaDefObj=sd) self.assertEqual(ret, True) # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testLoadChemCompMulti(self): self.__testLoadFilesMulti("chem_comp") def testLoadBirdMulti(self): self.__testLoadFilesMulti("bird") def testLoadPdbxMulti(self): self.__testLoadFilesMulti("pdbx") def __getPathList(self, fType): pathList = [] if fType == "chem_comp": pathList = self.__rpP.getLocatorObjList("chem_comp") elif fType == "bird": pathList = self.__rpP.getLocatorObjList("bird") pathList.extend(self.__rpP.getLocatorObjList("bird_family")) elif fType == "pdbx": pathList = self.__rpP.getLocatorObjList("pdbx") return pathList def loadInsertMany(self, dataList, procName, optionsD, workingDir): try: _ = workingDir ret = None sd = optionsD["sd"] skipD = optionsD["skip"] ioObj = IoAdapter(verbose=self.__verbose) logger.debug("%s pathlist %r", procName, dataList) with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: sdl = 
CrateDbLoader(schemaDefObj=sd, ioObj=ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose) ret = sdl.load(inputPathList=dataList, loadType="crate-insert-many", deleteOpt="selected", tableIdSkipD=skipD) # all or nothing here if ret: return dataList, dataList, [] else: return [], [], [] except Exception as e: logger.info("Failing with dataList %r", dataList) logger.exception("Failing with %s", str(e)) return [], [], [] def __testLoadFilesMulti(self, contentType): """Test case - create load w/insert-many all chemical component definition data files - (multiproc test)""" numProc = self.__numProc chunkSize = self.__chunkSize try: # sd, _, _, _ = self.__schP.getSchemaInfo(contentType) if self.__createFlag: self.__schemaCreate(schemaDefObj=sd) optD = {} optD["sd"] = sd if contentType == "pdbx": optD["skip"] = self.__tableIdSkipD else: optD["skip"] = {} # pathList = self.__getPathList(fType=contentType) logger.debug("Input path list %r", pathList) mpu = MultiProcUtil(verbose=True) mpu.setOptions(optionsD=optD) mpu.set(workerObj=self, workerMethod="loadInsertMany") ok, _, _, _ = mpu.runMulti(dataList=pathList, numProc=numProc, numResults=1, chunkSize=chunkSize) self.assertEqual(ok, True) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def __schemaCreate(self, schemaDefObj): """Test case - create table schema using schema definition""" ret = 0 try: tableIdList = schemaDefObj.getTableIdList() sqlGen = SqlGenAdmin(self.__verbose, serverType="cratedb") sqlL = [] for tableId in tableIdList: if tableId in self.__tableIdSkipD: continue tableDefObj = schemaDefObj.getTable(tableId) sqlL.extend( sqlGen.createTableSQL( databaseName=schemaDefObj.getVersionedDatabaseName(), tableDefObj=tableDefObj)) logger.debug("Schema creation SQL string\n %s\n\n", "\n".join(sqlL)) logger.info("Creating schema using database %s", schemaDefObj.getVersionedDatabaseName()) # with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: crQ = CrateDbQuery(dbcon=client, verbose=self.__verbose) ret = crQ.sqlCommandList(sqlCommandList=sqlL) logger.debug("Schema create command returns %r\n", ret) return ret # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def __schemaRemove(self, schemaDefObj): """Test case - remove table schema using schema definition""" ret = 0 try: tableIdList = schemaDefObj.getTableIdList() sqlGen = SqlGenAdmin(self.__verbose, serverType="cratedb") sqlL = [] for tableId in tableIdList: if tableId in self.__tableIdSkipD: continue tableDefObj = schemaDefObj.getTable(tableId) sqlL.extend( sqlGen.dropTableSQL( databaseName=schemaDefObj.getVersionedDatabaseName(), tableDefObj=tableDefObj)) sqlL.extend( sqlGen.dropTableSQL( databaseName=schemaDefObj.getDatabaseName(), tableDefObj=tableDefObj)) logger.debug("Schema Remove SQL string\n %s", "\n".join(sqlL)) with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: crQ = CrateDbQuery(dbcon=client, verbose=self.__verbose) ret = crQ.sqlCommandList(sqlCommandList=sqlL) logger.debug("Schema remove command returns %r\n", ret) return ret # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
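# --- Illustrative sketch (not part of the source): workers driven by
# --- MultiProcUtil, like loadInsertMany() above, appear to follow a simple
# --- contract - take (dataList, procName, optionsD, workingDir) and return the
# --- successful inputs plus results and diagnostics; the stub below shows only
# --- that shape, with no database connection and an invented "load" rule.
class _WorkerSketch(object):
    def loadSketch(self, dataList, procName, optionsD, workingDir):
        _ = workingDir
        successList = [d for d in dataList if d is not None]  # stand-in for a real load
        # (successful inputs, results, diagnostics) - the driver infers failures
        # from dataList minus successList, as suggested by runMulti() usage above.
        return successList, successList, []

print(_WorkerSketch().loadSketch(["a", None, "b"], "proc-0", {}, "."))  # -> (['a', 'b'], ['a', 'b'], [])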
class RepoHoldingsDataPrepValidateTests(unittest.TestCase): def setUp(self): self.__verbose = True # self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml") self.__cachePath = os.path.join(TOPDIR, "CACHE") self.__updateId = "2018_25" self.__export = False # configName = "site_info_configuration" self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=configName, mockTopPath=self.__mockTopPath) self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True) self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName) # self.__mU = MarshalUtil(workPath=self.__cachePath) self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): endTime = time.time() logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def testValidateOptsStrict(self): updateId = self.__updateId schemaLevel = "full" eCount = self.__testValidateOpts(updateId, schemaLevel=schemaLevel) logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount) self.assertTrue(eCount <= 1) @unittest.skip("Troubleshooting test") def testValidateOptsMin(self): updateId = self.__updateId schemaLevel = "min" eCount = self.__testValidateOpts(updateId, schemaLevel=schemaLevel) logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount) self.assertTrue(eCount <= 1) def __testValidateOpts(self, updateId, schemaLevel="full"): schemaNames = ["repository_holdings"] collectionNames = { "repository_holdings": [ "repository_holdings_update_entry", "repository_holdings_current_entry", "repository_holdings_unreleased_entry", "repository_holdings_removed_entry", "repository_holdings_combined_entry", ], "entity_sequence_clusters": ["cluster_members", "cluster_provenance", "entity_members"], } # eCount = 0 for schemaName in schemaNames: for collectionName in collectionNames[schemaName]: _ = self.__schP.makeSchemaDef(schemaName, dataTyping="ANY", saveSchema=True) cD = self.__schP.makeSchema(schemaName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True) dL = self.__getRepositoryHoldingsDocuments( schemaName, collectionName, updateId) if self.__export: savePath = os.path.join(HERE, "test-output", collectionName + ".json") self.__mU.doExport(savePath, dL, fmt="json", indent=3) # Raises exceptions for schema compliance. 
Draft4Validator.check_schema(cD)
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                for ii, dD in enumerate(dL):
                    logger.debug("Schema %s collection %s document %d", schemaName, collectionName, ii)
                    try:
                        cCount = 0
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info("schema %s collection %s path %s error: %s", schemaName, collectionName, error.path, error.message)
                            logger.info(">>>")
                            logger.info(">>> failing object is %r", dD)
                            logger.info(">>>")
                            eCount += 1
                            cCount += 1
                        #
                        logger.debug("schema %s collection %s count %d", schemaName, collectionName, cCount)
                    except Exception as e:
                        logger.exception("Validation error %s", str(e))
        return eCount

    def __getRepositoryHoldingsDocuments(self, schemaName, collectionName, updateId):
        """Test loading and processing operations for legacy holdings and status data."""
        rL = []
        try:
            rhdp = RepoHoldingsDataPrep(cfgOb=self.__cfgOb, sandboxPath=self.__sandboxPath, workPath=self.__cachePath)
            if collectionName == "repository_holdings_update_entry":
                rL = rhdp.getHoldingsUpdateEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("update data length %r", len(rL))
            elif collectionName == "repository_holdings_current_entry":
                rL = rhdp.getHoldingsCurrentEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("holdings data length %r", len(rL))
            elif collectionName == "repository_holdings_unreleased_entry":
                rL = rhdp.getHoldingsUnreleasedEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("unreleased data length %r", len(rL))
            elif collectionName == "repository_holdings_removed_entry":
                rL = rhdp.getHoldingsRemovedEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("removed data length %r", len(rL))
            elif collectionName == "repository_holdings_combined_entry":
                rL = rhdp.getHoldingsCombinedEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("holdings data length %r", len(rL))
        except Exception as e:
            logger.exception("%s %s failing with %s", schemaName, collectionName, str(e))
            self.fail()
        return rL
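# --- Illustrative sketch (not part of the source): the if/elif chain in
# --- __getRepositoryHoldingsDocuments() could equally be written as a dispatch
# --- table; the stub class below stands in for RepoHoldingsDataPrep solely to
# --- keep the example self-contained and runnable.
class _StubHoldingsPrep(object):
    def getHoldingsUpdateEntry(self, updateId=None):
        return [{"update_id": updateId}]

    def getHoldingsCurrentEntry(self, updateId=None):
        return [{"update_id": updateId}]

_rhdp = _StubHoldingsPrep()
_fetcherD = {
    "repository_holdings_update_entry": _rhdp.getHoldingsUpdateEntry,
    "repository_holdings_current_entry": _rhdp.getHoldingsCurrentEntry,
}
print(_fetcherD["repository_holdings_update_entry"](updateId="2018_25"))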
class UniProtCoreEtlWorker(object):
    """Prepare and load UniProt 'core' sequence reference data collections."""

    def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, readBackCheck=False, documentLimit=None, doValidate=False, verbose=False):
        self.__cfgOb = cfgOb
        self.__cachePath = cachePath
        self.__useCache = useCache
        self.__readBackCheck = readBackCheck
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        self.__documentLimit = documentLimit
        #
        self.__resourceName = "MONGO_DB"
        self.__verbose = verbose
        self.__statusList = []
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=self.__useCache)
        self.__docHelper = DocumentDefinitionHelper(cfgOb=self.__cfgOb)
        self.__valInst = None
        self.__doValidate = doValidate

    def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def __getReferenceSequenceProvider(self):
        """Instantiate and cache-check the reference sequence assignment provider."""
        try:
            rsaP = ReferenceSequenceAssignmentProvider(
                self.__cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_polymer_entity",
                polymerType="Protein",
                referenceDatabaseName="UniProt",
                provSource="PDB",
                useCache=self.__useCache,
                cachePath=self.__cachePath,
                fetchLimit=self.__documentLimit,
                siftsAbbreviated="TEST",
            )
            ok = rsaP.testCache()
            return ok, rsaP
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        # Always return a pair so callers can unpack the result safely -
        return False, None

    def load(self, updateId, extResource, loadType="full"):
        """Load sequence reference data"""
        try:
            self.__statusList = []
            desp = DataExchangeStatus()
            statusStartTimestamp = desp.setStartTime()
            #
            dList = []
            indexL = []
            databaseName = collectionName = collectionVersion = None
            #
            if extResource == "UniProt":
                databaseName = "uniprot_core"
                # configName = self.__cfgOb.getDefaultSectionName()
                # dirPath = os.path.join(self.__cachePath, self.__cfgOb.get("EXDB_CACHE_DIR", self.__cfgOb.getDefaultSectionName()))
                #
                ok, rsP = self.__getReferenceSequenceProvider()
                if not ok:
                    return False
                #
                dList = rsP.getDocuments()
                logger.info("Resource %r extracted mapped document length %d", extResource, len(dList))
                logger.debug("Objects %r", dList[:2])
                #
                cDL = self.__docHelper.getCollectionInfo(databaseName)
                collectionName = cDL[0]["NAME"]
                collectionVersion = cDL[0]["VERSION"]
                indexL = self.__docHelper.getDocumentIndexAttributes(collectionName, "primary")
                logger.info("Database %r collection %r version %r index attributes %r", databaseName, collectionName, collectionVersion, indexL)
                addValues = {}
            else:
                logger.error("Unsupported external resource %r", extResource)
                # Bail out here - continuing would reference unset load parameters
                return False
            #
            if self.__doValidate:
                self.__valInst = self.__getValidator(databaseName, collectionName, schemaLevel="full")
                for dObj in dList:
                    self.__validateObj(databaseName, collectionName, dObj, label="Original")
            #
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=dList, indexAttributeList=indexL, keyNames=None, addValues=addValues)
            okS = self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
            return ok and okS
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def getLoadStatus(self):
        return self.__statusList

    def __getValidator(self, databaseName, collectionName, schemaLevel="full"):
        # _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
        # cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True)
        logger.info("Fetch schema for %r %r validation level %r", databaseName, collectionName, schemaLevel)
        cD = self.__schP.getJsonSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel)
        # Raises exceptions for schema compliance.
        Draft4Validator.check_schema(cD)
        valInst = Draft4Validator(cD, format_checker=FormatChecker())
        return valInst

    def __validateObj(self, databaseName, collectionName, rObj, label=""):
        try:
            eCount = 0
            tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
            for error in sorted(self.__valInst.iter_errors(rObj), key=str):
                logger.info("Database %s collection %s (%s %r) path %s error: %s", databaseName, collectionName, label, tId, error.path, error.message)
                logger.debug(">>> Failing object is %r", rObj)
                if "rcsb_uniprot_feature" in rObj:
                    for dd in rObj["rcsb_uniprot_feature"]:
                        if "feature_id" in dd:
                            logger.info("feature_id %r", dd["feature_id"])
                        else:
                            logger.info("no feature_id keys %r", sorted(dd.keys()))
                            logger.info("description %r", dd.get("description"))
                eCount += 1
        except Exception as e:
            logger.exception("Validation failing %s", str(e))
        return eCount
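# --- Illustrative sketch (not part of the source): the DataExchangeStatus
# --- bookkeeping in __updateStatus() amounts to recording one per-collection
# --- load outcome; the plain-dict version below only mirrors the calls made
# --- above, and every field name here is a guess, not the real attribute set.
import time

def _statusRecordSketch(updateId, databaseName, collectionName, success, startTimestamp):
    # Hypothetical field names chosen for the example only -
    return {
        "updateId": updateId,
        "database": databaseName,
        "collection": collectionName,
        "successFlag": "Y" if success else "N",
        "startTimestamp": startTimestamp,
        "endTimestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
    }

print(_statusRecordSketch("2018_25", "uniprot_core", "uniprot_core", True, "2018-06-22 10:00:00"))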
class SchemaProviderTests(unittest.TestCase): def setUp(self): self.__verbose = True mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml") configName = "site_info_configuration" self.__cachePath = os.path.join(TOPDIR, "CACHE") self.__cfgOb = ConfigUtil(configPath=pathConfig, defaultSectionName=configName, mockTopPath=mockTopPath) self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=False) # self.__validationLevels = self.__cfgOb.getList( "VALIDATION_LEVELS_TEST", sectionName="database_catalog_configuration") self.__encodingTypes = self.__cfgOb.getList( "ENCODING_TYPES_TEST", sectionName="database_catalog_configuration") # buildAll = True if buildAll: self.__databaseNameList = self.__cfgOb.getList( "DATABASE_NAMES_DEPLOYED", sectionName="database_catalog_configuration") self.__dataTypingList = self.__cfgOb.getList( "DATATYPING_DEPLOYED", sectionName="database_catalog_configuration") # else: self.__databaseNameList = self.__cfgOb.getList( "DATABASE_NAMES_TEST", sectionName="database_catalog_configuration") self.__dataTypingList = self.__cfgOb.getList( "DATATYPING_TEST", sectionName="database_catalog_configuration") # self.__docHelper = DocumentDefinitionHelper(cfgOb=self.__cfgOb) self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): endTime = time.time() logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def testSchemaAccessDefault(self): for databaseName in self.__databaseNameList: cDL = self.__docHelper.getCollectionInfo(databaseName) for cD in cDL: collectionName = cD["NAME"] for encodingType in self.__encodingTypes: if encodingType.lower() == "rcsb": continue for level in self.__validationLevels: logger.debug("Loading ->%s %s %s %s", databaseName, collectionName, encodingType, level) sD = self.__schP.getJsonSchema( databaseName, collectionName, encodingType=encodingType, level=level) self.assertTrue(sD is not None)
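# --- Illustrative sketch (not part of the source): the nested loops in
# --- testSchemaAccessDefault() enumerate a cross product of databases,
# --- encodings, and validation levels; itertools.product expresses the same
# --- iteration flatly (the values below are placeholders, not configuration).
import itertools

for _db, _enc, _lvl in itertools.product(["pdbx_core"], ["JSON", "BSON"], ["min", "full"]):
    print(_db, _enc, _lvl)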
class DocumentLoader(object): def __init__( self, cfgOb, cachePath, resourceName="MONGO_DB", numProc=4, chunkSize=15, documentLimit=None, verbose=False, readBackCheck=False, maxStepLength=2000, schemaRebuildFlag=False, ): self.__verbose = verbose # # Limit the load length of each file type for testing - Set to None to remove - self.__documentLimit = documentLimit self.__maxStepLength = maxStepLength # # Controls for multiprocessing execution - self.__numProc = numProc self.__chunkSize = chunkSize # self.__cfgOb = cfgOb self.__resourceName = resourceName # self.__cachePath = cachePath if cachePath else "." self.__schP = SchemaProvider(cfgOb, cachePath, useCache=True, rebuildFlag=schemaRebuildFlag) # self.__readBackCheck = readBackCheck self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s" # # def load(self, databaseName, collectionName, loadType="full", documentList=None, indexAttributeList=None, keyNames=None, schemaLevel="full", addValues=None): """Driver method for loading MongoDb content - loadType: "full" or "replace" """ try: startTime = self.__begin(message="loading operation") # # optionsD = {} optionsD["collectionName"] = collectionName optionsD["databaseName"] = databaseName optionsD["readBackCheck"] = self.__readBackCheck optionsD["loadType"] = loadType optionsD["keyNames"] = keyNames # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- - # docList = documentList[:self. __documentLimit] if self.__documentLimit else documentList logger.debug("Full document list length %d limit %r", len(documentList), self.__documentLimit) numProc = self.__numProc chunkSize = self.__chunkSize if docList and self.__chunkSize < len( docList) else 0 # if addValues: try: for doc in docList: for k, v in addValues.items(): doc[k] = v except Exception as e: logger.error("Add values %r fails with %s", addValues, str(e)) # indAtList = indexAttributeList if indexAttributeList else [] bsonSchema = None if schemaLevel and schemaLevel in ["min", "full"]: bsonSchema = self.__schP.getJsonSchema(databaseName, collectionName, encodingType="BSON", level=schemaLevel) logger.debug("Using schema validation for %r %r %r", databaseName, collectionName, schemaLevel) if loadType == "full": self.__removeCollection(databaseName, collectionName) ok = self.__createCollection(databaseName, collectionName, indAtList, bsonSchema=bsonSchema) logger.info("Collection %s create status %r", collectionName, ok) elif loadType == "append": # create only if object does not exist - ok = self.__createCollection(databaseName, collectionName, indexAttributeNames=indAtList, checkExists=True, bsonSchema=bsonSchema) logger.debug("Collection %s create status %r", collectionName, ok) # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- - numDocs = len(docList) logger.debug("Processing %d total documents", numDocs) numProc = min(numProc, numDocs) maxStepLength = self.__maxStepLength if numDocs > maxStepLength: numLists = int(numDocs / maxStepLength) subLists = [docList[i::numLists] for i in range(numLists)] else: subLists = [docList] # if subLists: logger.debug( "Starting with numProc %d outer subtask count %d subtask length ~ %d", numProc, len(subLists), len(subLists[0])) # failList = [] for ii, subList in enumerate(subLists): logger.debug("Running outer subtask %d of %d length %d", ii + 1, len(subLists), len(subList)) # mpu = MultiProcUtil(verbose=True) mpu.setOptions(optionsD=optionsD) mpu.set(workerObj=self, 
workerMethod="loadWorker") ok, failListT, _, _ = mpu.runMulti(dataList=subList, numProc=numProc, numResults=1, chunkSize=chunkSize) failList.extend(failListT) logger.debug("Completed load with failing document list %r", failList) logger.debug("Document list length %d failed load list length %d", len(docList), len(failList)) # self.__end(startTime, "loading operation with status " + str(ok)) # return ok except Exception as e: logger.exception("Failing with %s", str(e)) return False def loadWorker(self, dataList, procName, optionsD, workingDir): """Multi-proc worker method for MongoDb document loading -""" try: startTime = self.__begin(message=procName) readBackCheck = optionsD["readBackCheck"] loadType = optionsD["loadType"] collectionName = optionsD["collectionName"] databaseName = optionsD["databaseName"] keyNames = optionsD["keyNames"] # logger.debug("%s databaseName %s collectionName %s workingDir %s", procName, databaseName, collectionName, workingDir) # if dataList: ok, successList, failedList = self.__loadDocuments( databaseName, collectionName, dataList, loadType=loadType, readBackCheck=readBackCheck, keyNames=keyNames) # logger.debug( "%s database %s collection %s inputList length %d successList length %d failed %d", procName, databaseName, collectionName, len(dataList), len(successList), len(failedList), ) # self.__end(startTime, procName + " with status " + str(ok)) return successList, [], [] except Exception as e: logger.exception("Failing with %s", str(e)) return [], [], [] # -------------- -------------- -------------- -------------- -------------- -------------- -------------- # --- Supporting code follows --- # def __begin(self, message=""): startTime = time.time() ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime()) logger.debug("Starting %s at %s", message, ts) return startTime def __end(self, startTime, message=""): endTime = time.time() ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime()) delta = endTime - startTime logger.debug("Completed %s at %s (%.4f seconds)", message, ts, delta) def __createCollection(self, dbName, collectionName, indexAttributeNames=None, checkExists=False, bsonSchema=None): """Create database and collection and optionally a primary index -""" try: logger.debug("Create database %s collection %s", dbName, collectionName) with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: mg = MongoDbUtil(client) if checkExists and mg.databaseExists( dbName) and mg.collectionExists( dbName, collectionName): ok1 = True else: ok1 = mg.createCollection(dbName, collectionName, bsonSchema=bsonSchema) ok2 = mg.databaseExists(dbName) ok3 = mg.collectionExists(dbName, collectionName) okI = True if indexAttributeNames: okI = mg.createIndex(dbName, collectionName, indexAttributeNames, indexName="primary", indexType="DESCENDING", uniqueFlag=False) return ok1 and ok2 and ok3 and okI # except Exception as e: logger.exception("Failing with %s", str(e)) return False def __removeCollection(self, dbName, collectionName): """Drop collection within database""" try: with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client: mg = MongoDbUtil(client) # logger.debug("Remove collection database %s collection %s", dbName, collectionName) logger.debug("Starting databases = %r", mg.getDatabaseNames()) logger.debug("Starting collections = %r", mg.getCollectionNames(dbName)) ok = mg.dropCollection(dbName, collectionName) logger.debug("Databases = %r", mg.getDatabaseNames()) logger.debug("Post drop collections = %r", 
mg.getCollectionNames(dbName))
                # Report success only when the collection is actually gone -
                ok = not mg.collectionExists(dbName, collectionName)
                return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            return False

    def __loadDocuments(self, dbName, collectionName, docList, loadType="full", readBackCheck=False, keyNames=None):
        #
        # Load database/collection with input document list -
        #
        failList = []
        rIdL = []
        successList = []
        logger.debug("Loading dbName %s collectionName %s with document count %d keynames %r", dbName, collectionName, len(docList), keyNames)
        if keyNames:
            # map the document list to some document key if this is provided
            indD = {}
            indL = []
            try:
                for ii, doc in enumerate(docList):
                    dIdTup = self.__getKeyValues(doc, keyNames)
                    indD[dIdTup] = ii
                indL = list(range(len(docList)))
            except Exception as e:
                logger.exception("Failing ii %d d %r with %s", ii, doc, str(e))
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                #
                if loadType == "replace" and keyNames:
                    dTupL = mg.deleteList(dbName, collectionName, docList, keyNames)
                    logger.debug("Deleted document status %r", (dTupL,))
                #
                rIdL = mg.insertList(dbName, collectionName, docList, keyNames=keyNames)
                logger.debug("Insert returns rIdL length %r", len(rIdL))
                # ---
                # If there is a failure then determine the specific successes and failures -
                #
                successList = docList
                failList = []
                if len(rIdL) != len(docList):
                    if keyNames:
                        successIndList = []
                        for rId in rIdL:
                            rObj = mg.fetchOne(dbName, collectionName, "_id", rId)
                            dIdTup = self.__getKeyValues(rObj, keyNames)
                            successIndList.append(indD[dIdTup])
                        failIndList = list(set(indL) - set(successIndList))
                        failList = [docList[ii] for ii in failIndList]
                        successList = [docList[ii] for ii in successIndList]
                    else:
                        # fail the whole batch if we don't have visibility into each document
                        failList = docList
                        successList = []
                #
                rbStatus = True
                if readBackCheck and keyNames:
                    #
                    # Note that objects in docList are mutated by the insert operation with the additional key '_id',
                    # hence, it is possible to compare the fetched object with the input object.
                    #
                    for ii, rId in enumerate(rIdL):
                        rObj = mg.fetchOne(dbName, collectionName, "_id", rId)
                        dIdTup = self.__getKeyValues(rObj, keyNames)
                        jj = indD[dIdTup]
                        if rObj != docList[jj]:
                            rbStatus = False
                            break
                #
                if readBackCheck and not rbStatus:
                    return False, successList, failList
            #
            return len(rIdL) == len(docList), successList, failList
        except Exception as e:
            logger.exception("Failing %r %r (len=%d) %s with %s", dbName, collectionName, len(docList), keyNames, str(e))
        return False, [], docList

    def __getKeyValues(self, dct, keyNames):
        """Return the tuple of values corresponding to the input dictionary key names expressed in dot notation.

        Args:
            dct (dict): source dictionary object (nested)
            keyNames (list): list of dictionary keys in dot notation

        Returns:
            tuple: tuple of values corresponding to the input key names
        """
        rL = []
        try:
            for keyName in keyNames:
                rL.append(self.__getKeyValue(dct, keyName))
        except Exception as e:
            logger.exception("Failing for key names %r with %s", keyNames, str(e))
        return tuple(rL)

    def __getKeyValue(self, dct, keyName):
        """Return the value of the corresponding key expressed in dot notation in the input dictionary object (nested)."""
        try:
            kys = keyName.split(".")
            for key in kys:
                try:
                    dct = dct[key]
                except KeyError:
                    return None
            return dct
        except Exception as e:
            logger.exception("Failing for key %r with %s", keyName, str(e))
        return None
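# --- Illustrative sketch (not part of the source): the sublist partition in
# --- DocumentLoader.load() uses extended slicing, which deals documents out
# --- round-robin rather than in contiguous chunks - worth seeing on a small list.
_docList = list(range(7))
_numLists = 3
_subLists = [_docList[i::_numLists] for i in range(_numLists)]
print(_subLists)  # -> [[0, 3, 6], [1, 4], [2, 5]]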