class SchemaDataPrepValidateTests(unittest.TestCase):
    def setUp(self):
        self.__numProc = 2
        # self.__fileLimit = 200
        self.__fileLimit = None
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example-ihm.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=self.__configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        # self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=False, rebuildFlag=True)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath)
        #
        # self.__birdRepoPath = self.__cfgOb.getPath("BIRD_REPO_PATH", sectionName=configName)
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__verbose = True
        #
        self.__modulePathMap = self.__cfgOb.get("DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        self.__testDirPath = os.path.join(HERE, "test-output", "pdbx-files")
        self.__testIhmDirPath = os.path.join(HERE, "test-output", "ihm-files")
        self.__export = True
        #
        # self.__extraOpts = None
        # The following for extended parent/child info -
        self.__extraOpts = "addParentRefs|addPrimaryKey"
        #
        self.__alldatabaseNameD = {
            "ihm_dev": ["ihm_dev"],
            "pdbx": ["pdbx", "pdbx_ext"],
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird": ["bird"],
            "bird_family": ["family"],
            "chem_comp": ["chem_comp"],
            "bird_chem_comp": ["bird_chem_comp"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }
        self.__databaseNameD = {
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }
        self.__mergeContentTypeD = {"pdbx_core": ["vrpt"]}
        # self.__databaseNameD = {"chem_comp_core": ["chem_comp_core"], "bird_chem_comp_core": ["bird_chem_comp_core"]}
        # self.__databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_instance_validation"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_monomer"]}
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testValidateOptsRepo(self):
        # schemaLevel = "min"
        schemaLevel = "full"
        inputPathList = None
        eCount = self.__testValidateOpts(databaseNameD=self.__databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        self.assertLessEqual(eCount, 1)

    @unittest.skip("Disable troubleshooting test")
    def testValidateOptsList(self):
        schemaLevel = "min"
        inputPathList = self.__mU.doImport(os.path.join(HERE, "test-output", "failed-path.list"), "list")
        # inputPathList = glob.glob(self.__testDirPath + "/*.cif")
        if not inputPathList:
            return True
        databaseNameD = {"pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"]}
        for ii, subList in enumerate(chunkList(inputPathList[::-1], 40)):
            if ii < 5:
                continue
            eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=subList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
            logger.info("Chunk %d total validation errors schema level %s : %d", ii, schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)

    # @unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmRepo(self):
        schemaLevel = "min"
        inputPathList = None
        self.__export = True
        # databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)

    # @unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmList(self):
        # schemaLevel = "full"
        schemaLevel = "min"
        inputPathList = glob.glob(self.__testIhmDirPath + "/*.cif")
        if not inputPathList:
            return True
        # databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)

    def __testValidateOpts(self, databaseNameD, inputPathList=None, schemaLevel="full", mergeContentTypeD=None):
        #
        eCount = 0
        for databaseName in databaseNameD:
            mergeContentTypes = mergeContentTypeD[databaseName] if databaseName in mergeContentTypeD else None
            _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
            pthList = inputPathList if inputPathList else self.__rpP.getLocatorObjList(databaseName, mergeContentTypes=mergeContentTypes)
            for collectionName in databaseNameD[databaseName]:
                cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True, extraOpts=self.__extraOpts)
                #
                dL, cnL = self.__testPrepDocumentsFromContainers(
                    pthList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=mergeContentTypes
                )
                # Raises exceptions for schema compliance.
                try:
                    Draft4Validator.check_schema(cD)
                except Exception as e:
                    logger.error("%s %s schema validation fails with %s", databaseName, collectionName, str(e))
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                logger.info("Validating %d documents from %s %s", len(dL), databaseName, collectionName)
                for ii, dD in enumerate(dL):
                    logger.debug("Schema %s collection %s document %d", databaseName, collectionName, ii)
                    try:
                        cCount = 0
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info("schema %s collection %s (%s) path %s error: %s", databaseName, collectionName, cnL[ii], error.path, error.message)
                            logger.debug("Failing document %d : %r", ii, list(dD.items()))
                            eCount += 1
                            cCount += 1
                        if cCount > 0:
                            logger.info("schema %s collection %s container %s error count %d", databaseName, collectionName, cnL[ii], cCount)
                    except Exception as e:
                        logger.exception("Validation processing error %s", str(e))
        return eCount

    def __testPrepDocumentsFromContainers(self, inputPathList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=None):
        """Test case - create loadable PDBx data from repository files."""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo(databaseName)
            #
            dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=False)
            dictApi = dP.getApiByName(databaseName)
            rP = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP)
            #
            dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=self.__fTypeRow)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose)
            containerList = self.__rpP.getContainerList(inputPathList)
            for container in containerList:
                cName = container.getName()
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output", cName + "-with-method.cif")
                    # self.__mU.doExport(savePath, [container], fmt="mmcif")
            #
            tableIdExcludeList = sd.getCollectionExcluded(collectionName)
            tableIdIncludeList = sd.getCollectionSelected(collectionName)
            sliceFilter = sd.getCollectionSliceFilter(collectionName)
            sdp.setSchemaIdExcludeList(tableIdExcludeList)
            sdp.setSchemaIdIncludeList(tableIdIncludeList)
            #
            docList, containerNameList, _ = sdp.processDocuments(
                containerList, styleType=styleType, filterType=self.__fTypeRow, dataSelectors=["PUBLIC_RELEASE"], sliceFilter=sliceFilter, collectionName=collectionName
            )
            docList = sdp.addDocumentPrivateAttributes(docList, collectionName)
            docList = sdp.addDocumentSubCategoryAggregates(docList, collectionName)
            #
            mergeS = "-".join(mergeContentTypes) if mergeContentTypes else ""
            if self.__export and docList:
                # for ii, doc in enumerate(docList[:1]):
                for ii, doc in enumerate(docList):
                    cn = containerNameList[ii]
                    fp = os.path.join(HERE, "test-output", "prep-%s-%s-%s-%s.json" % (cn, databaseName, collectionName, mergeS))
                    self.__mU.doExport(fp, [doc], fmt="json", indent=3)
                    logger.debug("Exported %r", fp)
            #
            return docList, containerNameList
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
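
# Illustrative sketch (not part of the test suite): the jsonschema validation pattern used in
# __testValidateOpts() above, reduced to a self-contained example with a toy schema and document.
# Only the jsonschema calls (Draft4Validator, FormatChecker) are real library API; the schema and
# document contents here are hypothetical, and "logger" is assumed to be this module's logger.
def _exampleIterSchemaErrors():
    from jsonschema import Draft4Validator, FormatChecker

    schemaD = {"type": "object", "properties": {"entry_id": {"type": "string"}}, "required": ["entry_id"]}
    docD = {"entry_id": 100}  # wrong type on purpose to produce one validation error
    # Raises jsonschema.SchemaError if the schema itself is malformed.
    Draft4Validator.check_schema(schemaD)
    valInfo = Draft4Validator(schemaD, format_checker=FormatChecker())
    eCount = 0
    for error in sorted(valInfo.iter_errors(docD), key=str):
        logger.info("path %s error: %s", error.path, error.message)
        eCount += 1
    return eCount
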
class SchemaDefDataPrepTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(SchemaDefDataPrepTests, self).__init__(methodName)
        self.__loadPathList = []
        self.__verbose = True

    def setUp(self):
        self.__numProc = 2
        self.__fileLimit = 100
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__outputPath = os.path.join(HERE, "test-output")
        self.__savedOutputPath = os.path.join(HERE, "test-saved-output")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath)
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__chemCompMockLen = 24
        self.__pdbxMockLen = 85
        # Excludes timestamped data items to allow diffs.
        excludeExtras = ["rcsb_load_status"]
        # excludeExtras = []
        #
        self.__verbose = True
        self.__modulePathMap = self.__cfgOb.get("DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        #
        self.__exportFlag = True
        self.__diffFlag = False
        #
        self.__simpleTestCaseList = [
            {"contentType": "chem_comp", "mockLength": self.__chemCompMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_by_name", "mergeContentTypes": None, "rejectLength": 2},
            {"contentType": "chem_comp", "mockLength": self.__chemCompMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_no_name", "mergeContentTypes": None, "rejectLength": 2},
            {"contentType": "chem_comp", "mockLength": self.__chemCompMockLen, "filterType": self.__fTypeCol, "styleType": "columnwise_by_name", "mergeContentTypes": None, "rejectLength": 2},
            {"contentType": "chem_comp", "mockLength": self.__chemCompMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_by_name", "mergeContentTypes": None, "rejectLength": 2},
            {"contentType": "pdbx_core", "mockLength": self.__pdbxMockLen, "filterType": self.__fTypeRow, "styleType": "rowwise_by_name", "mergeContentTypes": None, "rejectLength": 5},
        ]
        #
        self.__fullTestCaseList = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 5,
                "excludeExtras": excludeExtras,
            },
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": None,
                "rejectLength": 5,
                "excludeExtras": excludeExtras,
            },
            {
                "contentType": "bird_chem_comp_core",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": None,
                "rejectLength": 2,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__fullTestCaseListA = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 5,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def __timeStep(self, msg):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", msg, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testSimpleSchemaDefDataPrep(self):
        for tcD in self.__simpleTestCaseList:
            self.__simpleSchemaDataPrep(tcD["contentType"], tcD["filterType"], tcD["styleType"], tcD["mockLength"], rejectLength=tcD["rejectLength"], mergeContentTypes=tcD["mergeContentTypes"])

    def testFullSchemaDefDataPrep(self):
        for tcD in self.__fullTestCaseList:
            self.__fullSchemaDataPrep(
                tcD["contentType"],
                tcD["filterType"],
                tcD["styleType"],
                tcD["mockLength"],
                rejectLength=tcD["rejectLength"],
                mergeContentTypes=tcD["mergeContentTypes"],
                excludeExtras=tcD["excludeExtras"],
            )

    def __simpleSchemaDataPrep(self, contentType, filterType, styleType, mockLength, rejectLength=0, dataSelectors=None, mergeContentTypes=None):
        """Internal method for preparing file-based data NOT requiring dynamic methods, slicing, or key injection.

        Args:
            contentType (str): Content type name
            filterType (str): List of data processing options (separated by '|') (e.g. "drop-empty-attributes|drop-empty-tables|skip-max-width|...")
            styleType (str): organization of output document (e.g. rowwise_by_name)
            mockLength (int): expected length of the test data for the input content type
            rejectLength (int, optional): number of input data sets rejected by the data selection criteria. Defaults to 0.
            dataSelectors (list of str, optional): data selection criteria. Defaults to None.
            mergeContentTypes (list of str, optional): list of content types to merge with the input data set. Defaults to None. (e.g. ['vrpt'])
        """
        try:
            dataSelectors = dataSelectors if dataSelectors else ["PUBLIC_RELEASE"]
            dD = self.__schP.makeSchemaDef(contentType, dataTyping="ANY", saveSchema=True)
            _ = SchemaDefAccess(dD)
            inputPathList = self.__rpP.getLocatorObjList(contentType=contentType, mergeContentTypes=mergeContentTypes)
            sd, _, _, _ = self.__schP.getSchemaInfo(databaseName=contentType, dataTyping="ANY")
            dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=filterType)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose)
            #
            logger.debug("For %s mock length %d length of path list %d\n", contentType, mockLength, len(inputPathList))
            self.assertEqual(len(inputPathList), mockLength)
            tableDataDictList, containerNameList, rejectList = sdp.fetchDocuments(inputPathList, styleType=styleType, filterType=filterType, dataSelectors=dataSelectors)
            logger.debug("For %s mock length %d reject length %d length of tddl list %d\n", contentType, mockLength, rejectLength, len(tableDataDictList))
            self.assertEqual(len(tableDataDictList), mockLength - rejectLength)
            self.assertEqual(len(containerNameList), mockLength - rejectLength)
            if rejectList:
                logger.debug("For %s rejecting components %r", contentType, rejectList)
            #
            self.assertEqual(len(rejectList), rejectLength)
            fName = "simple-prep-%s-%s.json" % (contentType, styleType)
            if self.__exportFlag:
                fPath = os.path.join(self.__outputPath, fName)
                self.__mU.doExport(fPath, tableDataDictList, fmt="json", indent=3)
            if self.__diffFlag:
                fPath = os.path.join(self.__savedOutputPath, fName)
                refDocList = self.__mU.doImport(fPath, fmt="json")
                self.assertEqual(len(refDocList), len(tableDataDictList))
                #
                jD = diff(refDocList, tableDataDictList, syntax="explicit", marshal=True)
                if jD:
                    _, fn = os.path.split(fPath)
                    bn, _ = os.path.splitext(fn)
                    fPath = os.path.join(self.__outputPath, bn + "-diff.json")
                    logger.debug("jsondiff for %s %s = \n%s", contentType, styleType, pprint.pformat(jD, indent=3, width=100))
                    self.__mU.doExport(fPath, jD, fmt="json", indent=3)
                self.assertEqual(len(jD), 0)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __logDocumentOrder(self, docList):
        for doc in docList:
            logger.debug("keys %r", list(doc.keys()))

    def __filterDocuments(self, docList, excludeList=None):
        excludeList = excludeList if excludeList else []
        for doc in docList:
            for excl in excludeList:
                if excl in doc:
                    del doc[excl]

    def __fullSchemaDataPrep(self, contentType, filterType, styleType, mockLength, rejectLength=0, dataSelectors=None, mergeContentTypes=None, excludeExtras=None):
        """Internal method for preparing file-based data requiring dynamic methods, slicing, or key injection.

        Args:
            contentType (str): Content type name
            filterType (str): List of data processing options (separated by '|') (e.g. "drop-empty-attributes|drop-empty-tables|skip-max-width|...")
            styleType (str): organization of output document (e.g. rowwise_by_name)
            mockLength (int): expected length of the test data for the input content type
            rejectLength (int, optional): number of input data sets rejected by the data selection criteria. Defaults to 0.
            dataSelectors (list of str, optional): data selection criteria. Defaults to None.
            mergeContentTypes (list of str, optional): list of content types to merge with the input data set. Defaults to None. (e.g. ['vrpt'])
            excludeExtras (list of str, optional): data items to remove from the output documents before comparison. Defaults to None.
        """
        try:
            excludeExtras = excludeExtras if excludeExtras else []
            _ = mockLength
            _ = rejectLength
            dD = self.__schP.makeSchemaDef(contentType, dataTyping="ANY", saveSchema=True)
            _ = SchemaDefAccess(dD)
            inputPathList = self.__rpP.getLocatorObjList(contentType=contentType, mergeContentTypes=mergeContentTypes)
            sd, _, collectionNameList, _ = self.__schP.getSchemaInfo(databaseName=contentType, dataTyping="ANY")
            #
            dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=True)
            dictApi = dP.getApiByName(contentType)
            #
            rP = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP)
            #
            dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=filterType)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose)
            containerList = self.__rpP.getContainerList(inputPathList)
            for container in containerList:
                cName = container.getName()
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
            #
            for collectionName in collectionNameList:
                tableIdExcludeList = sd.getCollectionExcluded(collectionName)
                tableIdIncludeList = sd.getCollectionSelected(collectionName)
                sliceFilter = sd.getCollectionSliceFilter(collectionName)
                sdp.setSchemaIdExcludeList(tableIdExcludeList)
                sdp.setSchemaIdIncludeList(tableIdIncludeList)
                #
                docList, _, _ = sdp.processDocuments(
                    containerList, styleType=styleType, sliceFilter=sliceFilter, filterType=filterType, dataSelectors=dataSelectors, collectionName=collectionName
                )
                docList = sdp.addDocumentPrivateAttributes(docList, collectionName)
                docList = sdp.addDocumentSubCategoryAggregates(docList, collectionName)
                # Special exclusions for the test harness (removes timestamped data items to allow diffs).
                self.__filterDocuments(docList, excludeExtras)
                mergeS = "-".join(mergeContentTypes) if mergeContentTypes else ""
                fName = "full-prep-%s-%s-%s-%s.json" % (contentType, collectionName, mergeS, styleType)
                if self.__exportFlag:
                    self.__logDocumentOrder(docList)
                    fPath = os.path.join(self.__outputPath, fName)
                    self.__mU.doExport(fPath, docList, fmt="json", indent=3)
                    logger.debug("Exported %r", fPath)
                #
                if self.__diffFlag:
                    fPath = os.path.join(self.__savedOutputPath, fName)
                    refDocList = self.__mU.doImport(fPath, fmt="json")
                    self.assertEqual(len(refDocList), len(docList))
                    logger.debug("For %s %s len refDocList %d", contentType, collectionName, len(refDocList))
                    logger.debug("For %s %s len docList %d", contentType, collectionName, len(docList))
                    jD = diff(refDocList, docList, syntax="explicit", marshal=True)
                    if jD:
                        _, fn = os.path.split(fPath)
                        bn, _ = os.path.splitext(fn)
                        fPath = os.path.join(self.__outputPath, bn + "-diff.json")
                        logger.debug("jsondiff for %s %s = \n%s", contentType, collectionName, pprint.pformat(jD, indent=3, width=100))
                        self.__mU.doExport(fPath, jD, fmt="json", indent=3)
                    self.assertEqual(len(jD), 0)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
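
# Illustrative sketch (not part of the test suite): the reference-comparison pattern used above when
# self.__diffFlag is set, reduced to a self-contained example. The jsondiff call with syntax="explicit"
# and marshal=True is the same call the tests use; the documents here are toy data, not repository output.
def _exampleJsonDiff():
    from jsondiff import diff as jsonDiff

    refDocList = [{"entry_id": "1ABC", "count": 1}]
    docList = [{"entry_id": "1ABC", "count": 2}]
    jD = jsonDiff(refDocList, docList, syntax="explicit", marshal=True)
    # An empty diff dictionary means the prepared documents match the saved reference copy.
    return len(jD) == 0
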
class ScanRepoUtil(object):
    """Tools for scanning repositories and collecting coverage and type data information."""

    def __init__(self, cfgOb, attributeDataTypeD=None, numProc=4, chunkSize=15, fileLimit=None, maxStepLength=2000, workPath=None):
        """
        Args:
            cfgOb (object): Configuration object (ConfigUtil)
            attributeDataTypeD (dict, optional): supporting dictionary of attribute data types
            numProc (int, optional): Number of parallel worker processes used.
            chunkSize (int, optional): Size of files processed in a single multi-proc process
            fileLimit (int, optional): maximum number of files scanned or None for no limit
            maxStepLength (int, optional): maximum number of multi-proc runs to perform
            workPath (str, optional): working/cache path
        """
        #
        self.__attributeDataTypeD = attributeDataTypeD if attributeDataTypeD else {}
        # Limit the load length of each file type for testing - Set to None to remove -
        self.__fileLimit = fileLimit
        self.__maxStepLength = maxStepLength
        #
        # Controls for multiprocessing execution -
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        #
        self.__cfgOb = cfgOb
        #
        self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s"
        self.__workPath = workPath
        self.__mU = MarshalUtil(workPath=self.__workPath)
        self.__rpP = RepositoryProvider(self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__workPath)

    def scanContentType(self, contentType, mergeContentTypes=None, scanType="full", inputPathList=None, scanDataFilePath=None, failedFilePath=None, saveInputFileListPath=None):
        """Driver method for repository scan operation

        Args:
            contentType (str): one of 'bird', 'bird_family', 'bird_chem_comp', 'chem_comp', 'pdbx'
            mergeContentTypes (list, optional): content types to merge with the input content type
            scanType (str, optional): 'full' (or 'incr' to be supported)
            inputPathList (list, optional): list of input file paths to scan
            scanDataFilePath (str, optional): file path for serialized scan data (Pickle format)
            failedFilePath (str, optional): file path for list of files that fail the scanning operation
            saveInputFileListPath (str, optional): path to store the file path list that is scanned

        Returns:
            bool: True for success or False otherwise
        """
        try:
            startTime = self.__begin(message="scanning operation")
            #
            locatorObjList = self.__rpP.getLocatorObjList(contentType=contentType, inputPathList=inputPathList, mergeContentTypes=mergeContentTypes)
            #
            if saveInputFileListPath:
                self.__mU.doExport(saveInputFileListPath, self.__rpP.getLocatorPaths(locatorObjList), fmt="list")
                logger.debug("Saving %d paths in %s", len(locatorObjList), saveInputFileListPath)
            #
            optD = {}
            optD["contentType"] = contentType
            optD["logSize"] = True
            optD["scanType"] = scanType
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            #
            numProc = self.__numProc
            chunkSize = self.__chunkSize if locatorObjList and self.__chunkSize < len(locatorObjList) else 0
            #
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            numPaths = len(locatorObjList)
            logger.debug("Processing %d total paths", numPaths)
            numProc = min(numProc, numPaths)
            maxStepLength = self.__maxStepLength
            if numPaths > maxStepLength:
                numLists = int(numPaths / maxStepLength)
                subLists = [locatorObjList[i::numLists] for i in range(numLists)]
            else:
                subLists = [locatorObjList]
            #
            if subLists:
                logger.debug("Starting with numProc %d outer subtask count %d subtask length ~ %d", numProc, len(subLists), len(subLists[0]))
            #
            numResults = 1
            failList = []
            retLists = [[] for ii in range(numResults)]
            diagList = []
            for ii, subList in enumerate(subLists):
                logger.debug("Running outer subtask %d of %d length %d", ii + 1, len(subLists), len(subList))
                #
                mpu = MultiProcUtil(verbose=True)
                mpu.setOptions(optionsD=optD)
                mpu.set(workerObj=self, workerMethod="scanWorker")
                ok, failListT, retListsT, diagListT = mpu.runMulti(dataList=subList, numProc=numProc, numResults=numResults, chunkSize=chunkSize)
                failList.extend(failListT)
                # retLists is a list of lists -
                for jj in range(numResults):
                    retLists[jj].extend(retListsT[jj])
                diagList.extend(diagListT)
            logger.debug("Scan failed path list %r", failList)
            logger.debug("Scan path list success length %d load list failed length %d", len(locatorObjList), len(failList))
            logger.debug("Returned metadata length %r", len(retLists[0]))
            #
            if failedFilePath and failList:
                wOk = self.__mU.doExport(failedFilePath, self.__rpP.getLocatorPaths(failList), fmt="list")
                logger.debug("Writing scan failure path list to %s status %r", failedFilePath, wOk)
            #
            if scanType == "incr":
                scanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle", default=None)
                logger.debug("Imported scan data with keys %r", list(scanDataD.keys()))
            else:
                scanDataD = {}
            #
            if scanDataFilePath and retLists[0]:
                for ssTup in retLists[0]:
                    cId = ssTup.containerId
                    if scanType == "full" and cId in scanDataD:
                        logger.error("Duplicate container id %s in %r and %r", cId, ssTup.fromPath, scanDataD[cId].fromPath)
                    #
                    scanDataD[cId] = ssTup

                ok = self.__mU.doExport(scanDataFilePath, scanDataD, fmt="pickle")
                tscanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
                ok = tscanDataD == scanDataD

            self.__end(startTime, "scanning operation with status " + str(ok))
            #
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def evalScan(self, scanDataFilePath, evalJsonFilePath, evalType="data_type"):
        scanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
        rD = {}
        if evalType in ["data_type"]:
            rD = self.__evalScanDataType(scanDataD)
        elif evalType in ["data_coverage"]:
            rD = self.__evalScanDataCoverage(scanDataD)
        else:
            logger.debug("Unknown evalType %r", evalType)
        ok = self.__mU.doExport(evalJsonFilePath, rD, fmt="json")
        return ok

    def __evalScanDataType(self, scanDataD):
        """
        ScanValue = collections.namedtuple('ScanValue', 'containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec')
        ScanSummary = collections.namedtuple('ScanSummary', 'containerId, fromPath, scanDate, scanCategoryDict')
        """
        # for populated sD[category] -> d[atName] -> {minWidth:, maxWidth:, minPrec:, maxPrec:, count}
        sD = {}
        for cId in scanDataD:
            ssTup = scanDataD[cId]
            dD = ssTup.scanCategoryDict
            for catName in dD:
                if catName not in sD:
                    sD[catName] = {}
                for svTup in dD[catName]:
                    if svTup.atName not in sD[catName]:
                        sD[catName][svTup.atName] = {"minWidth": svTup.minWidth, "maxWidth": svTup.maxWidth, "minPrec": svTup.minPrec, "maxPrec": svTup.maxPrec, "count": 1}
                        continue
                    sD[catName][svTup.atName]["minWidth"] = min(sD[catName][svTup.atName]["minWidth"], svTup.minWidth)
                    sD[catName][svTup.atName]["maxWidth"] = max(sD[catName][svTup.atName]["maxWidth"], svTup.maxWidth)
                    sD[catName][svTup.atName]["minPrec"] = min(sD[catName][svTup.atName]["minPrec"], svTup.minPrec)
                    sD[catName][svTup.atName]["maxPrec"] = max(sD[catName][svTup.atName]["maxPrec"], svTup.maxPrec)
                    sD[catName][svTup.atName]["count"] += 1
        return sD

    def __evalScanDataCoverage(self, scanDataD):
        """
        ScanValue = collections.namedtuple('ScanValue', 'containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec')
        ScanSummary = collections.namedtuple('ScanSummary', 'containerId, fromPath, scanDate, scanCategoryDict')
        """
        # for populated sD[category] -> d[atName] -> {count: #, instances: [id, id, id]}
        sD = {}
        for cId in scanDataD:
            ssTup = scanDataD[cId]
            dD = ssTup.scanCategoryDict
            for catName in dD:
                if catName not in sD:
                    sD[catName] = {}
                for svTup in dD[catName]:
                    if svTup.atName not in sD[catName]:
                        sD[catName][svTup.atName] = {"count": 0, "instances": []}
                    sD[catName][svTup.atName]["instances"].append(svTup.containerId)
                    sD[catName][svTup.atName]["count"] += 1
        return sD

    def scanWorker(self, dataList, procName, optionsD, workingDir):
        """Multi-proc worker method for scanning repository data files."""
        try:
            _ = workingDir
            startTime = self.__begin(message=procName)
            # Recover common options
            scanType = optionsD["scanType"]
            contentType = optionsD["contentType"]
            #
            successList = []
            retList = []

            containerList = self.__getContainerList(dataList)
            for container in containerList:
                ret = self.__scanContainer(container)
                successList.append(ret.fromPath)
                retList.append(ret)
            #
            logger.debug("%s scanType %s contentType %s pathlist length %d containerList length %d", procName, scanType, contentType, len(dataList), len(containerList))
            ok = len(successList) == len(dataList)
            #
            self.__end(startTime, procName + " with status " + str(ok))
            return successList, retList, []
        except Exception as e:
            logger.error("Failing with dataList %r", dataList)
            logger.exception("Failing with %s", str(e))
        return [], [], []

    def __getContainerList(self, locatorObjList):
        """Return the container list for the input locator object list, appending a rcsb_load_status category to each container."""
        utcnow = datetime.datetime.utcnow()
        ts = utcnow.strftime("%Y-%m-%d:%H:%M:%S")

        cL = []
        for loc in locatorObjList:
            myContainerList = self.__rpP.getContainerList([loc])
            lPathL = self.__rpP.getLocatorPaths([loc])
            for cA in myContainerList:
                dc = DataCategory("rcsb_load_status", ["name", "load_date", "locator"], [[cA.getName(), ts, lPathL[0]]])
                logger.debug("data category %r", dc)
                cA.append(dc)
                cL.append(cA)
        return cL

    def __scanContainer(self, container):
        """Scan the categories and attributes of the input container and return a scan summary."""
        cName = container.getName()
        loadStatusObj = container.getObj("rcsb_load_status")
        lName = loadStatusObj.getValue(attributeName="name", rowIndex=0)
        lFilePath = loadStatusObj.getValue(attributeName="locator", rowIndex=0)
        lDate = loadStatusObj.getValue(attributeName="load_date", rowIndex=0)
        #
        oD = {}
        for objName in container.getObjNameList():
            if objName == "rcsb_load_status":
                continue
            obj = container.getObj(objName)
            afD = self.__attributeDataTypeD[objName] if objName in self.__attributeDataTypeD else {}
            atNameList = obj.getAttributeList()
            wMin = {atName: 100000 for atName in atNameList}
            wMax = {atName: -1 for atName in atNameList}
            pMin = {atName: 100000 for atName in atNameList}
            pMax = {atName: -1 for atName in atNameList}
            for row in obj.getRowList():
                for ii, val in enumerate(row):
                    valLen = len(val)
                    if (valLen == 0) or (val == "?") or (val == "."):
                        continue
                    atName = atNameList[ii]
                    wMin[atName] = min(wMin[atName], valLen)
                    wMax[atName] = max(wMax[atName], valLen)
                    if atName in afD and afD[atName] == "float":
                        vPrec = 0
                        try:
                            fields = val.split(".")
                            vPrec = len(fields[1])
                            pMin[atName] = min(pMin[atName], vPrec)
                            pMax[atName] = max(pMax[atName], vPrec)
                        except Exception as e:
                            logger.debug("Failed to process float %s %r %r %s", atName, val, vPrec, str(e))
                            pMin[atName] = 0
                            pMax[atName] = 0
                        logger.debug("Got float for %s %r %r", atName, val, vPrec)
                    else:
                        pMin[atName] = 0
                        pMax[atName] = 0
            # ScanValue - containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec
            oD[objName] = [ScanValue(cName, objName, atN, wMin[atN], wMax[atN], pMin[atN], pMax[atN]) for atN in wMax if wMax[atN] != -1]
        # ScanSummary - containerId, fromPath, scanDate, scanCategoryDict
        #
        ret = ScanSummary(lName, lFilePath, lDate, oD)
        #
        return ret

    def __begin(self, message=""):
        startTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", message, ts)
        return startTime

    def __end(self, startTime, message=""):
        endTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        delta = endTime - startTime
        logger.debug("Completed %s at %s (%.4f seconds)", message, ts, delta)
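
# Illustrative usage sketch for ScanRepoUtil (not executed here). The constructor and method
# signatures match the class above; the configuration object, content type, work path, and output
# file names are hypothetical placeholders supplied by the caller.
def _exampleScanRepo(cfgOb, workPath="./CACHE"):
    su = ScanRepoUtil(cfgOb, attributeDataTypeD={}, numProc=2, chunkSize=10, fileLimit=10, workPath=workPath)
    scanDataFilePath = workPath + "/scan-chem_comp-data.pic"
    ok = su.scanContentType(
        "chem_comp",
        scanType="full",
        scanDataFilePath=scanDataFilePath,
        failedFilePath=workPath + "/scan-chem_comp-failed.list",
        saveInputFileListPath=workPath + "/scan-chem_comp-paths.list",
    )
    if ok:
        # Summarize per-attribute coverage from the pickled scan data as JSON.
        ok = su.evalScan(scanDataFilePath, workPath + "/scan-chem_comp-coverage.json", evalType="data_coverage")
    return ok
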
class SchemaDefLoader(object):
    """Map PDBx/mmCIF instance data to SQL loadable data using an external schema definition."""

    def __init__(self, cfgOb, schemaDefObj, cfgSectionName="site_info_configuration", dbCon=None, cachePath=".", workPath=".", cleanUp=False, warnings="default", verbose=True):
        self.__verbose = verbose
        self.__debug = False
        self.__cfgOb = cfgOb
        sectionName = cfgSectionName
        self.__sD = schemaDefObj
        #
        self.__dbCon = dbCon
        self.__cachePath = cachePath
        self.__workPath = workPath
        self.__pathList = []
        self.__cleanUp = cleanUp
        #
        self.__colSep = "&##&\t"
        self.__rowSep = "$##$\n"
        #
        self.__fTypeRow = "skip-max-width"
        self.__fTypeCol = "skip-max-width"
        #
        self.__warningAction = warnings
        dtf = DataTransformFactory(schemaDefAccessObj=self.__sD, filterType=self.__fTypeRow)
        self.__sdp = SchemaDefDataPrep(schemaDefAccessObj=self.__sD, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, cachePath=self.__cachePath)
        #
        schemaName = self.__sD.getName()
        modulePathMap = self.__cfgOb.get("DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=sectionName)
        dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=True)
        dictApi = dP.getApiByName(schemaName)
        rP = DictMethodResourceProvider(self.__cfgOb, cachePath=self.__cachePath)
        self.__dmh = DictMethodRunner(dictApi, modulePathMap=modulePathMap, resourceProvider=rP)

    def setWarning(self, action):
        if action in ["error", "ignore", "default"]:
            self.__warningAction = action
            return True
        else:
            self.__warningAction = "default"
            return False

    def setDelimiters(self, colSep=None, rowSep=None):
        """Set column and row delimiters for intermediate data files used for batch-file loading operations."""
        self.__colSep = colSep if colSep is not None else "&##&\t"
        self.__rowSep = rowSep if rowSep is not None else "$##$\n"
        return True

    def load(self, inputPathList=None, containerList=None, loadType="batch-file", deleteOpt=None, tableIdSkipD=None):
        """Load data for each table defined in the current schema definition object.

        Data are extracted from the input file list or the input container list.

        Data source options:

            inputPathList = [<full path of target input file>, ....]

              or

            containerList = [data container, ...]

            loadType  = 'batch-file' | 'batch-insert'
            deleteOpt = 'selected' | 'all'

            tableIdSkipD - searchable container with tableIds to be skipped on loading -

        Loading is performed using the current database server connection.

        Intermediate data files for 'batch-file' loading are created in the current working path.

        Returns True for success or False otherwise.
        """
        tableIdSkipD = tableIdSkipD if tableIdSkipD is not None else {}
        if inputPathList is not None:
            cL = self.__rpP.getContainerList(inputPathList)
            #
            # Apply dynamic methods here -
            #
            for cA in cL:
                self.__dmh.apply(cA)
            tableDataDict, containerNameList = self.__sdp.process(cL)
        elif containerList is not None:
            tableDataDict, containerNameList = self.__sdp.process(containerList)
        else:
            return False
        #
        if loadType in ["batch-file", "batch-file-append"]:
            append = True if loadType == "batch-file-append" else False
            exportList = self.__exportTdd(tableDataDict, colSep=self.__colSep, rowSep=self.__rowSep, append=append)
            for tableId, loadPath in exportList:
                if tableId in tableIdSkipD:
                    continue
                self.__batchFileImport(tableId, loadPath, sqlFilePath=None, containerNameList=containerNameList, deleteOpt=deleteOpt)
                if self.__cleanUp:
                    self.__cleanUpFile(loadPath)
            return True
        elif loadType == "batch-insert":
            for tableId, rowList in tableDataDict.items():
                if tableId in tableIdSkipD:
                    continue
                if deleteOpt in ["all", "selected"] or rowList:
                    self.__batchInsertImport(tableId, rowList=rowList, containerNameList=containerNameList, deleteOpt=deleteOpt)
            return True
        else:
            pass

        return False

    def __cleanUpFile(self, filePath):
        try:
            os.remove(filePath)
        except Exception:
            pass

    def makeLoadFilesMulti(self, dataList, procName, optionsD, workingDir):
        """Create a loadable data file for each table defined in the current schema definition object.

        Data are extracted from the input file list and load files are created in the current working path.

        Returns the input path list, the container names for the input path list, and the path list for the load files that are created.
        """
        _ = workingDir
        try:
            pn = procName.split("-")[-1]
        except Exception:
            pn = procName

        exportFormat = optionsD["exportFormat"] if "exportFormat" in optionsD else "tdd"
        r1, r2 = self.makeLoadFiles(inputPathList=dataList, partName=pn, exportFormat=exportFormat)
        return dataList, r1, r2, []

    def makeLoadFiles(self, inputPathList, append=False, partName="1", exportFormat="tdd"):
        """Create a loadable data file for each table defined in the current schema definition object.

        Data are extracted from the input file list and load files are created in the current working path.

        Returns the container names for the input path list, and the path list for the load files that are created.
        """
        cL = self.__rpP.getContainerList(inputPathList)
        for cA in cL:
            self.__dmh.apply(cA)
        tableDataDict, containerNameList = self.__sdp.process(cL)
        if exportFormat == "tdd":
            return containerNameList, self.__exportTdd(tableDataDict, colSep=self.__colSep, rowSep=self.__rowSep, append=append, partName=partName)
        elif exportFormat == "csv":
            return containerNameList, self.__exportCsv(tableDataDict, append=append, partName=partName)
        else:
            return [], []

    def __exportCsv(self, tableDict, append=False, partName="1"):
        """Export the input table dictionary in CSV format returning a list of (tableId, file path) tuples."""
        modeOpt = "a" if append else "w"
        exportList = []
        for tableId, rowList in tableDict.items():
            if not rowList:
                continue
            tObj = self.__sD.getSchemaObject(tableId)
            schemaAttributeIdList = tObj.getAttributeIdList()
            attributeNameList = tObj.getAttributeNameList()
            #
            fn = os.path.join(self.__workPath, tableId + "-" + partName + ".csv")
            with open(fn, modeOpt, newline="") as ofh:
                csvWriter = csv.writer(ofh)
                csvWriter.writerow(attributeNameList)
                for rD in rowList:
                    csvWriter.writerow([rD[aId] for aId in schemaAttributeIdList])
            exportList.append((tableId, fn))
        return exportList

    def __exportTdd(self, tableDict, colSep="&##&\t", rowSep="$##$\n", append=False, partName="1"):
        """Export the input table dictionary in delimited text format returning a list of (tableId, file path) tuples."""
        modeOpt = "a" if append else "w"
        exportList = []
        for tableId, rowList in tableDict.items():
            tObj = self.__sD.getSchemaObject(tableId)
            schemaAttributeIdList = tObj.getAttributeIdList()
            #
            if rowList:
                fn = os.path.join(self.__workPath, tableId + "-" + partName + ".tdd")
                with open(fn, modeOpt) as ofh:
                    for rD in rowList:
                        # logger.info("%r", colSep.join([str(rD[aId]) for aId in schemaAttributeIdList]))
                        ofh.write("%s%s" % (colSep.join([str(rD[aId]) for aId in schemaAttributeIdList]), rowSep))
                exportList.append((tableId, fn))
        return exportList

    def loadBatchFiles(self, loadList=None, containerNameList=None, deleteOpt=None):
        """Load data for each table defined in the current schema definition object using the input load file list.

        Data source options:

            loadList = [(tableId, <full path of load file>), ....]
            containerNameList = [data container name, ...]
            deleteOpt = 'selected' | 'all' | 'truncate'

        Loading is performed using the current database server connection.

        Returns True for success or False otherwise.
        """
        #
        startTime = time.time()
        ok = False
        for tableId, loadPath in loadList:
            ok = self.__batchFileImport(tableId, loadPath, sqlFilePath=None, containerNameList=containerNameList, deleteOpt=deleteOpt)
            if not ok:
                break
            if self.__cleanUp:
                self.__cleanUpFile(loadPath)
        #
        endTime = time.time()
        logger.debug("Completed with status %r at %s (%.3f seconds)\n", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime)
        return ok

    def delete(self, tableId, containerNameList=None, deleteOpt="all"):
        #
        startTime = time.time()
        sqlCommandList = self.__getSqlDeleteList(tableId, containerNameList=containerNameList, deleteOpt=deleteOpt)

        myQ = MyDbQuery(dbcon=self.__dbCon, verbose=self.__verbose)
        myQ.setWarning(self.__warningAction)
        ret = myQ.sqlCommand(sqlCommandList=sqlCommandList)
        #
        #
        endTime = time.time()
        logger.debug("Delete table %s server returns %r\n", tableId, ret)
        logger.debug("Completed at %s (%.3f seconds)\n", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime)
        return ret

    def __getSqlDeleteList(self, tableId, containerNameList=None, deleteOpt="all"):
        """Return the SQL delete commands for the input table and container name list."""
        sqlGen = SqlGenAdmin(self.__verbose)
        databaseName = self.__sD.getDatabaseName()
        tableDefObj = self.__sD.getSchemaObject(tableId)
        tableName = tableDefObj.getName()

        sqlDeleteList = []
        if deleteOpt in ["selected", "delete"] and containerNameList is not None:
            deleteAttributeName = tableDefObj.getDeleteAttributeName()
            sqlDeleteList = sqlGen.deleteFromListSQL(databaseName, tableName, deleteAttributeName, containerNameList, chunkSize=50)
        elif deleteOpt in ["all", "truncate"]:
            sqlDeleteList = [sqlGen.truncateTableSQL(databaseName, tableName)]

        logger.debug("Delete SQL for %s : %r\n", tableId, sqlDeleteList)
        return sqlDeleteList

    def __batchFileImport(self, tableId, tableLoadPath, sqlFilePath=None, containerNameList=None, deleteOpt="all"):
        """Batch load the input table using data in the input loadable data file.

        If sqlFilePath is provided then any generated SQL commands are preserved in this file.

        deleteOpt = None | 'selected' | 'all' | 'truncate'
        """
        startTime = time.time()
        sqlGen = SqlGenAdmin(self.__verbose)
        databaseName = self.__sD.getDatabaseName()
        tableDefObj = self.__sD.getSchemaObject(tableId)
        # tableName = tableDefObj.getName()
        #
        if deleteOpt:
            sqlCommandList = self.__getSqlDeleteList(tableId, containerNameList=containerNameList, deleteOpt=deleteOpt)
        else:
            sqlCommandList = []

        if os.access(tableLoadPath, os.R_OK):
            tableDefObj = self.__sD.getSchemaObject(tableId)
            sqlCommandList.append(sqlGen.importTable(databaseName, tableDefObj, importPath=tableLoadPath))

            if self.__verbose:
                logger.debug("SQL import command\n%s\n", sqlCommandList)
            #

        if sqlFilePath is not None:
            try:
                with open(sqlFilePath, "w") as ofh:
                    ofh.write("%s" % "\n".join(sqlCommandList))
            except Exception:
                pass
        #
        myQ = MyDbQuery(dbcon=self.__dbCon, verbose=self.__verbose)
        myQ.setWarning(self.__warningAction)
        ret = myQ.sqlCommand(sqlCommandList=sqlCommandList)
        #
        #
        endTime = time.time()
        logger.debug("Table %s server returns %r\n", tableId, ret)
        logger.debug("Completed at %s (%.3f seconds)\n", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime)
        return ret

    def loadBatchData(self, tableId, rowList=None, containerNameList=None, deleteOpt="selected"):
        return self.__batchInsertImport(tableId, rowList=rowList, containerNameList=containerNameList, deleteOpt=deleteOpt)

    def __batchInsertImport(self, tableId, rowList=None, containerNameList=None, deleteOpt="selected"):
        """Load the input table using batch inserts of the input list of dictionaries (i.e. d[attributeId] = value).

        The containerNameList corresponding to the data within rowList can be provided if 'selected'
        deletions are to be performed prior to the batch data inserts.

        deleteOpt = 'selected' | 'all' where 'selected' deletes rows corresponding to the input container
        list before insert, and 'all' truncates the table prior to insert.

        Deletions are performed in the absence of loadable data.
        """
        startTime = time.time()
        myQ = MyDbQuery(dbcon=self.__dbCon, verbose=self.__verbose)
        myQ.setWarning(self.__warningAction)
        sqlGen = SqlGenAdmin(self.__verbose)
        #
        databaseName = self.__sD.getDatabaseName()
        tableDefObj = self.__sD.getSchemaObject(tableId)
        tableName = tableDefObj.getName()
        tableAttributeIdList = tableDefObj.getAttributeIdList()
        tableAttributeNameList = tableDefObj.getAttributeNameList()
        #
        sqlDeleteList = None
        if deleteOpt in ["selected", "delete"] and containerNameList is not None:
            deleteAttributeName = tableDefObj.getDeleteAttributeName()
            sqlDeleteList = sqlGen.deleteFromListSQL(databaseName, tableName, deleteAttributeName, containerNameList, chunkSize=10)
            if self.__verbose:
                logger.debug("Delete SQL for %s : %r\n", tableId, sqlDeleteList)
        elif deleteOpt in ["all", "truncate"]:
            sqlDeleteList = [sqlGen.truncateTableSQL(databaseName, tableName)]

        sqlInsertList = []
        for row in rowList:
            vList = []
            aList = []
            for tid, nm in zip(tableAttributeIdList, tableAttributeNameList):
                # if len(row[tid]) > 0 and row[tid] != r'\N':
                if row[tid] is not None and row[tid] != r"\N":
                    vList.append(row[tid])
                    aList.append(nm)
            sqlInsertList.append((sqlGen.insertTemplateSQL(databaseName, tableName, aList), vList))

        ret = myQ.sqlBatchTemplateCommand(sqlInsertList, prependSqlList=sqlDeleteList)
        if ret:
            logger.debug("Batch insert completed for table %s rows %d\n", tableName, len(sqlInsertList))
        else:
            logger.error("Batch insert fails for table %s length %d\n", tableName, len(sqlInsertList))

        endTime = time.time()
        if self.__verbose:
            logger.debug("Completed at %s (%.3f seconds)\n", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime)

        return ret

    def __deleteFromTable(self, tableIdList, deleteValue):
        """Delete data from the input table list where the schema table delete attribute has the input value 'deleteValue'."""
        databaseName = self.__sD.getDatabaseName()
        sqlList = []
        sqlGen = SqlGenAdmin(self.__verbose)
        for tableId in tableIdList:
            tableName = self.__sD.getSchemaName(tableId)
            tableDefObj = self.__sD.getSchemaObject(tableId)
            atName = tableDefObj.getDeleteAttributeName()
            sqlTemp = sqlGen.deleteTemplateSQL(databaseName, tableName, [atName])
            sqlList.append(sqlTemp % deleteValue)
        #
        return sqlList