import logging
import os
import time
import unittest

from jsonschema import Draft4Validator, FormatChecker

# Module paths below are assumed from the rcsb.db / rcsb.exdb / rcsb.utils package layout.
from rcsb.db.mongo.Connection import Connection
from rcsb.db.mongo.MongoDbUtil import MongoDbUtil
from rcsb.db.processors.DataExchangeStatus import DataExchangeStatus
from rcsb.db.utils.SchemaProvider import SchemaProvider
from rcsb.db.helpers.DocumentDefinitionHelper import DocumentDefinitionHelper
from rcsb.exdb.seq.ReferenceSequenceAssignmentProvider import ReferenceSequenceAssignmentProvider
from rcsb.utils.config.ConfigUtil import ConfigUtil
from rcsb.utils.multiproc.MultiProcUtil import MultiProcUtil

logger = logging.getLogger(__name__)

# TOPDIR is assumed to resolve to the repository root containing the mock-data and config paths used in the tests below.
HERE = os.path.abspath(os.path.dirname(__file__))
TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))


class UniProtCoreEtlWorker(object):
    """Prepare and load UniProt 'core' sequence reference data collections."""

    def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, readBackCheck=False, documentLimit=None, doValidate=False, verbose=False):
        self.__cfgOb = cfgOb
        self.__cachePath = cachePath
        self.__useCache = useCache
        self.__readBackCheck = readBackCheck
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        self.__documentLimit = documentLimit
        #
        self.__resourceName = "MONGO_DB"
        self.__verbose = verbose
        self.__statusList = []
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=self.__useCache)
        self.__docHelper = DocumentDefinitionHelper(cfgOb=self.__cfgOb)
        self.__valInst = None
        self.__doValidate = doValidate
        #

    def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def __getReferenceSequenceProvider(self):
        """Return a (status, provider) pair for the current UniProt reference sequence assignments."""
        try:
            rsaP = ReferenceSequenceAssignmentProvider(
                self.__cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_polymer_entity",
                polymerType="Protein",
                referenceDatabaseName="UniProt",
                provSource="PDB",
                useCache=self.__useCache,
                cachePath=self.__cachePath,
                fetchLimit=self.__documentLimit,
                siftsAbbreviated="TEST",
            )
            ok = rsaP.testCache()
            return ok, rsaP
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        # Return a pair on failure so callers can safely unpack the result -
        return False, None

    def load(self, updateId, extResource, loadType="full"):
        """Load sequence reference data for the input external resource (currently only "UniProt")."""
        try:
            self.__statusList = []
            desp = DataExchangeStatus()
            statusStartTimestamp = desp.setStartTime()
            #
            dList = []
            indexL = []
            databaseName = collectionName = collectionVersion = None
            addValues = None
            #
            if extResource == "UniProt":
                databaseName = "uniprot_core"
                # configName = self.__cfgOb.getDefaultSectionName()
                # dirPath = os.path.join(self.__cachePath, self.__cfgOb.get("EXDB_CACHE_DIR", self.__cfgOb.getDefaultSectionName()))
                #
                ok, rsP = self.__getReferenceSequenceProvider()
                if not ok:
                    return False
                #
                dList = rsP.getDocuments()
                logger.info("Resource %r extracted mapped document length %d", extResource, len(dList))
                logger.debug("Objects %r", dList[:2])
                #
                cDL = self.__docHelper.getCollectionInfo(databaseName)
                collectionName = cDL[0]["NAME"]
                collectionVersion = cDL[0]["VERSION"]
                indexL = self.__docHelper.getDocumentIndexAttributes(collectionName, "primary")
                logger.info("Database %r collection %r version %r index attributes %r", databaseName, collectionName, collectionVersion, indexL)
                addValues = {}
            else:
                logger.error("Unsupported external resource %r", extResource)
                return False
            #
            if self.__doValidate:
                self.__valInst = self.__getValidator(databaseName, collectionName, schemaLevel="full")
                for dObj in dList:
                    self.__validateObj(databaseName, collectionName, dObj, label="Original")
            #
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=dList, indexAttributeList=indexL, keyNames=None, addValues=addValues)
            okS = self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
            return ok and okS
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def getLoadStatus(self):
        return self.__statusList

    def __getValidator(self, databaseName, collectionName, schemaLevel="full"):
        # _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
        # cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True)
        logger.info("Fetch schema for %r %r validation level %r", databaseName, collectionName, schemaLevel)
        cD = self.__schP.getJsonSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel)
        # Raises exceptions for schema compliance.
        Draft4Validator.check_schema(cD)
        valInst = Draft4Validator(cD, format_checker=FormatChecker())
        return valInst

    def __validateObj(self, databaseName, collectionName, rObj, label=""):
        eCount = 0
        try:
            tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
            for error in sorted(self.__valInst.iter_errors(rObj), key=str):
                logger.info("Database %s collection %s (%s %r) path %s error: %s", databaseName, collectionName, label, tId, error.path, error.message)
                logger.debug(">>> Failing object is %r", rObj)
                if "rcsb_uniprot_feature" in rObj:
                    for dd in rObj["rcsb_uniprot_feature"]:
                        if "feature_id" in dd:
                            logger.info("feature_id %r", dd["feature_id"])
                        else:
                            logger.info("no feature_id keys %r", sorted(dd.keys()))
                            # description may be absent as well - use get() to avoid a KeyError here
                            logger.info("description %r", dd.get("description"))
                eCount += 1
        except Exception as e:
            logger.exception("Validation failing %s", str(e))
        return eCount
class SchemaProviderTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__cfgOb = ConfigUtil(configPath=pathConfig, defaultSectionName=configName, mockTopPath=mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=False)
        #
        self.__validationLevels = self.__cfgOb.getList("VALIDATION_LEVELS_TEST", sectionName="database_catalog_configuration")
        self.__encodingTypes = self.__cfgOb.getList("ENCODING_TYPES_TEST", sectionName="database_catalog_configuration")
        #
        buildAll = True
        if buildAll:
            self.__databaseNameList = self.__cfgOb.getList("DATABASE_NAMES_DEPLOYED", sectionName="database_catalog_configuration")
            self.__dataTypingList = self.__cfgOb.getList("DATATYPING_DEPLOYED", sectionName="database_catalog_configuration")
            #
        else:
            self.__databaseNameList = self.__cfgOb.getList("DATABASE_NAMES_TEST", sectionName="database_catalog_configuration")
            self.__dataTypingList = self.__cfgOb.getList("DATATYPING_TEST", sectionName="database_catalog_configuration")
        #
        self.__docHelper = DocumentDefinitionHelper(cfgOb=self.__cfgOb)
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testSchemaAccessDefault(self):
        for databaseName in self.__databaseNameList:
            cDL = self.__docHelper.getCollectionInfo(databaseName)
            for cD in cDL:
                collectionName = cD["NAME"]
                for encodingType in self.__encodingTypes:
                    if encodingType.lower() == "rcsb":
                        continue
                    for level in self.__validationLevels:
                        logger.debug("Loading ->%s %s %s %s", databaseName, collectionName, encodingType, level)
                        sD = self.__schP.getJsonSchema(databaseName, collectionName, encodingType=encodingType, level=level)
                        self.assertTrue(sD is not None)
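# A minimal runner sketch for the test case above, using only standard unittest
# API; the suite function name is an illustrative assumption.
def schemaAccessSuite():
    suiteSelect = unittest.TestSuite()
    suiteSelect.addTest(SchemaProviderTests("testSchemaAccessDefault"))
    return suiteSelect
# Run directly with: unittest.TextTestRunner(verbosity=2).run(schemaAccessSuite())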
class DocumentLoader(object):
    def __init__(
        self,
        cfgOb,
        cachePath,
        resourceName="MONGO_DB",
        numProc=4,
        chunkSize=15,
        documentLimit=None,
        verbose=False,
        readBackCheck=False,
        maxStepLength=2000,
        schemaRebuildFlag=False,
    ):
        self.__verbose = verbose
        #
        # Limit the load length of each file type for testing - Set to None to remove -
        self.__documentLimit = documentLimit
        self.__maxStepLength = maxStepLength
        #
        # Controls for multiprocessing execution -
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        #
        self.__cfgOb = cfgOb
        self.__resourceName = resourceName
        #
        self.__cachePath = cachePath if cachePath else "."
        self.__schP = SchemaProvider(cfgOb, cachePath, useCache=True, rebuildFlag=schemaRebuildFlag)
        #
        self.__readBackCheck = readBackCheck
        self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s"
        #

    def load(self, databaseName, collectionName, loadType="full", documentList=None, indexAttributeList=None, keyNames=None, schemaLevel="full", addValues=None):
        """Driver method for loading MongoDb content -

        loadType: "full", "append", or "replace"
        """
        try:
            startTime = self.__begin(message="loading operation")
            #
            optionsD = {}
            optionsD["collectionName"] = collectionName
            optionsD["databaseName"] = databaseName
            optionsD["readBackCheck"] = self.__readBackCheck
            optionsD["loadType"] = loadType
            optionsD["keyNames"] = keyNames
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            docList = documentList[: self.__documentLimit] if self.__documentLimit else documentList
            logger.debug("Full document list length %d limit %r", len(documentList), self.__documentLimit)
            numProc = self.__numProc
            chunkSize = self.__chunkSize if docList and self.__chunkSize < len(docList) else 0
            #
            if addValues:
                try:
                    for doc in docList:
                        for k, v in addValues.items():
                            doc[k] = v
                except Exception as e:
                    logger.error("Add values %r fails with %s", addValues, str(e))
            #
            indAtList = indexAttributeList if indexAttributeList else []
            bsonSchema = None
            if schemaLevel and schemaLevel in ["min", "full"]:
                bsonSchema = self.__schP.getJsonSchema(databaseName, collectionName, encodingType="BSON", level=schemaLevel)
                logger.debug("Using schema validation for %r %r %r", databaseName, collectionName, schemaLevel)

            if loadType == "full":
                self.__removeCollection(databaseName, collectionName)
                ok = self.__createCollection(databaseName, collectionName, indAtList, bsonSchema=bsonSchema)
                logger.info("Collection %s create status %r", collectionName, ok)
            elif loadType == "append":
                # create only if object does not exist -
                ok = self.__createCollection(databaseName, collectionName, indexAttributeNames=indAtList, checkExists=True, bsonSchema=bsonSchema)
                logger.debug("Collection %s create status %r", collectionName, ok)
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            numDocs = len(docList)
            logger.debug("Processing %d total documents", numDocs)
            numProc = min(numProc, numDocs)
            maxStepLength = self.__maxStepLength
            if numDocs > maxStepLength:
                numLists = int(numDocs / maxStepLength)
                subLists = [docList[i::numLists] for i in range(numLists)]
            else:
                subLists = [docList]
            #
            if subLists:
                logger.debug("Starting with numProc %d outer subtask count %d subtask length ~ %d", numProc, len(subLists), len(subLists[0]))
            #
            failList = []
            for ii, subList in enumerate(subLists):
                logger.debug("Running outer subtask %d of %d length %d", ii + 1, len(subLists), len(subList))
                #
                mpu = MultiProcUtil(verbose=True)
                mpu.setOptions(optionsD=optionsD)
                mpu.set(workerObj=self, workerMethod="loadWorker")
                ok, failListT, _, _ = mpu.runMulti(dataList=subList, numProc=numProc, numResults=1, chunkSize=chunkSize)
                failList.extend(failListT)
            logger.debug("Completed load with failing document list %r", failList)
            logger.debug("Document list length %d failed load list length %d", len(docList), len(failList))
            #
            self.__end(startTime, "loading operation with status " + str(ok))
            #
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def loadWorker(self, dataList, procName, optionsD, workingDir):
        """Multi-proc worker method for MongoDb document loading -"""
        try:
            startTime = self.__begin(message=procName)
            readBackCheck = optionsD["readBackCheck"]
            loadType = optionsD["loadType"]
            collectionName = optionsD["collectionName"]
            databaseName = optionsD["databaseName"]
            keyNames = optionsD["keyNames"]
            #
            logger.debug("%s databaseName %s collectionName %s workingDir %s", procName, databaseName, collectionName, workingDir)
            #
            # Default results for an empty input list -
            ok = True
            successList = []
            failedList = []
            if dataList:
                ok, successList, failedList = self.__loadDocuments(databaseName, collectionName, dataList, loadType=loadType, readBackCheck=readBackCheck, keyNames=keyNames)
            #
            logger.debug(
                "%s database %s collection %s inputList length %d successList length %d failed %d",
                procName,
                databaseName,
                collectionName,
                len(dataList),
                len(successList),
                len(failedList),
            )
            #
            self.__end(startTime, procName + " with status " + str(ok))
            return successList, [], []
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return [], [], []

    # -------------- -------------- -------------- -------------- -------------- -------------- --------------
    # --- Supporting code follows ---
    #
    def __begin(self, message=""):
        startTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", message, ts)
        return startTime

    def __end(self, startTime, message=""):
        endTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        delta = endTime - startTime
        logger.debug("Completed %s at %s (%.4f seconds)", message, ts, delta)

    def __createCollection(self, dbName, collectionName, indexAttributeNames=None, checkExists=False, bsonSchema=None):
        """Create database and collection and optionally a primary index -"""
        try:
            logger.debug("Create database %s collection %s", dbName, collectionName)
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if checkExists and mg.databaseExists(dbName) and mg.collectionExists(dbName, collectionName):
                    ok1 = True
                else:
                    ok1 = mg.createCollection(dbName, collectionName, bsonSchema=bsonSchema)
                ok2 = mg.databaseExists(dbName)
                ok3 = mg.collectionExists(dbName, collectionName)
                okI = True
                if indexAttributeNames:
                    okI = mg.createIndex(dbName, collectionName, indexAttributeNames, indexName="primary", indexType="DESCENDING", uniqueFlag=False)
            return ok1 and ok2 and ok3 and okI
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def __removeCollection(self, dbName, collectionName):
        """Drop collection within database"""
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                #
                logger.debug("Remove collection database %s collection %s", dbName, collectionName)
                logger.debug("Starting databases = %r", mg.getDatabaseNames())
                logger.debug("Starting collections = %r", mg.getCollectionNames(dbName))
                ok = mg.dropCollection(dbName, collectionName)
                logger.debug("Databases = %r", mg.getDatabaseNames())
                logger.debug("Post drop collections = %r", mg.getCollectionNames(dbName))
                # Success means the collection is gone after the drop -
                ok = ok and not mg.collectionExists(dbName, collectionName)
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def __loadDocuments(self, dbName, collectionName, docList, loadType="full", readBackCheck=False, keyNames=None):
        #
        # Load database/collection with input document list -
        #
        failList = []
        rIdL = []
        successList = []
        logger.debug("Loading dbName %s collectionName %s with document count %d keynames %r", dbName, collectionName, len(docList), keyNames)
        if keyNames:
            # map the document list to some document key if this is provided
            indD = {}
            indL = []
            try:
                for ii, doc in enumerate(docList):
                    dIdTup = self.__getKeyValues(doc, keyNames)
                    indD[dIdTup] = ii
                indL = list(range(len(docList)))
            except Exception as e:
                logger.exception("Failing ii %d d %r with %s", ii, doc, str(e))

        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                #
                if loadType == "replace" and keyNames:
                    dTupL = mg.deleteList(dbName, collectionName, docList, keyNames)
                    logger.debug("Deleted document status %r", (dTupL,))
                #
                rIdL = mg.insertList(dbName, collectionName, docList, keyNames=keyNames)
                logger.debug("Insert returns rIdL length %r", len(rIdL))
                # ---
                #  If there is a failure then determine the specific successes and failures -
                #
                successList = docList
                failList = []
                if len(rIdL) != len(docList):
                    if keyNames:
                        successIndList = []
                        for rId in rIdL:
                            rObj = mg.fetchOne(dbName, collectionName, "_id", rId)
                            dIdTup = self.__getKeyValues(rObj, keyNames)
                            successIndList.append(indD[dIdTup])
                        failIndList = list(set(indL) - set(successIndList))
                        failList = [docList[ii] for ii in failIndList]
                        successList = [docList[ii] for ii in successIndList]
                    else:
                        # fail the whole batch if we don't have visibility into each document
                        failList = docList
                        successList = []
                #
                rbStatus = True
                if readBackCheck and keyNames:
                    #
                    # Note that objects in docList are mutated by the insert operation with the additional key '_id',
                    # hence, it is possible to compare the fetched object with the input object.
                    #
                    for ii, rId in enumerate(rIdL):
                        rObj = mg.fetchOne(dbName, collectionName, "_id", rId)
                        dIdTup = self.__getKeyValues(rObj, keyNames)
                        jj = indD[dIdTup]
                        if rObj != docList[jj]:
                            rbStatus = False
                            break
                #
                if readBackCheck and not rbStatus:
                    return False, successList, failList
                #
            return len(rIdL) == len(docList), successList, failList
        except Exception as e:
            logger.exception("Failing %r %r (len=%d) %s with %s", dbName, collectionName, len(docList), keyNames, str(e))
        return False, [], docList

    def __getKeyValues(self, dct, keyNames):
        """Return the tuple of values corresponding to the input dictionary key names expressed in dot notation.

        Args:
            dct (dict): source dictionary object (nested)
            keyNames (list): list of dictionary keys in dot notation

        Returns:
            tuple: tuple of values corresponding to the input key names
        """
        rL = []
        try:
            for keyName in keyNames:
                rL.append(self.__getKeyValue(dct, keyName))
        except Exception as e:
            logger.exception("Failing for key names %r with %s", keyNames, str(e))
        return tuple(rL)

    def __getKeyValue(self, dct, keyName):
        """Return the value of the corresponding key expressed in dot notation in the input dictionary object (nested)."""
        try:
            kys = keyName.split(".")
            for key in kys:
                try:
                    dct = dct[key]
                except KeyError:
                    return None
            return dct
        except Exception as e:
            logger.exception("Failing for key %r with %s", keyName, str(e))
        return None
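# A minimal usage sketch for DocumentLoader. The database/collection names and
# documents below are illustrative assumptions; keyNames uses the dot-notation
# convention implemented by __getKeyValue() above, and a configured "MONGO_DB"
# resource must be reachable through cfgOb.
def _exampleDocumentLoad(cfgOb):
    dl = DocumentLoader(cfgOb, cachePath="./CACHE", resourceName="MONGO_DB", numProc=2, chunkSize=10, readBackCheck=True)
    docs = [
        {"entry": {"id": "EX1"}, "value": 1},
        {"entry": {"id": "EX2"}, "value": 2},
    ]
    # "entry.id" resolves to doc["entry"]["id"] for delete/read-back matching;
    # schemaLevel=None skips the BSON schema fetch for this ad hoc collection -
    ok = dl.load("example_db", "example_collection", loadType="replace", documentList=docs, indexAttributeList=["entry.id"], keyNames=["entry.id"], schemaLevel=None)
    return ok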