def testLoadCluster(self):
    """Test case - load example sequence cluster document data"""
    try:
        dl = DocumentLoader(
            self.__cfgOb,
            self.__cachePath,
            self.__resourceName,
            numProc=self.__numProc,
            chunkSize=self.__chunkSize,
            documentLimit=self.__documentLimit,
            verbose=self.__verbose,
            readBackCheck=self.__readBackCheck,
        )
        # Build the per-entity and per-cluster document dictionaries from the example data set
        docBySequenceD, docByClusterD = self.__testExtract(dataSetId=self.__dataSetId, dataLocator=self.__pathClusterData, levels=self.__levels)
        #
        # Entity membership documents
        entityDocL = docBySequenceD[self.__entitySchemaName]
        okLoad = dl.load(
            "sequence_clusters",
            "entity_members",
            loadType="full",
            documentList=entityDocL,
            indexAttributeList=["data_set_id", "entry_id", "entity_id"],
            keyNames=None,
        )
        self.assertTrue(okLoad)
        # Cluster membership documents
        clusterDocL = docByClusterD[self.__clusterSchemaName]
        okLoad = dl.load(
            "sequence_clusters",
            "cluster_members",
            loadType="full",
            documentList=clusterDocL,
            indexAttributeList=["data_set_id", "identity", "cluster_id"],
            keyNames=None,
        )
        self.assertTrue(okLoad)
        # Provenance (single document)
        provD = self.__fetchProvenance()
        okLoad = dl.load("sequence_clusters", "cluster_provenance", loadType="full", documentList=[provD], indexAttributeList=None, keyNames=None)
        self.assertTrue(okLoad)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        self.fail()
def loadStatus(self, statusList, readBackCheck=True):
    """Append the input status documents to the configured data exchange status collection.

    Returns True on load success, False otherwise (errors are logged, not raised).
    """
    loadOk = False
    try:
        dl = DocumentLoader(self.__cfgOb, self.__cachePath, "MONGO_DB", numProc=1, chunkSize=2, documentLimit=None, verbose=False, readBackCheck=readBackCheck)
        #
        sectionName = "data_exchange_configuration"
        databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
        collectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=sectionName)
        loadOk = dl.load(
            databaseName,
            collectionName,
            loadType="append",
            documentList=statusList,
            indexAttributeList=["update_id", "database_name", "object_name"],
            keyNames=None,
        )
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return loadOk
def __loadStatus(self, statusList):
    """Append status documents describing this load cycle to the data exchange status collection."""
    sectionName = "data_exchange_configuration"
    dl = DocumentLoader(
        self.__cfgOb,
        self.__cachePath,
        resourceName=self.__resourceName,
        numProc=self.__numProc,
        chunkSize=self.__chunkSize,
        documentLimit=None,
        verbose=self.__verbose,
        readBackCheck=self.__readBackCheck,
    )
    # Target database/collection are taken from the data exchange configuration section
    databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
    collectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=sectionName)
    return dl.load(
        databaseName,
        collectionName,
        loadType="append",
        documentList=statusList,
        indexAttributeList=["update_id", "database_name", "object_name"],
        keyNames=None,
    )
def testLoadExchangeStatus(self):
    """Test case - load data exchange status objects.

    [data_exchange]
    DATABASE_NAME=data_exchange
    DATABASE_VERSION_STRING=v5
    COLLECTION_UPDATE_STATUS=rcsb_data_exchange_status
    COLLECTION_VERSION_STRING=v0_1
    """
    try:
        for idx in range(1, 100):
            objectName = "my_collection_" + str(idx)
            statusL = []
            desp = DataExchangeStatus()
            startTs = desp.setStartTime()
            self.assertGreaterEqual(len(startTs), 15)
            self.assertTrue(desp.setObject("my_database", objectName))
            self.assertTrue(desp.setStatus(updateId=None, successFlag="Y"))
            #
            endTs = desp.setEndTime()
            self.assertGreaterEqual(len(endTs), 15)
            statusL.append(desp.getStatus())
            #
            self.assertEqual(len(statusL), 1)
            logger.debug("Status record %r", statusL[0])
            sectionName = "data_exchange_configuration"
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
            collectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=sectionName)
            # First pass replaces the collection; subsequent passes append to it.
            loadType = "full" if idx == 1 else "append"
            ok = dl.load(
                databaseName,
                collectionName,
                loadType=loadType,
                documentList=statusL,
                indexAttributeList=["update_id", "database_name", "object_name"],
                keyNames=None,
            )
            self.assertTrue(ok)
            #
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        self.fail()
def loadStatus(statusList, cfgOb, cachePath, readBackCheck=True):
    """Append the input status documents to the configured data exchange status collection."""
    sectionName = "data_exchange_configuration"
    dl = DocumentLoader(cfgOb, cachePath, "MONGO_DB", numProc=2, chunkSize=2, documentLimit=None, verbose=False, readBackCheck=readBackCheck)
    #
    databaseName = cfgOb.get("DATABASE_NAME", sectionName=sectionName)
    collectionName = cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=sectionName)
    return dl.load(
        databaseName,
        collectionName,
        loadType="append",
        documentList=statusList,
        indexAttributeList=["update_id", "database_name", "object_name"],
        keyNames=None,
    )
def load(self, updateId, extResource, loadType="full"):
    """Load chemical reference integrated data for the input external resource.

    Args:
        updateId: identifier for this update cycle (recorded in the load status).
        extResource: external resource name; currently only "DrugBank" is supported.
        loadType: "full" or "append" load mode passed through to DocumentLoader.load().

    Returns:
        bool: True if both the document load and the status update succeed, False otherwise.
    """
    try:
        self.__statusList = []
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        #
        if extResource != "DrugBank":
            # Previously an unsupported resource fell through to a NameError on the
            # undefined databaseName; fail fast with a clear message instead.
            logger.error("Unsupported external resource %r", extResource)
            return False
        #
        databaseName = "drugbank_core"
        configName = self.__cfgOb.getDefaultSectionName()
        user = self.__cfgOb.get("_DRUGBANK_AUTH_USERNAME", sectionName=configName)
        pw = self.__cfgOb.get("_DRUGBANK_AUTH_PASSWORD", sectionName=configName)
        #
        dbP = DrugBankProvider(cachePath=self.__cachePath, useCache=self.__useCache, username=user, password=pw)
        #
        # Restrict the extracted documents to accessions mapped in the current data set
        crExt = ChemRefExtractor(self.__cfgOb)
        idD = crExt.getChemCompAccessionMapping(extResource)
        dList = dbP.getDocuments(mapD=idD)
        #
        logger.info("Resource %r extracted mapped document length %d", extResource, len(dList))
        logger.debug("Objects %r", dList[:2])
        sD, _, collectionList, _ = self.__schP.getSchemaInfo(databaseName)
        collectionName = collectionList[0] if collectionList else "unassigned"
        indexL = sD.getDocumentIndex(collectionName, "primary")
        logger.info("Database %r collection %r index attributes %r", databaseName, collectionName, indexL)
        #
        collectionVersion = sD.getCollectionVersion(collectionName)
        addValues = {"_schema_version": collectionVersion}
        #
        dl = DocumentLoader(
            self.__cfgOb,
            self.__cachePath,
            self.__resourceName,
            numProc=self.__numProc,
            chunkSize=self.__chunkSize,
            documentLimit=self.__documentLimit,
            verbose=self.__verbose,
            readBackCheck=self.__readBackCheck,
        )
        #
        ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=dList, indexAttributeList=indexL, keyNames=None, addValues=addValues)
        okS = self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
        # Report the actual load outcome (previously returned True unconditionally,
        # masking load failures; sibling loaders return ok and okS).
        return ok and okS
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        return False
def load(self, updateId, loadType="full"):
    """Load legacy repository holdings and status data -

    Relevant configuration options:

    [DEFAULT]
    RCSB_EXCHANGE_SANDBOX_PATH=MOCK_EXCHANGE_SANDBOX

    [repository_holdings_configuration]
    DATABASE_NAME=repository_holdings
    DATABASE_VERSION_STRING=v5
    COLLECTION_HOLDINGS_UPDATE=rcsb_repository_holdings_update_entry
    COLLECTION_HOLDINGS_CURRENT=rcsb_repository_holdings_current_entry
    COLLECTION_HOLDINGS_UNRELEASED=rcsb_repository_holdings_unreleased_entry
    COLLECTION_HOLDINGS_REMOVED=rcsb_repository_holdings_removed_entry
    COLLECTION_VERSION_STRING=v0_1
    """
    try:
        self.__statusList = []
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        discoveryMode = self.__cfgOb.get("DISCOVERY_MODE", sectionName=self.__cfgSectionName, default="local")
        # ---
        baseUrlPDB = self.__cfgOb.getPath("PDB_REPO_URL", sectionName=self.__cfgSectionName, default="https://ftp.wwpdb.org/pub")
        fallbackUrlPDB = self.__cfgOb.getPath("PDB_REPO_FALLBACK_URL", sectionName=self.__cfgSectionName, default="https://ftp.wwpdb.org/pub")
        edMapUrl = self.__cfgOb.getPath("RCSB_EDMAP_LIST_PATH", sectionName=self.__cfgSectionName, default=None)
        #
        kwD = {
            "holdingsTargetUrl": os.path.join(baseUrlPDB, "pdb", "holdings"),
            "holdingsFallbackUrl": os.path.join(fallbackUrlPDB, "pdb", "holdings"),
            "edmapsLocator": edMapUrl,
            "updateTargetUrl": os.path.join(baseUrlPDB, "pdb", "data", "status", "latest"),
            "updateFallbackUrl": os.path.join(fallbackUrlPDB, "pdb", "data", "status", "latest"),
            "filterType": self.__filterType,
        }
        # --- Select the local sandbox or remote holdings data preparation path
        if discoveryMode == "local":
            rhdp = RepoHoldingsDataPrep(cfgOb=self.__cfgOb, sandboxPath=self.__sandboxPath, cachePath=self.__cachePath, filterType=self.__filterType)
        else:
            rhdp = RepoHoldingsRemoteDataPrep(cachePath=self.__cachePath, **kwD)
        #
        dl = DocumentLoader(
            self.__cfgOb,
            self.__cachePath,
            self.__resourceName,
            numProc=self.__numProc,
            chunkSize=self.__chunkSize,
            documentLimit=self.__documentLimit,
            verbose=self.__verbose,
            readBackCheck=self.__readBackCheck,
        )
        #
        sectionName = "repository_holdings_configuration"
        databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
        addValues = None
        # Each holdings collection is fetched and loaded with the same index and status handling
        for fetchDocuments, configKey in (
            (rhdp.getHoldingsUpdateEntry, "COLLECTION_HOLDINGS_UPDATE"),
            (rhdp.getHoldingsCurrentEntry, "COLLECTION_HOLDINGS_CURRENT"),
            (rhdp.getHoldingsUnreleasedEntry, "COLLECTION_HOLDINGS_UNRELEASED"),
            (rhdp.getHoldingsRemovedEntry, "COLLECTION_HOLDINGS_REMOVED"),
            (rhdp.getHoldingsCombinedEntry, "COLLECTION_HOLDINGS_COMBINED"),
        ):
            dList = fetchDocuments(updateId=updateId)
            collectionName = self.__cfgOb.get(configKey, sectionName=sectionName)
            ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=dList, indexAttributeList=["update_id", "entry_id"], keyNames=None, addValues=addValues)
            self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
        #
        return True
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        return False
def load(self, updateId, loadType="full", doLoad=True):
    """Load tree node lists and status data.

    Relevant configuration options:

    tree_node_lists_configuration:
        DATABASE_NAME: tree_node_lists
        DATABASE_VERSION_STRING: v5
        COLLECTION_VERSION_STRING: 1.0.0
        COLLECTION_TAXONOMY: tree_taxonomy_node_list
        COLLECTION_ENZYME: tree_ec_node_list
        COLLECTION_SCOP: tree_scop_node_list
        COLLECTION_CATH: tree_cath_node_list

    Args:
        updateId: identifier for this update cycle (recorded in the status documents).
        loadType: "full" or "append" load mode passed through to DocumentLoader.load().
        doLoad: when False, node lists are built but the gated collections are not loaded.

    Returns:
        bool: True on completion, False if an exception was raised.
    """
    try:
        useCache = self.__useCache
        logger.info("Starting with cache path %r (useCache=%r)", self.__cachePath, useCache)
        #
        self.__statusList = []
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        dl = DocumentLoader(
            self.__cfgOb,
            self.__cachePath,
            self.__resourceName,
            numProc=self.__numProc,
            chunkSize=self.__chunkSize,
            documentLimit=self.__documentLimit,
            verbose=self.__verbose,
            readBackCheck=self.__readBackCheck,
        )
        #
        databaseName = "tree_node_lists"
        addValues = None

        def _loadNodeList(collectionName, nodeL):
            # Load one tree node collection and record the outcome in the status list.
            okL = dl.load(
                databaseName,
                collectionName,
                loadType=loadType,
                documentList=nodeL,
                indexAttributeList=["update_id"],
                keyNames=None,
                addValues=addValues,
                schemaLevel=None,
            )
            self.__updateStatus(updateId, databaseName, collectionName, okL, statusStartTimestamp)
            return okL

        # --- GO
        goP = GeneOntologyProvider(goDirPath=os.path.join(self.__cachePath, "go"), useCache=useCache)
        ok = goP.testCache()
        anEx = AnnotationExtractor(self.__cfgOb)
        goIdL = anEx.getUniqueIdentifiers("GO")
        logger.info("Unique GO assignments %d", len(goIdL))
        nL = goP.exportTreeNodeList(goIdL)
        logger.info("GO tree node list length %d", len(nL))
        if doLoad:
            _loadNodeList("tree_go_node_list", nL)
        # ---- CATH
        ccu = CathClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
        nL = ccu.getTreeNodeList()
        # Fixed log message: this section previously reported "SCOP" for the CATH tree.
        logger.info("Starting load CATH node tree length %d", len(nL))
        if doLoad:
            _loadNodeList("tree_cath_node_list", nL)
        # ---- SCOP
        scu = ScopClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
        nL = scu.getTreeNodeList()
        logger.info("Starting load SCOP node tree length %d", len(nL))
        if doLoad:
            _loadNodeList("tree_scop_node_list", nL)
        # --- SCOP2
        scu = Scop2ClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
        nL = scu.getTreeNodeList()
        logger.info("Starting load SCOP2 node tree length %d", len(nL))
        if doLoad:
            _loadNodeList("tree_scop2_node_list", nL)
        # ---- ECOD
        ecu = EcodClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
        nL = ecu.getTreeNodeList()
        logger.info("Starting load ECOD node tree length %d", len(nL))
        if doLoad:
            _loadNodeList("tree_ecod_node_list", nL)
        # ---- EC
        edbu = EnzymeDatabaseProvider(cachePath=self.__cachePath, useCache=useCache)
        nL = edbu.getTreeNodeList()
        logger.info("Starting load of EC node tree length %d", len(nL))
        if doLoad:
            _loadNodeList("tree_ec_node_list", nL)
        # ---- Taxonomy
        # Get the taxon coverage in the current data set -
        epe = TaxonomyExtractor(self.__cfgOb)
        tL = epe.getUniqueTaxons()
        logger.info("Taxon coverage length %d", len(tL))
        #
        tU = TaxonomyProvider(cachePath=self.__cachePath, useCache=useCache)
        # Filter set seeded with taxon id 1 (presumably the root taxon — confirm) and
        # expanded with the full lineage of every covered taxon.
        fD = {1}
        for taxId in tL:
            fD.update(tU.getLineage(taxId))
        logger.info("Taxon filter dictionary length %d", len(fD))
        #
        nL = tU.exportNodeList(filterD=fD)
        self.__checkTaxonNodeList(nL)
        logger.info("Starting load of taxonomy node tree length %d", len(nL))
        if doLoad:
            collectionName = "tree_taxonomy_node_list"
            logger.debug("Taxonomy nodes (%d) %r", len(nL), nL[:5])
            _loadNodeList(collectionName, nL)
        logger.info("Tree loading operations completed.")
        #
        # --- ATC
        crEx = ChemRefExtractor(self.__cfgOb)
        atcFilterD = crEx.getChemCompAccessionMapping("ATC")
        logger.info("Length of ATC filter %d", len(atcFilterD))
        atcP = AtcProvider(cachePath=self.__cachePath, useCache=useCache)
        nL = atcP.getTreeNodeList(filterD=atcFilterD)
        logger.debug("ATC node list length %d %r", len(nL), nL[:5])
        # NOTE(review): unlike the sections above, the ATC load is not gated by doLoad.
        # Behavior preserved as-is — confirm whether this is intentional.
        _loadNodeList("tree_atc_node_list", nL)
        #
        # ---
        logger.info("Completed tree node list loading operations.\n")
        return True
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        return False
def etl(self, dataSetId, dataLocator=None, loadType="full"):
    """Prepare and load sequence cluster data by entity and by cluster identifer."""
    try:
        self.__statusList = []
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        #
        # Build the entity-level and cluster-level document dictionaries
        docBySequenceD, docByClusterD = self.__extract(dataSetId=dataSetId, dataLocator=dataLocator, levels=self.__identityLevels)
        #
        dl = DocumentLoader(
            self.__cfgOb,
            self.__cachePath,
            self.__resourceName,
            numProc=self.__numProc,
            chunkSize=self.__chunkSize,
            documentLimit=self.__documentLimit,
            verbose=self.__verbose,
            readBackCheck=self.__readBackCheck,
        )
        #
        databaseName = self.__databaseName
        addValues = None
        resultL = []
        #
        # Entity membership collection
        collectionName = self.__entityMemberCollection
        okLoad = dl.load(
            databaseName,
            collectionName,
            loadType=loadType,
            documentList=docBySequenceD[self.__entitySchemaName],
            indexAttributeList=self.__entityMemberCollectionIndexL,
            keyNames=None,
            addValues=addValues,
        )
        self.__updateStatus(dataSetId, databaseName, collectionName, okLoad, statusStartTimestamp)
        resultL.append(okLoad)
        #
        # Cluster membership collection
        collectionName = self.__clusterMembersCollection
        okLoad = dl.load(
            databaseName,
            collectionName,
            loadType=loadType,
            documentList=docByClusterD[self.__clusterSchemaName],
            indexAttributeList=self.__clusterMembersCollectionIndexL,
            keyNames=None,
            addValues=addValues,
        )
        self.__updateStatus(dataSetId, databaseName, collectionName, okLoad, statusStartTimestamp)
        resultL.append(okLoad)
        #
        # Provenance collection (single document)
        provD = self.__fetchProvenance()
        collectionName = self.__clusterProvenanceCollection
        okLoad = dl.load(databaseName, collectionName, loadType=loadType, documentList=[provD], indexAttributeList=None, keyNames=None, addValues=addValues)
        self.__updateStatus(dataSetId, databaseName, collectionName, okLoad, statusStartTimestamp)
        resultL.append(okLoad)
        #
        return all(resultL)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        return False
def testLoadHoldingsRemote(self):
    """Test case - load legacy repository holdings and status data -

    [repository_holdings]
    DATABASE_NAME=repository_holdings
    DATABASE_VERSION_STRING=v5
    COLLECTION_HOLDINGS_UPDATE=rcsb_repository_holdings_update_entry
    COLLECTION_HOLDINGS_CURRENT=rcsb_repository_holdings_current_entry
    COLLECTION_HOLDINGS_UNRELEASED=rcsb_repository_holdings_unreleased_entry
    COLLECTION_HOLDINGS_REMOVED=rcsb_repository_holdings_removed_entry
    COLLECTION_HOLDINGS_COMBINED=rcsb_repository_holdings_combined_entry
    """
    try:
        sectionName = "repository_holdings_configuration"
        rhdp = RepoHoldingsRemoteDataPrep(cachePath=self.__cachePath, filterType=self.__filterType)
        #
        dl = DocumentLoader(
            self.__cfgOb,
            self.__cachePath,
            self.__resourceName,
            numProc=self.__numProc,
            chunkSize=self.__chunkSize,
            documentLimit=self.__documentLimit,
            verbose=self.__verbose,
            readBackCheck=self.__readBackCheck,
        )
        #
        databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
        logger.info("databaseName %r", databaseName)
        addValues = None
        #
        maxDoc = 5

        def _loadEntries(fetchFn, configKey, logName=False):
            # Fetch, truncate to maxDoc, load, and assert success for one holdings collection.
            docL = fetchFn(updateId=self.__updateId)
            docL = docL[:maxDoc] if maxDoc else docL
            collectionName = self.__cfgOb.get(configKey, sectionName=sectionName)
            if logName:
                logger.info("collectionName %r", collectionName)
            okL = dl.load(
                databaseName,
                collectionName,
                loadType="full",
                documentList=docL,
                indexAttributeList=["update_id", "entry_id"],
                keyNames=None,
                addValues=addValues,
            )
            logger.info("Collection %r length %d load status %r", collectionName, len(docL), okL)
            self.assertTrue(okL)

        _loadEntries(rhdp.getHoldingsRemovedEntry, "COLLECTION_HOLDINGS_REMOVED")
        _loadEntries(rhdp.getHoldingsUnreleasedEntry, "COLLECTION_HOLDINGS_UNRELEASED")
        _loadEntries(rhdp.getHoldingsUpdateEntry, "COLLECTION_HOLDINGS_UPDATE", logName=True)
        _loadEntries(rhdp.getHoldingsCurrentEntry, "COLLECTION_HOLDINGS_CURRENT")
        _loadEntries(rhdp.getHoldingsCombinedEntry, "COLLECTION_HOLDINGS_COMBINED")
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        self.fail()
def load(self, updateId, loadType="full"):
    """Load legacy repository holdings and status data -

    Relevant configuration options:

    [DEFAULT]
    RCSB_EXCHANGE_SANDBOX_PATH=MOCK_EXCHANGE_SANDBOX

    [repository_holdings_configuration]
    DATABASE_NAME=repository_holdings
    DATABASE_VERSION_STRING=v5
    COLLECTION_HOLDINGS_UPDATE=rcsb_repository_holdings_update_entry
    COLLECTION_HOLDINGS_CURRENT=rcsb_repository_holdings_current_entry
    COLLECTION_HOLDINGS_UNRELEASED=rcsb_repository_holdings_unreleased_entry
    COLLECTION_HOLDINGS_REMOVED=rcsb_repository_holdings_removed_entry
    COLLECTION_VERSION_STRING=v0_1
    """
    try:
        self.__statusList = []
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        sectionName = "repository_holdings_configuration"
        rhdp = RepoHoldingsDataPrep(cfgOb=self.__cfgOb, sandboxPath=self.__sandboxPath, cachePath=self.__cachePath, filterType=self.__filterType)
        #
        dl = DocumentLoader(
            self.__cfgOb,
            self.__cachePath,
            self.__resourceName,
            numProc=self.__numProc,
            chunkSize=self.__chunkSize,
            documentLimit=self.__documentLimit,
            verbose=self.__verbose,
            readBackCheck=self.__readBackCheck,
        )
        #
        databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
        addValues = None
        # Each holdings collection is fetched and loaded with the same index and status handling
        for fetchDocuments, configKey in (
            (rhdp.getHoldingsUpdateEntry, "COLLECTION_HOLDINGS_UPDATE"),
            (rhdp.getHoldingsCurrentEntry, "COLLECTION_HOLDINGS_CURRENT"),
            (rhdp.getHoldingsUnreleasedEntry, "COLLECTION_HOLDINGS_UNRELEASED"),
            (rhdp.getHoldingsRemovedEntry, "COLLECTION_HOLDINGS_REMOVED"),
            (rhdp.getHoldingsCombinedEntry, "COLLECTION_HOLDINGS_COMBINED"),
        ):
            dList = fetchDocuments(updateId=updateId)
            collectionName = self.__cfgOb.get(configKey, sectionName=sectionName)
            ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=dList, indexAttributeList=["update_id", "entry_id"], keyNames=None, addValues=addValues)
            self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
        #
        return True
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        return False
def load(self, updateId, extResource, loadType="full"):
    """Load sequence reference data.

    Args:
        updateId: identifier for this update cycle (recorded in the load status).
        extResource: external resource name; currently only "UniProt" is supported.
        loadType: "full" or "append" load mode passed through to DocumentLoader.load().

    Returns:
        bool: True if both the document load and the status update succeed, False otherwise.
    """
    try:
        self.__statusList = []
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        #
        # Use independent lists here; the original `dList = indexL = []` bound both
        # names to the same list object.
        dList = []
        indexL = []
        databaseName = collectionName = collectionVersion = None
        #
        if extResource == "UniProt":
            databaseName = "uniprot_core"
            #
            ok, rsP = self.__getReferenceSequenceProvider()
            if not ok:
                return False
            #
            dList = rsP.getDocuments()
            logger.info("Resource %r extracted mapped document length %d", extResource, len(dList))
            logger.debug("Objects %r", dList[:2])
            #
            cDL = self.__docHelper.getCollectionInfo(databaseName)
            collectionName = cDL[0]["NAME"]
            collectionVersion = cDL[0]["VERSION"]
            indexL = self.__docHelper.getDocumentIndexAttributes(collectionName, "primary")
            logger.info("Database %r collection %r version %r index attributes %r", databaseName, collectionName, collectionVersion, indexL)
            addValues = {}
        else:
            # Previously execution continued with databaseName=None into validation
            # and dl.load(); fail fast instead.
            logger.error("Unsupported external resource %r", extResource)
            return False
        #
        if self.__doValidate:
            self.__valInst = self.__getValidator(databaseName, collectionName, schemaLevel="full")
            for dObj in dList:
                self.__validateObj(databaseName, collectionName, dObj, label="Original")
        #
        dl = DocumentLoader(
            self.__cfgOb,
            self.__cachePath,
            self.__resourceName,
            numProc=self.__numProc,
            chunkSize=self.__chunkSize,
            documentLimit=self.__documentLimit,
            verbose=self.__verbose,
            readBackCheck=self.__readBackCheck,
        )
        #
        ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=dList, indexAttributeList=indexL, keyNames=None, addValues=addValues)
        okS = self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
        return ok and okS
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        return False