def setUp(self):
    configPath = os.path.join(HERE, "test-data", "drugbank-config-example.yml")
    configName = "site_info_configuration"
    cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName)
    self.__user = cfgOb.get("_DRUGBANK_AUTH_USERNAME", sectionName=configName)
    self.__pw = cfgOb.get("_DRUGBANK_AUTH_PASSWORD", sectionName=configName)
    self.__cachePath = os.path.join(HERE, "test-output", "CACHE")
    self.__fastaPath = os.path.join(HERE, "test-output", "drugbank-targets.fa")
    self.__taxonPath = os.path.join(HERE, "test-output", "drugbank-targets-taxon.tdd")
    #
    self.__seqMatchResultsPath = os.path.join(HERE, "test-data", "drugbank-vs-pdbprent-filtered-results.json.gz")
    self.__startTime = time.time()
    logger.info("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
Example #2
class CitationUtilsTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(CitationUtilsTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config",
                                  "dbload-setup-example.yml")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)

        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        #
        self.__cacheKwargs = {"fmt": "json", "indent": 3}
        self.__exdbDirPath = os.path.join(
            self.__cachePath,
            self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
        #
        self.__mU = MarshalUtil()
        self.__entryLimitTest = 20
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 10**6,
                    unitS)
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", self.id(),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def testEntryCitationAccess(self):
        """Test case - extract entry citations"""
        try:
            ce = CitationUtils(self.__cfgOb,
                               exdbDirPath=self.__exdbDirPath,
                               useCache=True,
                               cacheKwargs=self.__cacheKwargs,
                               entryLimit=self.__entryLimitTest)
            eCount = ce.getCitationEntryCount()
            self.assertGreaterEqual(eCount, self.__entryLimitTest)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Example #3
def testExportToYaml(self):
    cfgOb = ConfigUtil(configFormat="yaml",
                       mockTopPath=self.__mockTopPath,
                       roundTrip=True)
    #
    cD = self.__createDataSet()
    cfgOb.importConfig(cD)
    #
    ok = cfgOb.writeConfig(self.__outPathConfigYamlExport,
                           configFormat="yaml")
    self.assertTrue(ok)
    cfgOb = ConfigUtil(configPath=self.__outPathConfigYamlExport,
                       configFormat="yaml",
                       mockTopPath=self.__mockTopPath)
    rD = cfgOb.exportConfig()
    self.assertGreaterEqual(len(rD), 1)
    v = cfgOb.get("SubA.Name", sectionName="Section1")
    self.assertEqual(v, "THE_NAME")
    v = cfgOb.get("SubA.Counts", sectionName="Section3")
    self.assertEqual(len(v), 3)
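The helper __createDataSet() is not included in this excerpt; a hypothetical minimal version consistent with the assertions above (dotted option keys traverse nested mappings) could be:

    def __createDataSet(self):
        # Hypothetical fixture data; the real helper may carry more sections.
        return {
            "Section1": {"SubA": {"Name": "THE_NAME"}},
            "Section3": {"SubA": {"Counts": [100, 200, 300]}},
        }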
Example #4
class DictionaryProviderTests(unittest.TestCase):
    def setUp(self):
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__dirPath = os.path.join(self.__cachePath, "dictionaries")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__contentInfoConfigName = "content_info_helper_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        dictLocatorMap = self.__cfgOb.get(
            "DICT_LOCATOR_CONFIG_MAP",
            sectionName=self.__contentInfoConfigName)
        schemaName = "pdbx_core"
        self.__dictLocators = [
            self.__cfgOb.getPath(configLocator, sectionName=self.__configName)
            for configLocator in dictLocatorMap[schemaName]
        ]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testResourceCache(self):
        """Test case - generate and check dictonary artifact and api caches
        """
        try:
            logger.debug("Dictionary locators %r", self.__dictLocators)
            dp = DictionaryApiProvider(dirPath=self.__dirPath, useCache=False)
            dApi = dp.getApi(self.__dictLocators)
            ok = dApi.testCache()
            self.assertTrue(ok)
            title = dApi.getDictionaryTitle()
            logger.debug("Title %r", title)
            self.assertEqual(
                title, "mmcif_pdbx.dic,rcsb_mmcif_ext.dic,vrpt_mmcif_ext.dic")
            # revL = dApi.getDictionaryHistory()
            numRev = dApi.getDictionaryRevisionCount()
            logger.debug("Number of dictionary revisions (numRev) %r", numRev)
            self.assertGreater(numRev, 220)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
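Within the same test, a second provider over the same directory can presumably serve the just-built artifacts from cache rather than refetching the locators; a minimal sketch reusing the names from setUp() above:

            dp2 = DictionaryApiProvider(dirPath=self.__dirPath, useCache=True)
            dApi2 = dp2.getApi(self.__dictLocators)
            self.assertTrue(dApi2.testCache())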
Example #5
class EntityPolymerExtractorFixture(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(EntityPolymerExtractorFixture, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        #
        self.__cacheKwargs = {"fmt": "pickle"}
        self.__exdbCacheDirPath = os.path.join(self.__cachePath, self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
        #
        self.__entryLimitTest = None
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testExtractEntityPolymers(self):
        """Fixture - extract and save entity polymer info"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb, exdbDirPath=self.__exdbCacheDirPath, useCache=False, cacheKwargs=self.__cacheKwargs, entryLimit=self.__entryLimitTest)
            eCount = epe.getEntryCount()
            self.assertGreaterEqual(eCount, 10)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Example #6
    def testReadIniConfig(self):
        try:
            cfgOb = ConfigUtil(configPath=self.__inpPathConfigIni,
                               mockTopPath=self.__dataPath)
            sName = "DEFAULT"
            pathBird = cfgOb.getPath("BIRD_REPO_PATH", sectionName=sName)
            pathPdbx = cfgOb.getPath("PDBX_REPO_PATH", sectionName=sName)
            #
            self.assertEqual(
                pathBird, os.path.join(self.__mockTopPath, "MOCK_BIRD_REPO"))
            self.assertEqual(
                pathPdbx, os.path.join(self.__mockTopPath,
                                       "MOCK_PDBX_SANDBOX"))

            pathBird = cfgOb.get("BIRD_REPO_PATH", sectionName=sName)
            pathPdbx = cfgOb.get("PDBX_REPO_PATH", sectionName=sName)

            self.assertEqual(pathBird, "MOCK_BIRD_REPO")
            self.assertEqual(pathPdbx, "MOCK_PDBX_SANDBOX")
            sName = "Section1"
            #
            helperMethod = cfgOb.getHelper("DICT_METHOD_HELPER_MODULE",
                                           sectionName=sName)

            tv = helperMethod.echo("test_value")
            self.assertEqual(tv, "test_value")
            #
            tEnv = "TEST_ENV_VAR"
            tVal = "TEST_ENV_VAR_VALUE"
            os.environ[tEnv] = tVal
            eVal = cfgOb.getEnvValue("ENV_OPTION_A", sectionName=sName)
            self.assertEqual(tVal, eVal)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
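The paired lookups above capture the get()/getPath() contrast: get() returns the stored option string as-is, while getPath() resolves it against the configured mockTopPath prefix.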
Example #7
    def testReadIniConfigWithEnv(self):
        try:
            os.environ["TEST_MOCKPATH_ENV"] = self.__mockTopPath
            cfgOb = ConfigUtil(configPath=self.__inpPathConfigWithEnvIni,
                               mockTopPath=self.__mockTopPath,
                               importEnvironment=True)
            testEnv = cfgOb.get("test_mockpath_env")
            self.assertEqual(testEnv, self.__mockTopPath)
            logger.debug("Environmental variable test_mock_path is %r",
                         testEnv)
            #  Verify environment keys all lowercased -
            testEnv = cfgOb.get("TEST_MOCKPATH_ENV")
            self.assertEqual(testEnv, None)
            logger.debug("Environmental variable TEST_MOCK_PATH is %r",
                         testEnv)
            #
            testEnv = cfgOb.get("TOP_PROJECT_PATH")
            self.assertEqual(testEnv, self.__mockTopPath)
            logger.debug("Derived path is %r", testEnv)
            #
            sName = "Section1"
            testEnv = cfgOb.get("PROJ_DIR_PATH", sectionName=sName)
            self.assertEqual(testEnv, os.path.join(self.__mockTopPath,
                                                   "da_top"))

            testEnv = cfgOb.get("PROJ_ARCHIVE_PATH", sectionName=sName)
            self.assertEqual(
                testEnv, os.path.join(self.__mockTopPath, "da_top", "archive"))

            testEnv = cfgOb.get("proj_deposit_path", sectionName=sName)
            self.assertEqual(
                testEnv, os.path.join(self.__mockTopPath, "da_top", "deposit"))
            #
            ok = cfgOb.writeConfig(self.__outPathConfigWithEnvIni,
                                   configFormat="ini")
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Example #8
class PharosTargetProviderTests(unittest.TestCase):
    skipFull = True

    def setUp(self):
        configPath = os.path.join(HERE, "test-data",
                                  "pharos-config-example.yml")
        self.__configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=self.__configName)
        self.__user = self.__cfgOb.get("_MYSQL_DB_USER_NAME",
                                       sectionName=self.__configName)
        self.__pw = self.__cfgOb.get("_MYSQL_DB_PASSWORD",
                                     sectionName=self.__configName)
        self.__cachePath = os.path.join(HERE, "test-output", "CACHE")
        self.__dirPath = os.path.join(self.__cachePath, "Pharos-targets")
        self.__dataPath = os.path.join(HERE, "test-data")
        #
        self.__pharosFixture()

    def tearDown(self):
        pass
        #

    def __pharosFixture(self):
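        # Stage the bundled Pharos .tdd.gz table dumps from test-data into
        # the cache directory, uncompressing each for the provider to read.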
        try:
            ok = False
            fU = FileUtil()
            srcPath = os.path.join(self.__dataPath, "Pharos")
            dstPath = self.__dirPath
            for fn in [
                    "drug_activity", "cmpd_activity", "target", "protein",
                    "t2tc"
            ]:
                inpPath = os.path.join(srcPath, fn + ".tdd.gz")
                outPath = os.path.join(dstPath, fn + ".tdd.gz")
                fU.get(inpPath, outPath)
                fU.uncompress(outPath, outputDir=dstPath)
                fU.remove(outPath)
            fU.put(os.path.join(srcPath, "pharos-readme.txt"),
                   os.path.join(dstPath, "pharos-readme.txt"))
            ok = True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            ok = False
        return ok

    @unittest.skip("Bootstrap test")
    def testBootstrap(self):
        try:
            ptP = PharosTargetProvider(cachePath=self.__cachePath,
                                       useCache=False,
                                       reloadDb=False)
            configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config",
                                      "dbload-setup-example.yml")
            configName = "site_info_remote_configuration"
            cfgOb = ConfigUtil(configPath=configPath,
                               defaultSectionName=configName)
            ok = ptP.backup(cfgOb, configName)
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skipIf(skipFull, "Database dependency")
    def testFetchAndLoadPharosTargets(self):
        try:
            # Now about 630s on macOS
            ptP = PharosTargetProvider(cachePath=self.__cachePath,
                                       useCache=False,
                                       reloadDb=True,
                                       fromDb=True,
                                       mysqlUser=self.__user,
                                       mysqlPassword=self.__pw)
            ok = ptP.testCache()
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skipIf(skipFull, "Very long test")
    def testExportPharosTargets(self):
        try:
            ptP = PharosTargetProvider(cachePath=self.__cachePath,
                                       useCache=True,
                                       reloadDb=False,
                                       mysqlUser=self.__user,
                                       mysqlPassword=self.__pw)
            ok = ptP.testCache()
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testExportPharosTargetFasta(self):
        try:
            ptP = PharosTargetProvider(cachePath=self.__cachePath,
                                       useCache=True,
                                       reloadDb=False)
            ok = ptP.testCache()
            self.assertTrue(ok)
            fastaPath = os.path.join(HERE, "test-output",
                                     "pharos-targets.fa")
            taxonPath = os.path.join(HERE, "test-output",
                                     "pharos-targets-taxon.tdd")
            ok = ptP.exportProteinFasta(fastaPath,
                                        taxonPath,
                                        addTaxonomy=False)
            self.assertTrue(ok)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skipIf(skipFull, "Internal test")
    def testStashDependencies(self):
        try:
            ptP = PharosTargetProvider(cachePath=self.__cachePath,
                                       useCache=True,
                                       reloadDb=False,
                                       fromDb=False)
            ok = ptP.testCache()
            self.assertTrue(ok)
            #
            ok = ptP.backup(self.__cfgOb, self.__configName)
            self.assertTrue(ok)
            #
            ok = ptP.restore(self.__cfgOb, self.__configName)
            self.assertTrue(ok)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skipIf(skipFull, "Very long test")
    def testExportPharosTargetFastaTax(self):
        try:
            ptP = PharosTargetProvider(cachePath=self.__cachePath,
                                       useCache=True,
                                       reloadDb=False)
            ok = ptP.testCache()
            self.assertTrue(ok)
            #
            fastaPath = os.path.join(HERE, "test-output",
                                     "pharos-targets.fa")
            taxonPath = os.path.join(HERE, "test-output",
                                     "pharos-targets-taxon.tdd")
            ok = ptP.exportProteinFasta(fastaPath, taxonPath, addTaxonomy=True)
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Example #9
def main():
    parser = argparse.ArgumentParser()
    #
    defaultConfigName = "site_info_configuration"
    #
    parser.add_argument("--full",
                        default=False,
                        action="store_true",
                        help="Fresh full load in a new tables/collections")
    parser.add_argument(
        "--replace",
        default=False,
        action="store_true",
        help="Load with replacement in an existing table/collection (default)")
    #
    parser.add_argument(
        "--load_chem_comp_ref",
        default=False,
        action="store_true",
        help="Load Chemical Component reference definitions (public subset)")
    parser.add_argument(
        "--load_chem_comp_core_ref",
        default=False,
        action="store_true",
        help=
        "Load Chemical Component Core reference definitions (public subset)")
    parser.add_argument(
        "--load_bird_chem_comp_ref",
        default=False,
        action="store_true",
        help=
        "Load Bird Chemical Component reference definitions (public subset)")
    parser.add_argument(
        "--load_bird_chem_comp_core_ref",
        default=False,
        action="store_true",
        help=
        "Load Bird Chemical Component Core reference definitions (public subset)"
    )
    parser.add_argument("--load_bird_ref",
                        default=False,
                        action="store_true",
                        help="Load Bird reference definitions (public subset)")
    parser.add_argument(
        "--load_bird_family_ref",
        default=False,
        action="store_true",
        help="Load Bird Family reference definitions (public subset)")
    parser.add_argument("--load_entry_data",
                        default=False,
                        action="store_true",
                        help="Load PDBx entry data (current released subset)")
    parser.add_argument(
        "--load_pdbx_core",
        default=False,
        action="store_true",
        help="Load all PDBx core collections (current released subset)")
    parser.add_argument(
        "--load_pdbx_core_merge",
        default=False,
        action="store_true",
        help=
        "Load all PDBx core collections with merged content (current released subset)"
    )
    #
    parser.add_argument("--load_pdbx_core_entry",
                        default=False,
                        action="store_true",
                        help="Load PDBx core entry (current released subset)")
    parser.add_argument("--load_pdbx_core_entity",
                        default=False,
                        action="store_true",
                        help="Load PDBx core entity (current released subset)")
    parser.add_argument(
        "--load_pdbx_core_entity_monomer",
        default=False,
        action="store_true",
        help="Load PDBx core entity monomer (current released subset)")
    parser.add_argument(
        "--load_pdbx_core_assembly",
        default=False,
        action="store_true",
        help="Load PDBx core assembly (current released subset)")
    parser.add_argument(
        "--load_ihm_dev",
        default=False,
        action="store_true",
        help="Load I/HM DEV model data (current released subset)")
    #
    parser.add_argument("--config_path",
                        default=None,
                        help="Path to configuration options file")
    parser.add_argument("--config_name",
                        default=defaultConfigName,
                        help="Configuration section name")

    parser.add_argument("--db_type",
                        default="mongo",
                        help="Database server type (default=mongo)")

    parser.add_argument(
        "--document_style",
        default="rowwise_by_name_with_cardinality",
        help="Document organization (rowwise_by_name_with_cardinality|rowwise_by_name|columnwise_by_name|rowwise_by_id|rowwise_no_name)",
    )
    parser.add_argument("--read_back_check",
                        default=False,
                        action="store_true",
                        help="Perform read back check on all documents")
    parser.add_argument("--schema_level",
                        default=None,
                        help="Schema validation level (full|min default=None)")
    #
    parser.add_argument(
        "--load_file_list_path",
        default=None,
        help=
        "Input file containing load file path list (override automatic repository scan)"
    )
    parser.add_argument(
        "--fail_file_list_path",
        default=None,
        help="Output file containing file paths that fail to load")
    parser.add_argument(
        "--save_file_list_path",
        default=None,
        help="Save repo file paths from automatic file system scan in this path"
    )

    parser.add_argument("--num_proc",
                        default=2,
                        help="Number of processes to execute (default=2)")
    parser.add_argument("--chunk_size",
                        default=10,
                        help="Number of files loaded per process")
    parser.add_argument("--file_limit",
                        default=None,
                        help="Load file limit for testing")
    parser.add_argument("--prune_document_size",
                        default=None,
                        help="Prune large documents to this size limit (MB)")
    parser.add_argument("--debug",
                        default=False,
                        action="store_true",
                        help="Turn on verbose logging")
    parser.add_argument("--mock",
                        default=False,
                        action="store_true",
                        help="Use MOCK repository configuration for testing")
    parser.add_argument("--cache_path",
                        default=None,
                        help="Cache path for resource files")
    parser.add_argument("--rebuild_cache",
                        default=False,
                        action="store_true",
                        help="Rebuild cached resource files")
    parser.add_argument("--rebuild_schema",
                        default=False,
                        action="store_true",
                        help="Rebuild schema on-the-fly if not cached")
    parser.add_argument("--vrpt_repo_path",
                        default=None,
                        help="Path to validation report repository")
    args = parser.parse_args()
    #
    debugFlag = args.debug
    if debugFlag:
        logger.setLevel(logging.DEBUG)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #                                       Configuration Details
    configPath = args.config_path
    configName = args.config_name
    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuation path %s (%s)", configPath,
                        configName)
        else:
            logger.error("Missing or access issue with config file %r",
                         configPath)
            exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb",
                                   "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath,
                           defaultSectionName=defaultConfigName,
                           mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)

        #
        if args.vrpt_repo_path:
            vrptPath = args.vrpt_repo_path
            if not os.access(vrptPath, os.R_OK):
                logger.error("Unreadable validation report repository path %r",
                             vrptPath)
            envName = cfgOb.get("VRPT_REPO_PATH_ENV", sectionName=configName)
            os.environ[envName] = vrptPath
            logger.info("Using alternate validation report path %s",
                        os.getenv(envName))

    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s",
                     configPath, str(e))
        exit(1)

    #
    try:
        readBackCheck = args.read_back_check
        numProc = int(args.num_proc)
        chunkSize = int(args.chunk_size)
        fileLimit = int(args.file_limit) if args.file_limit else None
        failedFilePath = args.fail_file_list_path
        fPath = args.load_file_list_path
        schemaLevel = args.schema_level if args.schema_level in [
            "min", "full", "minimum"
        ] else None
        loadType = "full" if args.full else "replace"
        loadType = "replace" if args.replace else "full"
        saveInputFileListPath = args.save_file_list_path
        pruneDocumentSize = float(
            args.prune_document_size) if args.prune_document_size else None
        cachePath = args.cache_path if args.cache_path else "."
        cachePath = os.path.abspath(cachePath)
        rebuildCache = args.rebuild_cache if args.rebuild_cache else False
        rebuildSchemaFlag = args.rebuild_schema if args.rebuild_schema else False
        if args.document_style not in [
                "rowwise_by_name", "rowwise_by_name_with_cardinality",
                "columnwise_by_name", "rowwise_by_id", "rowwise_no_name"
        ]:
            logger.error("Unsupported document style %s", args.document_style)
        if args.db_type != "mongo":
            logger.error("Unsupported database server type %s", args.db_type)
    except Exception as e:
        logger.exception("Argument processing problem %s", str(e))
        parser.print_help(sys.stderr)
        exit(1)

    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #  Rebuild or check resource cache
    okS = True
    ok = buildResourceCache(cfgOb,
                            configName,
                            cachePath,
                            rebuildCache=rebuildCache)
    if not ok:
        logger.error("Cache rebuild or check failure (rebuild %r) %r",
                     rebuildCache, cachePath)
        exit(1)

    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    # Read any input path lists -
    #
    inputPathList = None
    if fPath:
        mu = MarshalUtil(workPath=cachePath)
        inputPathList = mu.doImport(fPath, fmt="list")
        if not inputPathList:
            logger.error("Missing or empty input file path list %s", fPath)
            exit(1)
    #
    ##
    if args.db_type == "mongo":
        mw = PdbxLoader(
            cfgOb,
            cachePath,
            resourceName="MONGO_DB",
            numProc=numProc,
            chunkSize=chunkSize,
            fileLimit=fileLimit,
            verbose=debugFlag,
            readBackCheck=readBackCheck,
            rebuildSchemaFlag=rebuildSchemaFlag,
        )

        if args.load_chem_comp_ref:
            ok = mw.load(
                "chem_comp",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_chem_comp_core_ref:
            ok = mw.load(
                "chem_comp_core",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_bird_chem_comp_ref:
            ok = mw.load(
                "bird_chem_comp",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_bird_chem_comp_core_ref:
            ok = mw.load(
                "bird_chem_comp_core",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_bird_ref:
            ok = mw.load(
                "bird",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_bird_family_ref:
            ok = mw.load(
                "bird_family",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["BIRD_FAMILY_PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_entry_data:
            ok = mw.load(
                "pdbx",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_pdbx_core:
            ok = mw.load(
                "pdbx_core",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)
        #
        if args.load_pdbx_core_merge:
            ok = mw.load(
                "pdbx_core",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
                mergeContentTypes=["vrpt"],
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)
        #
        if args.load_pdbx_core_entity:
            ok = mw.load(
                "pdbx_core",
                collectionLoadList=["pdbx_core_entity"],
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)
        #
        if args.load_pdbx_core_entity_monomer:
            ok = mw.load(
                "pdbx_core",
                collectionLoadList=["pdbx_core_entity_monomer"],
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)
        #
        if args.load_pdbx_core_entry:
            ok = mw.load(
                "pdbx_core",
                collectionLoadList=["pdbx_core_entry"],
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_pdbx_core_assembly:
            ok = mw.load(
                "pdbx_core",
                collectionLoadList=["pdbx_core_assembly"],
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_ihm_dev:
            ok = mw.load(
                "ihm_dev",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)
        #
        logger.info("Operation completed with status %r " % ok and okS)
Example #10
class ExDbWorkflow(object):
    def __init__(self, **kwargs):
        #  Configuration Details
        configPath = kwargs.get("configPath", "exdb-config-example.yml")
        self.__configName = kwargs.get("configName", "site_info_configuration")
        mockTopPath = kwargs.get("mockTopPath", None)
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=self.__configName, mockTopPath=mockTopPath)
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__cachePath = os.path.abspath(self.__cachePath)
        self.__debugFlag = kwargs.get("debugFlag", False)
        if self.__debugFlag:
            logger.setLevel(logging.DEBUG)
        #
        #  Rebuild or check resource cache
        rebuildCache = kwargs.get("rebuildCache", False)
        self.__useCache = not rebuildCache
        restoreUseGit = kwargs.get("restoreUseGit", True)
        restoreUseStash = kwargs.get("restoreUseStash", True)
        providerTypeExclude = kwargs.get("providerTypeExclude", None)
        #
        self.__cacheStatus = True
        if rebuildCache:
            self.__cacheStatus = self.buildResourceCache(
                rebuildCache=rebuildCache,
                providerTypeExclude=providerTypeExclude,
                restoreUseGit=restoreUseGit,
                restoreUseStash=restoreUseStash,
            )
            logger.debug("Cache status if %r", self.__cacheStatus)
        #

    def load(self, op, **kwargs):
        logger.info("Starting operation %r\n", op)
        if not self.__cacheStatus:
            logger.error("Resource cache test or rebuild has failed - exiting")
            return False
        # argument processing
        if op not in ["etl_tree_node_lists", "etl_chemref", "etl_uniprot_core", "upd_ref_seq", "upd_ref_seq_comp_models", "refresh_pubchem"]:
            logger.error("Unsupported operation %r - exiting", op)
            return False
        try:
            # test mode and UniProt accession primary match minimum count for doReferenceSequenceUpdate()
            testMode = kwargs.get("testMode", False)
            minMatchPrimaryPercent = kwargs.get("minMatchPrimaryPercent", None)
            minMissing = kwargs.get("minMissing", 0)
            #
            readBackCheck = kwargs.get("readBackCheck", False)
            numProc = int(kwargs.get("numProc", 1))
            chunkSize = int(kwargs.get("chunkSize", 10))
            refChunkSize = int(kwargs.get("refChunkSize", 100))
            documentLimit = int(kwargs.get("documentLimit")) if "documentLimit" in kwargs else None
            loadType = kwargs.get("loadType", "full")  # or replace
            dbType = kwargs.get("dbType", "mongo")
            tU = TimeUtil()
            dataSetId = kwargs.get("dataSetId") if "dataSetId" in kwargs else tU.getCurrentWeekSignature()
            #  Rebuild or reuse reference sequence cache
            rebuildSequenceCache = kwargs.get("rebuildSequenceCache", False)
            useSequenceCache = not rebuildSequenceCache
            #
        except Exception as e:
            logger.exception("Argument or configuration processing failing with %s", str(e))
            return False
        #
        okS = ok = False
        if dbType == "mongo":
            if op == "etl_tree_node_lists":
                rhw = TreeNodeListWorker(
                    self.__cfgOb,
                    self.__cachePath,
                    numProc=numProc,
                    chunkSize=chunkSize,
                    documentLimit=documentLimit,
                    verbose=self.__debugFlag,
                    readBackCheck=readBackCheck,
                    useCache=self.__useCache,
                )
                ok = rhw.load(dataSetId, loadType=loadType)
                okS = self.loadStatus(rhw.getLoadStatus(), readBackCheck=readBackCheck)

            elif op == "etl_chemref":
                crw = ChemRefEtlWorker(
                    self.__cfgOb,
                    self.__cachePath,
                    numProc=numProc,
                    chunkSize=chunkSize,
                    documentLimit=documentLimit,
                    verbose=self.__debugFlag,
                    readBackCheck=readBackCheck,
                    useCache=self.__useCache,
                )
                ok = crw.load(dataSetId, extResource="DrugBank", loadType=loadType)
                okS = self.loadStatus(crw.getLoadStatus(), readBackCheck=readBackCheck)

            elif op == "etl_uniprot_core":
                crw = UniProtCoreEtlWorker(
                    self.__cfgOb,
                    self.__cachePath,
                    numProc=numProc,
                    chunkSize=chunkSize,
                    documentLimit=documentLimit,
                    verbose=self.__debugFlag,
                    readBackCheck=readBackCheck,
                    useCache=self.__useCache,
                )
                ok = crw.load(dataSetId, extResource="UniProt", loadType=loadType)
                okS = self.loadStatus(crw.getLoadStatus(), readBackCheck=readBackCheck)

            elif op == "upd_ref_seq":
                databaseName = "pdbx_core"
                collectionName = "pdbx_core_polymer_entity"
                polymerType = "Protein"
                ok = self.doReferenceSequenceUpdate(
                    databaseName,
                    collectionName,
                    polymerType,
                    fetchLimit=documentLimit,
                    useSequenceCache=useSequenceCache,
                    testMode=testMode,
                    minMatchPrimaryPercent=minMatchPrimaryPercent,
                    minMissing=minMissing,
                    refChunkSize=refChunkSize,
                )
                okS = ok
            elif op == "upd_ref_seq_comp_models":
                databaseName = "pdbx_comp_model_core"
                collectionName = "pdbx_comp_model_core_polymer_entity"
                polymerType = "Protein"
                ok = self.doReferenceSequenceUpdate(
                    databaseName,
                    collectionName,
                    polymerType,
                    fetchLimit=documentLimit,
                    useSequenceCache=useSequenceCache,
                    testMode=testMode,
                    minMatchPrimaryPercent=minMatchPrimaryPercent,
                    minMissing=minMissing,
                    refChunkSize=refChunkSize,
                )
                okS = ok
        #
        logger.info("Completed operation %r with status %r\n", op, ok and okS)
        return ok and okS

    def loadStatus(self, statusList, readBackCheck=True):
        ret = False
        try:
            dl = DocumentLoader(self.__cfgOb, self.__cachePath, "MONGO_DB", numProc=1, chunkSize=2, documentLimit=None, verbose=False, readBackCheck=readBackCheck)
            #
            sectionName = "data_exchange_configuration"
            databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
            collectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=sectionName)
            ret = dl.load(databaseName, collectionName, loadType="append", documentList=statusList, indexAttributeList=["update_id", "database_name", "object_name"], keyNames=None)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ret

    def buildResourceCache(self, rebuildCache=False, providerTypeExclude=None, restoreUseGit=True, restoreUseStash=True):
        """Generate and cache resource dependencies."""
        ret = False
        try:
            rp = DictMethodResourceProvider(
                self.__cfgOb,
                configName=self.__configName,
                cachePath=self.__cachePath,
                providerTypeExclude=providerTypeExclude,
                restoreUseGit=restoreUseGit,
                restoreUseStash=restoreUseStash,
            )
            ret = rp.cacheResources(useCache=not rebuildCache, doBackup=False)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ret

    def doReferenceSequenceUpdate(
        self, databaseName, collectionName, polymerType, fetchLimit=None, useSequenceCache=False, testMode=False, minMatchPrimaryPercent=None, minMissing=0, refChunkSize=50, **kwargs
    ):
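        # Refresh reference sequence (UniProt) assignments for polymer
        # entities in the named collection via the annotation provider below.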
        try:
            _ = kwargs
            _ = testMode
            # -------
            rsaP = ReferenceSequenceAnnotationProvider(
                self.__cfgOb, databaseName, collectionName, polymerType, useCache=useSequenceCache, cachePath=self.__cachePath, maxChunkSize=refChunkSize
            )
            ok = rsaP.testCache(minMatchPrimaryPercent=minMatchPrimaryPercent, minMissing=minMissing)
            if ok:
                rsa = ReferenceSequenceAnnotationAdapter(rsaP)
                obTr = ObjectTransformer(self.__cfgOb, objectAdapter=rsa)
                ok = obTr.doTransform(
                    databaseName=databaseName, collectionName=collectionName, fetchLimit=fetchLimit, selectionQuery={"entity_poly.rcsb_entity_polymer_type": polymerType}
                )
            else:
                logger.error("Reference sequence data cache build failing")
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
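A brief usage sketch for the workflow class above (path and option values are illustrative only; the keyword names mirror those read in __init__() and load()):

wf = ExDbWorkflow(configPath="exdb-config-example.yml", cachePath="./CACHE")
ok = wf.load("etl_chemref", numProc=2, chunkSize=10, loadType="full")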
Example #11
class RepoLoadWorkflow(object):
    def __init__(self, **kwargs):
        #  Configuration Details
        configPath = kwargs.get("configPath", "exdb-config-example.yml")
        self.__configName = kwargs.get("configName", "site_info_configuration")
        mockTopPath = kwargs.get("mockTopPath", None)
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=self.__configName,
                                  mockTopPath=mockTopPath)
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__cachePath = os.path.abspath(self.__cachePath)
        self.__debugFlag = kwargs.get("debugFlag", False)
        if self.__debugFlag:
            logger.setLevel(logging.DEBUG)
        #
        #  Rebuild or check resource cache
        # rebuildCache = kwargs.get("rebuildCache", False)
        # self.__cacheStatus = self.buildResourceCache(rebuildCache=rebuildCache)
        # logger.debug("Cache status if %r", self.__cacheStatus)
        #

    def load(self, op, **kwargs):
        # if not self.__cacheStatus:
        #    logger.error("Resource cache test or rebuild has failed - exiting")
        #    return False
        # argument processing
        if op not in [
                "pdbx-loader", "etl-repository-holdings",
                "etl-entity-sequence-clusters"
        ]:
            logger.error("Unsupported operation %r - exiting", op)
            return False
        try:
            readBackCheck = kwargs.get("readBackCheck", False)
            numProc = int(kwargs.get("numProc", 1))
            chunkSize = int(kwargs.get("chunkSize", 10))
            fileLimit = int(
                kwargs.get("fileLimit")) if "fileLimit" in kwargs else None
            documentLimit = int(kwargs.get(
                "documentLimit")) if "documentLimit" in kwargs else None
            failedFilePath = kwargs.get("failFileListPath", None)
            loadFileListPath = kwargs.get("loadFileListPath", None)
            saveInputFileListPath = kwargs.get("saveFileListPath", None)
            schemaLevel = kwargs.get("schemaLevel",
                                     "min") if kwargs.get("schemaLevel") in [
                                         "min", "full"
                                     ] else "min"
            loadType = kwargs.get("loadType", "full")  # or replace
            updateSchemaOnReplace = kwargs.get("updateSchemaOnReplace", True)
            pruneDocumentSize = float(
                kwargs.get("pruneDocumentSize"
                           )) if "pruneDocumentSize" in kwargs else None

            # "Document organization (rowwise_by_name_with_cardinality|rowwise_by_name|columnwise_by_name|rowwise_by_id|rowwise_no_name",
            documentStyle = kwargs.get("documentStyle",
                                       "rowwise_by_name_with_cardinality")
            dbType = kwargs.get("dbType", "mongo")
            #
            databaseName = kwargs.get("databaseName", None)
            databaseNameList = self.__cfgOb.get(
                "DATABASE_NAMES_ALL",
                sectionName="database_catalog_configuration").split(",")
            collectionNameList = kwargs.get("collectionNameList", None)
            mergeValidationReports = kwargs.get("mergeValidationReports", True)
            #
            tU = TimeUtil()
            dataSetId = kwargs.get(
                "dataSetId"
            ) if "dataSetId" in kwargs else tU.getCurrentWeekSignature()
            seqDataLocator = self.__cfgOb.getPath(
                "RCSB_SEQUENCE_CLUSTER_DATA_PATH",
                sectionName=self.__configName)
            sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH",
                                               sectionName=self.__configName)

        except Exception as e:
            logger.exception(
                "Argument and configuration processing failing with %s",
                str(e))
            return False
        #

        if op == "pdbx-loader" and dbType == "mongo" and databaseName in databaseNameList:
            okS = True
            try:
                inputPathList = None
                if loadFileListPath:
                    mu = MarshalUtil(workPath=self.__cachePath)
                    inputPathList = mu.doImport(loadFileListPath, fmt="list")
                    if not inputPathList:
                        logger.error(
                            "Operation %r missing or empty input file path list %s - exiting",
                            op, loadFileListPath)
                        return False
            except Exception as e:
                logger.exception(
                    "Operation %r processing input path list failing with %s",
                    op, str(e))
                return False
            #
            try:
                mw = PdbxLoader(
                    self.__cfgOb,
                    self.__cachePath,
                    resourceName="MONGO_DB",
                    numProc=numProc,
                    chunkSize=chunkSize,
                    fileLimit=fileLimit,
                    verbose=self.__debugFlag,
                    readBackCheck=readBackCheck,
                )
                ok = mw.load(
                    databaseName,
                    collectionLoadList=collectionNameList,
                    loadType=loadType,
                    inputPathList=inputPathList,
                    styleType=documentStyle,
                    dataSelectors=["PUBLIC_RELEASE"],
                    failedFilePath=failedFilePath,
                    saveInputFileListPath=saveInputFileListPath,
                    pruneDocumentSize=pruneDocumentSize,
                    validationLevel=schemaLevel,
                    mergeContentTypes=["vrpt"]
                    if mergeValidationReports else None,
                    updateSchemaOnReplace=updateSchemaOnReplace,
                )
                okS = self.loadStatus(mw.getLoadStatus(),
                                      readBackCheck=readBackCheck)
            except Exception as e:
                logger.exception("Operation %r database %r failing with %s",
                                 op, databaseName, str(e))
        elif op == "etl-entity-sequence-clusters" and dbType == "mongo":
            cw = SequenceClustersEtlWorker(self.__cfgOb,
                                           numProc=numProc,
                                           chunkSize=chunkSize,
                                           documentLimit=documentLimit,
                                           verbose=self.__debugFlag,
                                           readBackCheck=readBackCheck,
                                           workPath=self.__cachePath)
            ok = cw.etl(dataSetId, seqDataLocator, loadType=loadType)
            okS = self.loadStatus(cw.getLoadStatus(),
                                  readBackCheck=readBackCheck)
        elif op == "etl-repository-holdings" and dbType == "mongo":
            rhw = RepoHoldingsEtlWorker(
                self.__cfgOb,
                sandboxPath,
                self.__cachePath,
                numProc=numProc,
                chunkSize=chunkSize,
                documentLimit=documentLimit,
                verbose=self.__debugFlag,
                readBackCheck=readBackCheck,
            )
            ok = rhw.load(dataSetId, loadType=loadType)
            okS = self.loadStatus(rhw.getLoadStatus(),
                                  readBackCheck=readBackCheck)

        logger.info("Completed operation %r with status %r", op, ok and okS)

        return ok and okS

    def loadStatus(self, statusList, readBackCheck=True):
        ret = False
        try:
            dl = DocumentLoader(self.__cfgOb,
                                self.__cachePath,
                                "MONGO_DB",
                                numProc=1,
                                chunkSize=2,
                                documentLimit=None,
                                verbose=False,
                                readBackCheck=readBackCheck)
            #
            sectionName = "data_exchange_configuration"
            databaseName = self.__cfgOb.get("DATABASE_NAME",
                                            sectionName=sectionName)
            collectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS",
                                              sectionName=sectionName)
            ret = dl.load(databaseName,
                          collectionName,
                          loadType="append",
                          documentList=statusList,
                          indexAttributeList=[
                              "update_id", "database_name", "object_name"
                          ],
                          keyNames=None)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ret

    def buildResourceCache(self, rebuildCache=False):
        """Generate and cache resource dependencies.
        """
        ret = False
        try:
            useCache = not rebuildCache
            logger.info("Cache setting useCache is %r", useCache)
            rp = DictMethodResourceProvider(self.__cfgOb,
                                            configName=self.__configName,
                                            cachePath=self.__cachePath)
            ret = rp.cacheResources(useCache=useCache)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ret
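The loadStatus() helper above captures the status-record pattern used throughout these examples: after any load, the accumulated status documents are appended to the collection named by COLLECTION_UPDATE_STATUS in the data_exchange_configuration section. A standalone sketch of that append, assuming cfgOb is a ConfigUtil instance built as in the setUp() methods below, and assuming the rcsb.db.mongo.DocumentLoader import path (not shown in this fragment):

from rcsb.db.mongo.DocumentLoader import DocumentLoader

def appendLoadStatus(cfgOb, cachePath, statusList, readBackCheck=True):
    # Append status documents to the data-exchange status collection,
    # mirroring the loadStatus() method above.
    dl = DocumentLoader(cfgOb, cachePath, "MONGO_DB", numProc=1, chunkSize=2,
                        documentLimit=None, verbose=False,
                        readBackCheck=readBackCheck)
    sectionName = "data_exchange_configuration"
    databaseName = cfgOb.get("DATABASE_NAME", sectionName=sectionName)
    collectionName = cfgOb.get("COLLECTION_UPDATE_STATUS",
                               sectionName=sectionName)
    return dl.load(databaseName, collectionName, loadType="append",
                   documentList=statusList,
                   indexAttributeList=["update_id", "database_name", "object_name"],
                   keyNames=None)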
Example #12
class PdbxLoaderRemoteTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(PdbxLoaderRemoteTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        #
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        #
        self.__resourceName = "MONGO_DB"
        self.__failedFilePath = os.path.join(HERE, "test-output",
                                             "failed-list.txt")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__readBackCheck = True
        self.__numProc = 2
        self.__chunkSize = 5
        self.__fileLimit = 5
        self.__documentStyle = "rowwise_by_name_with_cardinality"
        self.__ldList = [
            # {"databaseName": "chem_comp_core", "collectionNameList": None, "loadType": "full", "mergeContentTypes": None, "validationLevel": "min"},
            {
                "databaseName": "bird_chem_comp_core",
                "collectionNameList": None,
                "loadType": "full",
                "mergeContentTypes": None,
                "validationLevel": "full",
                "updateSchemaOnReplace": False,
                "status": True,
            },
            {
                "databaseName": "bird_chem_comp_core",
                "collectionNameList": None,
                "loadType": "replace",
                "mergeContentTypes": None,
                "validationLevel": "full",
                "updateSchemaOnReplace": True,
                "status": True,
            },
            {
                "databaseName": "pdbx_core",
                "collectionNameList": None,
                "loadType": "full",
                "mergeContentTypes": ["vrpt"],
                "validationLevel": "full",
                "updateSchemaOnReplace": False,
                "status": True,
            },
            # {
            #    "databaseName": "pdbx_core",
            #    "collectionNameList": None,
            #    "loadType": "replace",
            #    "mergeContentTypes": ["vrpt"],
            #    "validationLevel": "full",
            #    "updateSchemaOnReplace": True,
            #    "status": True,
            # },
        ]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 1.0e6,
                    unitS)
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", self.id(),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def testPdbxLoader(self):
        for ld in self.__ldList:
            self.__pdbxLoaderWrapper(**ld)

    def __pdbxLoaderWrapper(self, **kwargs):
        """Wrapper for PDBx loader module"""
        try:
            logger.info("Loading %s", kwargs["databaseName"])
            mw = PdbxLoader(
                self.__cfgOb,
                cachePath=self.__cachePath,
                resourceName=self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                fileLimit=self.__fileLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
                maxStepLength=2000,
                useSchemaCache=True,
                rebuildSchemaFlag=False,
            )
            ok = mw.load(
                kwargs["databaseName"],
                collectionLoadList=kwargs["collectionNameList"],
                loadType=kwargs["loadType"],
                inputPathList=None,
                styleType=self.__documentStyle,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=self.__failedFilePath,
                saveInputFileListPath=None,
                pruneDocumentSize=None,
                logSize=False,
                validationLevel=kwargs["validationLevel"],
                mergeContentTypes=kwargs["mergeContentTypes"],
                useNameFlag=False,
                updateSchemaOnReplace=kwargs["updateSchemaOnReplace"],
                restoreUseStash=False,
                restoreUseGit=True,
                providerTypeExclude=self.__excludeType,
            )
            self.assertEqual(ok, kwargs["status"])
            ok = self.__loadStatus(mw.getLoadStatus())
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __loadStatus(self, statusList):
        sectionName = "data_exchange_configuration"
        dl = DocumentLoader(
            self.__cfgOb,
            self.__cachePath,
            resourceName=self.__resourceName,
            numProc=self.__numProc,
            chunkSize=self.__chunkSize,
            documentLimit=None,
            verbose=self.__verbose,
            readBackCheck=self.__readBackCheck,
        )
        #
        databaseName = self.__cfgOb.get("DATABASE_NAME",
                                        sectionName=sectionName)
        collectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS",
                                          sectionName=sectionName)
        ok = dl.load(
            databaseName,
            collectionName,
            loadType="append",
            documentList=statusList,
            indexAttributeList=["update_id", "database_name", "object_name"],
            keyNames=None)
        return ok
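Stripped of the test bookkeeping, the wrapper above reduces to a small core: construct a PdbxLoader and call load() with a database name, load type, document style, and data selector. A minimal sketch under the same ConfigUtil setup as setUp(); the rcsb.db.mongo.PdbxLoader import path is an assumption, and the remaining keyword arguments shown in the wrapper are tuning and validation options left at their defaults here:

from rcsb.db.mongo.PdbxLoader import PdbxLoader

def fullLoad(cfgOb, cachePath, databaseName):
    # Full load of one database restricted to publicly released entries.
    mw = PdbxLoader(cfgOb, cachePath=cachePath, resourceName="MONGO_DB",
                    numProc=2, chunkSize=5)
    return mw.load(databaseName, loadType="full",
                   styleType="rowwise_by_name_with_cardinality",
                   dataSelectors=["PUBLIC_RELEASE"])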
Example #13
class RepoHoldingsRemoteLoaderTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(RepoHoldingsRemoteLoaderTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        #
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)

        self.__resourceName = "MONGO_DB"
        self.__readBackCheck = True
        self.__numProc = 2
        self.__chunkSize = 10
        self.__documentLimit = None
        self.__filterType = "assign-dates"
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH",
                                                  sectionName=configName)
        # sample data set
        self.__updateId = "2021_36"
        #
        eiP = EntryInfoProvider(cachePath=self.__cachePath, useCache=True)
        ok = eiP.testCache(minCount=0)
        self.assertTrue(ok)
        ok = eiP.restore(self.__cfgOb, configName, useStash=False, useGit=True)
        self.assertTrue(ok)
        ok = eiP.reload()
        self.assertTrue(ok)

        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testLoadHoldingsRemote(self):
        """Test case - load legacy repository holdings and status data -

        [repository_holdings]
        DATABASE_NAME=repository_holdings
        DATABASE_VERSION_STRING=v5
        COLLECTION_HOLDINGS_UPDATE=rcsb_repository_holdings_update_entry
        COLLECTION_HOLDINGS_CURRENT=rcsb_repository_holdings_current_entry
        COLLECTION_HOLDINGS_UNRELEASED=rcsb_repository_holdings_unreleased_entry
        COLLECTION_HOLDINGS_REMOVED=rcsb_repository_holdings_removed_entry
        COLLECTION_HOLDINGS_COMBINED=rcsb_repository_holdings_combined_entry

        """
        try:
            sectionName = "repository_holdings_configuration"
            rhdp = RepoHoldingsRemoteDataPrep(cachePath=self.__cachePath,
                                              filterType=self.__filterType)
            #
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            databaseName = self.__cfgOb.get("DATABASE_NAME",
                                            sectionName=sectionName)
            logger.info("databaseName %r", databaseName)
            addValues = None
            #
            maxDoc = 5
            dList = rhdp.getHoldingsRemovedEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_REMOVED",
                                              sectionName=sectionName)
            ok = dl.load(databaseName,
                         collectionName,
                         loadType="full",
                         documentList=dList,
                         indexAttributeList=["update_id", "entry_id"],
                         keyNames=None,
                         addValues=addValues)
            logger.info("Collection %r length %d load status %r",
                        collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
            dList = rhdp.getHoldingsUnreleasedEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_UNRELEASED",
                                              sectionName=sectionName)
            ok = dl.load(databaseName,
                         collectionName,
                         loadType="full",
                         documentList=dList,
                         indexAttributeList=["update_id", "entry_id"],
                         keyNames=None,
                         addValues=addValues)
            logger.info("Collection %r length %d load status %r",
                        collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
            dList = rhdp.getHoldingsUpdateEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_UPDATE",
                                              sectionName=sectionName)
            logger.info("collectionName %r", collectionName)
            ok = dl.load(databaseName,
                         collectionName,
                         loadType="full",
                         documentList=dList,
                         indexAttributeList=["update_id", "entry_id"],
                         keyNames=None,
                         addValues=addValues)
            logger.info("Collection %r length %d load status %r",
                        collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
            dList = rhdp.getHoldingsCurrentEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_CURRENT",
                                              sectionName=sectionName)
            ok = dl.load(databaseName,
                         collectionName,
                         loadType="full",
                         documentList=dList,
                         indexAttributeList=["update_id", "entry_id"],
                         keyNames=None,
                         addValues=addValues)
            logger.info("Collection %r length %d load status %r",
                        collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
            dList = rhdp.getHoldingsCombinedEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_COMBINED",
                                              sectionName=sectionName)
            ok = dl.load(databaseName,
                         collectionName,
                         loadType="full",
                         documentList=dList,
                         indexAttributeList=["update_id", "entry_id"],
                         keyNames=None,
                         addValues=addValues)
            logger.info("Collection %r length %d load status %r",
                        collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
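The five load blocks above differ only in the prep method and the configuration key naming the target collection, so the body of the test can be collapsed into one data-driven loop. A sketch of that refactoring, using only objects already constructed in the test:

            # Equivalent loop form; assumes the rhdp, dl, databaseName,
            # sectionName, maxDoc, and addValues bindings created above.
            loadPlan = [
                (rhdp.getHoldingsRemovedEntry, "COLLECTION_HOLDINGS_REMOVED"),
                (rhdp.getHoldingsUnreleasedEntry, "COLLECTION_HOLDINGS_UNRELEASED"),
                (rhdp.getHoldingsUpdateEntry, "COLLECTION_HOLDINGS_UPDATE"),
                (rhdp.getHoldingsCurrentEntry, "COLLECTION_HOLDINGS_CURRENT"),
                (rhdp.getHoldingsCombinedEntry, "COLLECTION_HOLDINGS_COMBINED"),
            ]
            for fetchMethod, collectionKey in loadPlan:
                dList = fetchMethod(updateId=self.__updateId)
                dList = dList[:maxDoc] if maxDoc else dList
                collectionName = self.__cfgOb.get(collectionKey,
                                                  sectionName=sectionName)
                ok = dl.load(databaseName, collectionName, loadType="full",
                             documentList=dList,
                             indexAttributeList=["update_id", "entry_id"],
                             keyNames=None, addValues=addValues)
                self.assertTrue(ok)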
Example #14
class SchemaDefDataPrepTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(SchemaDefDataPrepTests, self).__init__(methodName)
        self.__loadPathList = []
        self.__verbose = True

    def setUp(self):
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        self.__numProc = 2
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__outputPath = os.path.join(HERE, "test-output")
        self.__savedOutputPath = os.path.join(HERE, "test-saved-output")

        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__discoveryMode = self.__cfgOb.get("DISCOVERY_MODE",
                                                sectionName=configName,
                                                default="local")
        self.__fileLimit = 100 if self.__discoveryMode == "local" else 10
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        #
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__chemCompMockLen = 24
        self.__pdbxMockLen = 30
        # Removes timestamped data items to allow diffs.
        excludeExtras = ["rcsb_load_status"]
        # excludeExtras = []
        #
        self.__verbose = True
        self.__modulePathMap = self.__cfgOb.get(
            "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        #
        self.__exportFlag = True
        self.__diffFlag = False
        #
        self.__simpleTestCaseList = [
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_no_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeCol,
                "styleType": "columnwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 0,
            },
        ]
        #
        self.__fullTestCaseList = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 0,
                "excludeExtras": excludeExtras,
            },
            {
                "contentType": "bird_chem_comp_core",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": None,
                "rejectLength": 2,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__fullTestCaseListA = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 0,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 1.0e6,
                    unitS)
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", self.id(),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def __timeStep(self, msg):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", msg,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def testSimpleSchemaDefDataPrep(self):
        for tcD in self.__simpleTestCaseList:
            rejectLength = 0 if self.__discoveryMode == "remote" else tcD["rejectLength"]
            mockLength = self.__fileLimit if self.__discoveryMode == "remote" else tcD["mockLength"]
            if tcD["contentType"] == "bird_chem_comp_core" and self.__discoveryMode == "remote":
                logger.info("Skipping %r in discovery mode %r",
                            tcD["contentType"], self.__discoveryMode)
                continue
            self.__simpleSchemaDataPrep(
                tcD["contentType"],
                tcD["filterType"],
                tcD["styleType"],
                mockLength,
                rejectLength=rejectLength,
                mergeContentTypes=tcD["mergeContentTypes"])

    def testFullSchemaDefDataPrep(self):
        for tcD in self.__fullTestCaseList:
            rejectLength = 0 if self.__discoveryMode == "remote" else tcD["rejectLength"]
            mockLength = self.__fileLimit if self.__discoveryMode == "remote" else tcD["mockLength"]
            if tcD["contentType"] == "bird_chem_comp_core" and self.__discoveryMode == "remote":
                logger.info("Skipping %r in discovery mode %r",
                            tcD["contentType"], self.__discoveryMode)
                continue
            self.__fullSchemaDataPrep(
                tcD["contentType"],
                tcD["filterType"],
                tcD["styleType"],
                mockLength,
                rejectLength=rejectLength,
                mergeContentTypes=tcD["mergeContentTypes"],
                excludeExtras=tcD["excludeExtras"],
            )

    def __simpleSchemaDataPrep(self,
                               contentType,
                               filterType,
                               styleType,
                               mockLength,
                               rejectLength=0,
                               dataSelectors=None,
                               mergeContentTypes=None):
        """Internal method for preparing file-based data NOT requiring dynamic methods, slicing, or key injection.

        Args:
            contentType (str): Content type name
            filterType (str): List of data processing options (separated by '|') (e.g. "drop-empty-attributes|drop-empty-tables|skip-max-width|...")
            styleType (str): Organization of the output document (e.g. rowwise_by_name)
            mockLength (int): Expected length of the test data for the input content type
            rejectLength (int, optional): Number of input data sets rejected by the data selection criteria. Defaults to 0.
            dataSelectors (list of str, optional): Data selection criteria. Defaults to None.
            mergeContentTypes (list of str, optional): List of content types to merge with the input data set (e.g. ["vrpt"]). Defaults to None.
        """
        try:
            dataSelectors = dataSelectors if dataSelectors else [
                "PUBLIC_RELEASE"
            ]
            dD = self.__schP.makeSchemaDef(contentType,
                                           dataTyping="ANY",
                                           saveSchema=True)
            _ = SchemaDefAccess(dD)
            inputPathList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContentTypes)
            sd, _, _, _ = self.__schP.getSchemaInfo(databaseName=contentType,
                                                    dataTyping="ANY")
            dtf = DataTransformFactory(schemaDefAccessObj=sd,
                                       filterType=filterType)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd,
                                    dtObj=dtf,
                                    workPath=self.__cachePath,
                                    verbose=self.__verbose)
            #

            logger.debug("For %s mock length %d length of path list %d\n",
                         contentType, mockLength, len(inputPathList))
            self.assertGreaterEqual(len(inputPathList), mockLength)
            tableDataDictList, containerNameList, rejectList = sdp.fetchDocuments(
                inputPathList,
                styleType=styleType,
                filterType=filterType,
                dataSelectors=dataSelectors)
            logger.debug(
                "For %s mock length %d reject length %d length of tddl list %d\n",
                contentType, mockLength, rejectLength, len(tableDataDictList))
            self.assertGreaterEqual(len(tableDataDictList),
                                    mockLength - rejectLength)
            self.assertGreaterEqual(len(containerNameList),
                                    mockLength - rejectLength)

            if rejectList:
                logger.debug("For %s rejecting components %r", contentType,
                             rejectList)
            #
            self.assertEqual(len(rejectList), rejectLength)
            fName = "simple-prep-%s-%s.json" % (contentType, styleType)
            if self.__exportFlag:
                fPath = os.path.join(self.__outputPath, fName)
                self.__mU.doExport(fPath,
                                   tableDataDictList,
                                   fmt="json",
                                   indent=3)
            if self.__diffFlag:
                fPath = os.path.join(self.__savedOutputPath, fName)
                refDocList = self.__mU.doImport(fPath, fmt="json")
                self.assertEqual(len(refDocList), len(tableDataDictList))
                #
                jD = diff(refDocList,
                          tableDataDictList,
                          syntax="explicit",
                          marshal=True)
                if jD:
                    _, fn = os.path.split(fPath)
                    bn, _ = os.path.splitext(fn)
                    fPath = os.path.join(self.__outputPath, bn + "-diff.json")
                    logger.debug("jsondiff for %s %s = \n%s", contentType,
                                 styleType,
                                 pprint.pformat(jD, indent=3, width=100))
                    self.__mU.doExport(fPath, jD, fmt="json", indent=3)
                self.assertEqual(len(jD), 0)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __logDocumentOrder(self, docList):
        for doc in docList:
            logger.debug("keys %r", list(doc.keys()))

    def __filterDocuments(self, docList, excludeList=None):
        excludeList = excludeList if excludeList else []
        for doc in docList:
            for excl in excludeList:
                if excl in doc:
                    del doc[excl]

    def __fullSchemaDataPrep(self,
                             contentType,
                             filterType,
                             styleType,
                             mockLength,
                             rejectLength=0,
                             dataSelectors=None,
                             mergeContentTypes=None,
                             excludeExtras=None):
        """Internal method for preparing file-based data requiring dynamic methods, slicing, or key injection.

        Args:
            contentType (str): Content type name
            filterType (str): List of data processing options (separated by '|') (e.g. "drop-empty-attributes|drop-empty-tables|skip-max-width|...")
            styleType (str): Organization of the output document (e.g. rowwise_by_name)
            mockLength (int): Expected length of the test data for the input content type
            rejectLength (int, optional): Number of input data sets rejected by the data selection criteria. Defaults to 0.
            dataSelectors (list of str, optional): Data selection criteria. Defaults to None.
            mergeContentTypes (list of str, optional): List of content types to merge with the input data set (e.g. ["vrpt"]). Defaults to None.
        """
        try:
            excludeExtras = excludeExtras if excludeExtras else []
            _ = mockLength
            _ = rejectLength
            dD = self.__schP.makeSchemaDef(contentType,
                                           dataTyping="ANY",
                                           saveSchema=True)
            _ = SchemaDefAccess(dD)
            inputPathList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContentTypes)
            sd, _, collectionNameList, _ = self.__schP.getSchemaInfo(
                databaseName=contentType, dataTyping="ANY")
            #
            dP = DictionaryApiProviderWrapper(self.__cachePath,
                                              cfgOb=self.__cfgOb,
                                              configName=self.__configName,
                                              useCache=True)
            dictApi = dP.getApiByName(contentType)
            #
            rP = DictMethodResourceProvider(
                self.__cfgOb,
                configName=self.__configName,
                cachePath=self.__cachePath,
                restoreUseStash=False,
                restoreUseGit=True,
                providerTypeExclude=self.__excludeType,
            )
            dmh = DictMethodRunner(dictApi,
                                   modulePathMap=self.__modulePathMap,
                                   resourceProvider=rP)
            #
            dtf = DataTransformFactory(schemaDefAccessObj=sd,
                                       filterType=filterType)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd,
                                    dtObj=dtf,
                                    workPath=self.__cachePath,
                                    verbose=self.__verbose)
            containerList = self.__rpP.getContainerList(inputPathList)
            for container in containerList:
                cName = container.getName()
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
            #
            for collectionName in collectionNameList:
                tableIdExcludeList = sd.getCollectionExcluded(collectionName)
                tableIdIncludeList = sd.getCollectionSelected(collectionName)
                sliceFilter = sd.getCollectionSliceFilter(collectionName)
                sdp.setSchemaIdExcludeList(tableIdExcludeList)
                sdp.setSchemaIdIncludeList(tableIdIncludeList)
                #
                docList, _, _ = sdp.processDocuments(
                    containerList,
                    styleType=styleType,
                    sliceFilter=sliceFilter,
                    filterType=filterType,
                    dataSelectors=dataSelectors,
                    collectionName=collectionName)

                docList = sdp.addDocumentPrivateAttributes(
                    docList, collectionName)
                docList = sdp.addDocumentSubCategoryAggregates(
                    docList, collectionName)

                # Special exclusions for the test harness. (removes timestamped data items to allow diffs.)
                self.__filterDocuments(docList, excludeExtras)
                mergeS = "-".join(
                    mergeContentTypes) if mergeContentTypes else ""
                fName = "full-prep-%s-%s-%s-%s.json" % (
                    contentType, collectionName, mergeS, styleType)
                if self.__exportFlag:
                    self.__logDocumentOrder(docList)
                    fPath = os.path.join(self.__outputPath, fName)
                    self.__mU.doExport(fPath, docList, fmt="json", indent=3)
                    logger.debug("Exported %r", fPath)
                #
                if self.__diffFlag:
                    fPath = os.path.join(self.__savedOutputPath, fName)
                    refDocList = self.__mU.doImport(fPath, fmt="json")
                    self.assertEqual(len(refDocList), len(docList))
                    logger.debug("For %s %s len refDocList %d", contentType,
                                 collectionName, len(refDocList))
                    logger.debug("For %s %s len docList %d", contentType,
                                 collectionName, len(docList))
                    jD = diff(refDocList,
                              docList,
                              syntax="explicit",
                              marshal=True)
                    if jD:
                        _, fn = os.path.split(fPath)
                        bn, _ = os.path.splitext(fn)
                        fPath = os.path.join(self.__outputPath,
                                             bn + "-diff.json")
                        logger.debug("jsondiff for %s %s = \n%s", contentType,
                                     collectionName,
                                     pprint.pformat(jD, indent=3, width=100))
                        self.__mU.doExport(fPath, jD, fmt="json", indent=3)
                    self.assertEqual(len(jD), 0)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
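Both prep methods above share one pipeline: build or fetch the schema definition, resolve repository locators, construct a DataTransformFactory, and run SchemaDefDataPrep over the inputs. A condensed sketch of that pipeline for a single content type, assuming schP and rpP are the SchemaProvider and RepositoryProvider instances built in setUp():

def prepDocuments(schP, rpP, cachePath, contentType, styleType, filterType):
    # Schema -> locators -> transform -> documents, as exercised above.
    schP.makeSchemaDef(contentType, dataTyping="ANY", saveSchema=True)
    sd, _, _, _ = schP.getSchemaInfo(databaseName=contentType, dataTyping="ANY")
    locatorList = rpP.getLocatorObjList(contentType=contentType)
    dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=filterType)
    sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=cachePath)
    docList, nameList, rejectList = sdp.fetchDocuments(
        locatorList, styleType=styleType, filterType=filterType,
        dataSelectors=["PUBLIC_RELEASE"])
    return docList, nameList, rejectList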
Example #15
def main():
    parser = argparse.ArgumentParser()
    #
    defaultConfigName = "site_info_configuration"
    #
    parser.add_argument(
        "--update_chem_comp_ref",
        default=False,
        action="store_true",
        help="Update schema for Chemical Component reference definitions")
    parser.add_argument(
        "--update_chem_comp_core_ref",
        default=False,
        action="store_true",
        help="Update core schema for Chemical Component reference definitions")
    parser.add_argument(
        "--update_bird_chem_comp_ref",
        default=False,
        action="store_true",
        help="Update schema for Bird Chemical Component reference definitions")
    parser.add_argument(
        "--update_bird_chem_comp_core_ref",
        default=False,
        action="store_true",
        help="Update core schema for Bird Chemical Component reference definitions")

    parser.add_argument("--update_bird_ref",
                        default=False,
                        action="store_true",
                        help="Update schema for Bird reference definitions")
    parser.add_argument(
        "--update_bird_family_ref",
        default=False,
        action="store_true",
        help="Update schema for Bird Family reference definitions")

    parser.add_argument("--update_pdbx",
                        default=False,
                        action="store_true",
                        help="Update schema for PDBx entry data")
    parser.add_argument("--update_pdbx_core",
                        default=False,
                        action="store_true",
                        help="Update schema for PDBx core entry/entity data")
    parser.add_argument(
        "--update_pdbx_comp_model_core",
        default=False,
        action="store_true",
        help="Update schema for PDBx computational model core entry/entity data"
    )
    #
    parser.add_argument("--update_repository_holdings",
                        default=False,
                        action="store_true",
                        help="Update schema for repository holdings")
    parser.add_argument("--update_entity_sequence_clusters",
                        default=False,
                        action="store_true",
                        help="Update schema for entity sequence clusters")
    parser.add_argument("--update_data_exchange",
                        default=False,
                        action="store_true",
                        help="Update schema for data exchange status")
    parser.add_argument("--update_ihm_dev",
                        default=False,
                        action="store_true",
                        help="Update schema for I/HM dev entry data")
    parser.add_argument("--update_drugbank_core",
                        default=False,
                        action="store_true",
                        help="Update DrugBank schema")
    #
    parser.add_argument(
        "--update_config_all",
        default=False,
        action="store_true",
        help="Update using configuration settings (e.g. DATABASE_NAMES_ALL)")
    parser.add_argument(
        "--update_config_deployed",
        default=False,
        action="store_true",
        help="Update using configuration settings (e.g. DATABASE_NAMES_DEPLOYED)")
    parser.add_argument(
        "--update_config_test",
        default=False,
        action="store_true",
        help="Update using configuration settings (e.g. DATABASE_NAMES_TEST)")
    #
    parser.add_argument("--config_path",
                        default=None,
                        help="Path to configuration options file")
    parser.add_argument("--config_name",
                        default=defaultConfigName,
                        help="Configuration section name")
    #
    parser.add_argument("--cache_path",
                        default=None,
                        help="Schema cache directory path")
    parser.add_argument(
        "--encoding_types",
        default=None,
        help="Schema encoding (rcsb|json|bson) (comma separated)")
    parser.add_argument(
        "--validation_levels",
        default=None,
        help="Schema validation level (full|min) (comma separated)")
    parser.add_argument("--compare_only",
                        default=False,
                        action="store_true",
                        help="Perform comparison with cached schema")
    #
    parser.add_argument("--debug",
                        default=False,
                        action="store_true",
                        help="Turn on verbose logging")
    parser.add_argument(
        "--mock",
        default=False,
        action="store_true",
        help="Use MOCK repository configuration for dependencies and testing")
    # parser.add_argument("--working_path", default=None, help="Working/alternative path for temporary and schema files")
    args = parser.parse_args()
    #
    debugFlag = args.debug
    if debugFlag:
        logger.setLevel(logging.DEBUG)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #                                       Configuration Details
    configPath = args.config_path
    configName = args.config_name
    cachePath = args.cache_path
    compareOnly = args.compare_only
    #
    encodingTypes = args.encoding_types.split(
        ",") if args.encoding_types else []
    validationLevels = args.validation_levels.split(
        ",") if args.validation_levels else []
    dataTypingList = ["ANY", "SQL"]

    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuation path %s (%s)", configPath,
                        configName)
        else:
            logger.error("Missing or access issue with config file %r",
                         configPath)
            exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb",
                                   "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath,
                           defaultSectionName=defaultConfigName,
                           mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)
    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s",
                     configPath, str(e))
        exit(1)
    #
    databaseNameList = []
    if args.update_chem_comp_ref:
        databaseNameList.append("chem_comp")

    if args.update_bird_chem_comp_ref:
        databaseNameList.append("bird_chem_comp")

    if args.update_chem_comp_core_ref:
        databaseNameList.append("chem_comp_core")

    if args.update_bird_chem_comp_core_ref:
        databaseNameList.append("bird_chem_comp_core")

    if args.update_bird_ref:
        databaseNameList.append("bird")

    if args.update_bird_family_ref:
        databaseNameList.append("bird_family")

    if args.update_pdbx:
        databaseNameList.append("pdbx")

    if args.update_pdbx_core:
        databaseNameList.append("pdbx_core")

    if args.update_pdbx_comp_model_core:
        databaseNameList.append("pdbx_comp_model_core")

    if args.update_repository_holdings:
        databaseNameList.append("repository_holdings")

    if args.update_entity_sequence_clusters:
        databaseNameList.append("sequence_clusters")

    if args.update_data_exchange:
        databaseNameList.append("data_exchange")

    if args.update_ihm_dev:
        databaseNameList.append("ihm_dev")

    if args.update_drugbank_core:
        databaseNameList.append("drugbank_core")

    if args.update_config_deployed:
        databaseNameList = cfgOb.getList(
            "DATABASE_NAMES_DEPLOYED",
            sectionName="database_catalog_configuration")
        dataTypingList = cfgOb.getList(
            "DATATYPING_DEPLOYED",
            sectionName="database_catalog_configuration")
        validationLevels = cfgOb.getList(
            "VALIDATION_LEVELS_DEPLOYED",
            sectionName="database_catalog_configuration")
        encodingTypes = cfgOb.getList(
            "ENCODING_TYPES_DEPLOYED",
            sectionName="database_catalog_configuration")

    if args.update_config_all:
        databaseNameList = cfgOb.getList(
            "DATABASE_NAMES_ALL", sectionName="database_catalog_configuration")
        dataTypingList = cfgOb.getList(
            "DATATYPING_ALL", sectionName="database_catalog_configuration")
        validationLevels = cfgOb.getList(
            "VALIDATION_LEVELS_ALL",
            sectionName="database_catalog_configuration")
        encodingTypes = cfgOb.getList(
            "ENCODING_TYPES_ALL", sectionName="database_catalog_configuration")

    if args.update_config_test:
        databaseNameList = cfgOb.getList(
            "DATABASE_NAMES_TEST",
            sectionName="database_catalog_configuration")
        dataTypingList = cfgOb.getList(
            "DATATYPING_TEST", sectionName="database_catalog_configuration")
        validationLevels = cfgOb.getList(
            "VALIDATION_LEVELS_TEST",
            sectionName="database_catalog_configuration")
        encodingTypes = cfgOb.getList(
            "ENCODING_TYPES_TEST",
            sectionName="database_catalog_configuration")
    #
    scnD = cfgOb.get("document_collection_names",
                     sectionName="document_helper_configuration")
    #
    databaseNameList = list(set(databaseNameList))
    logger.debug("Collections %s", list(scnD.items()))
    logger.debug("databaseNameList %s", databaseNameList)

    if compareOnly:
        schP = SchemaProvider(cfgOb, cachePath, useCache=True)
        difPathList = []
        for databaseName in databaseNameList:
            for dataTyping in dataTypingList:
                logger.debug("Building schema %s with types %s", databaseName,
                             dataTyping)
                pth = schP.schemaDefCompare(databaseName, dataTyping)
                if pth:
                    difPathList.append(pth)
        if difPathList:
            logger.info("Schema definition difference path list %r",
                        difPathList)
        difPathList = []
        for databaseName in databaseNameList:
            dD = schP.makeSchemaDef(databaseName,
                                    dataTyping="ANY",
                                    saveSchema=False)
            sD = SchemaDefAccess(dD)
            for cd in sD.getCollectionInfo():
                collectionName = cd["NAME"]
                for encodingType in encodingTypes:
                    if encodingType.lower() != "json":
                        continue
                    for level in validationLevels:
                        pth = schP.jsonSchemaCompare(databaseName,
                                                     collectionName,
                                                     encodingType, level)
                        if pth:
                            difPathList.append(pth)
        if difPathList:
            logger.info("JSON schema difference path list %r", difPathList)

    else:
        schP = SchemaProvider(cfgOb, cachePath, useCache=False)
        for databaseName in databaseNameList:
            for encodingType in encodingTypes:
                if encodingType == "rcsb":
                    for dataTyping in dataTypingList:
                        logger.info(
                            "Creating schema definition for content type %s data typing %s",
                            databaseName, dataTyping)
                        schP.makeSchemaDef(databaseName,
                                           dataTyping=dataTyping,
                                           saveSchema=True)
                else:
                    if databaseName in scnD:
                        for dD in scnD[databaseName]:
                            collectionName = dD["NAME"]
                            for validationLevel in validationLevels:
                                logger.info(
                                    "Creating %r schema for content type %s collection %s",
                                    encodingType, databaseName, collectionName)
                                schP.makeSchema(databaseName,
                                                collectionName,
                                                encodingType=encodingType,
                                                level=validationLevel,
                                                saveSchema=True)
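For reference, two representative invocations of this CLI; the console-script name schema_update_cli is an assumption, but the flags are exactly those defined above:

# Regenerate schemas for the configured test database set against mock data:
#   schema_update_cli --mock --update_config_test \
#       --config_path ./rcsb/db/config/exdb-config-example.yml \
#       --cache_path ./CACHE --encoding_types rcsb,json,bson \
#       --validation_levels full,min
#
# Compare freshly generated schemas against cached copies without rewriting:
#   schema_update_cli --compare_only --update_pdbx_core \
#       --config_path ./rcsb/db/config/exdb-config-example.yml \
#       --cache_path ./CACHE --encoding_types json --validation_levels full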
Example #16
class DictMethodRunnerTests(unittest.TestCase):
    def setUp(self):
        self.__export = True
        self.__numProc = 2
        self.__fileLimit = 200
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        configPath = os.path.join(mockTopPath, "config",
                                  "dbload-setup-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        #
        self.__testCaseList = [
            {
                "contentType": "pdbx_core",
                "mockLength": 50,
                "mergeContent": ["vrpt"]
            },
            {
                "contentType": "bird_chem_comp_core",
                "mockLength": 17,
                "mergeContent": None
            },
        ]
        #
        self.__modulePathMap = self.__cfgOb.get(
            "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def __runContentType(self, contentType, mockLength, mergeContent):
        """Read and process test fixture data files from the input content type."""
        try:
            dP = DictionaryApiProviderWrapper(self.__cfgOb,
                                              self.__cachePath,
                                              useCache=True)
            dictApi = dP.getApiByName(contentType)
            rP = DictMethodResourceProvider(self.__cfgOb,
                                            configName=self.__configName,
                                            cachePath=self.__cachePath,
                                            siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi,
                                   modulePathMap=self.__modulePathMap,
                                   resourceProvider=rP)
            locatorObjList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContent)
            containerList = self.__rpP.getContainerList(locatorObjList)
            #
            logger.debug("Length of locator list %d\n", len(locatorObjList))
            self.assertGreaterEqual(len(locatorObjList), mockLength)
            for container in containerList:
                cName = container.getName()
                #
                # if cName not in ["1B5F"]:
                #    continue
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output",
                                            cName + "-with-method.cif")
                    self.__mU.doExport(savePath, [container], fmt="mmcif")

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testMethodRunner(self):
        """Test method runner for multiple content types."""
        for tD in self.__testCaseList:
            self.__runContentType(tD["contentType"], tD["mockLength"],
                                  tD["mergeContent"])

    def testMethodRunnerSetup(self):
        """Test the setup methods for method runner class"""
        try:
            dP = DictionaryApiProviderWrapper(self.__cfgOb,
                                              self.__cachePath,
                                              useCache=True)
            dictApi = dP.getApiByName("pdbx")
            rP = DictMethodResourceProvider(self.__cfgOb,
                                            configName=self.__configName,
                                            cachePath=self.__cachePath,
                                            siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi,
                                   modulePathMap=self.__modulePathMap,
                                   resourceProvider=rP)
            ok = dmh is not None
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
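Both tests above follow the same recipe: obtain a dictionary API, build a resource provider, wrap both in a DictMethodRunner, and apply() it to each container. Condensed, assuming the cfgOb, configName, cachePath, modulePathMap, and rpP objects from setUp(); note that the DictionaryApiProviderWrapper signature varies across the examples in this document:

def applyDictionaryMethods(cfgOb, configName, cachePath, rpP, modulePathMap,
                           contentType, mergeContent=None):
    # Attach dictionary-method (computed) content to each data container.
    dP = DictionaryApiProviderWrapper(cfgOb, cachePath, useCache=True)
    dictApi = dP.getApiByName(contentType)
    rP = DictMethodResourceProvider(cfgOb, configName=configName,
                                    cachePath=cachePath)
    dmh = DictMethodRunner(dictApi, modulePathMap=modulePathMap,
                           resourceProvider=rP)
    locatorObjList = rpP.getLocatorObjList(contentType=contentType,
                                           mergeContentTypes=mergeContent)
    containerList = rpP.getContainerList(locatorObjList)
    for container in containerList:
        dmh.apply(container)
    return containerList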
Example #17
class ChemRefDataPrepValidateTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                         "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        #
        self.__configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig,
                                  defaultSectionName=self.__configName,
                                  mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testValidateFull(self):
        self.__validateChemRef("DrugBank", schemaLevel="full")

    def __validateChemRef(self, extResource, schemaLevel="full"):
        eCount = 0
        if extResource == "DrugBank":
            schemaName = "drugbank_core"
            collectionNames = ["drugbank_core"]
            user = self.__cfgOb.get("_DRUGBANK_AUTH_USERNAME",
                                    sectionName=self.__configName)
            pw = self.__cfgOb.get("_DRUGBANK_AUTH_PASSWORD",
                                  sectionName=self.__configName)
            # cacheDir = self.__cfgOb.get("DRUGBANK_CACHE_DIR", sectionName=self.__configName)
            dbP = DrugBankProvider(cachePath=self.__cachePath,
                                   useCache=True,
                                   username=user,
                                   password=pw)
            # idD = dbP.getMapping()
            # crExt = ChemRefExtractor(self.__cfgOb)
            # idD = crExt.getChemCompAccesionMapping(extResource)
            dList = dbP.getDocuments()
            logger.info("Validating %d Drugbank documents", len(dList))
            eCount = self.__validate(schemaName,
                                     collectionNames,
                                     dList,
                                     schemaLevel=schemaLevel)

        return eCount

    def __validate(self,
                   databaseName,
                   collectionNames,
                   dList,
                   schemaLevel="full"):

        eCount = 0
        for collectionName in collectionNames:
            _ = self.__schP.makeSchemaDef(databaseName,
                                          dataTyping="ANY",
                                          saveSchema=True)
            cD = self.__schP.makeSchema(databaseName,
                                        collectionName,
                                        encodingType="JSON",
                                        level=schemaLevel,
                                        saveSchema=True)
            # Raises exceptions for schema compliance.
            Draft4Validator.check_schema(cD)
            #
            valInfo = Draft4Validator(cD, format_checker=FormatChecker())
            for ii, dD in enumerate(dList):
                logger.debug("Database %s collection %s document %d",
                             databaseName, collectionName, ii)
                try:
                    cCount = 0
                    for error in sorted(valInfo.iter_errors(dD), key=str):
                        logger.info(
                            "database %s collection %s path %s error: %s",
                            databaseName, collectionName, error.path,
                            error.message)
                        logger.info(">>> failing object is %r", dD)
                        eCount += 1
                        cCount += 1
                    #
                    logger.debug("database %s collection %s count %d",
                                 databaseName, collectionName, cCount)
                except Exception as e:
                    logger.exception("Validation error %s", str(e))

        return eCount
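The __validate() method above is the standard jsonschema idiom: check the schema itself first, then iterate errors per document instead of raising on the first failure, so every violation gets reported. A minimal standalone version, assuming cD is a JSON schema dict and dList a list of candidate documents:

import logging

from jsonschema import Draft4Validator, FormatChecker

logger = logging.getLogger(__name__)

def countSchemaErrors(cD, dList):
    # Count (rather than raise on) validation errors across all documents.
    Draft4Validator.check_schema(cD)  # raises if the schema itself is invalid
    valInfo = Draft4Validator(cD, format_checker=FormatChecker())
    eCount = 0
    for ii, dD in enumerate(dList):
        for error in sorted(valInfo.iter_errors(dD), key=str):
            logger.info("document %d path %s error: %s", ii, error.path,
                        error.message)
            eCount += 1
    return eCount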
Example #18
class DataExchangeStatusLoaderTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(DataExchangeStatusLoaderTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        #
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath)
        # self.__cfgOb.dump()
        self.__resourceName = "MONGO_DB"
        self.__readBackCheck = True
        self.__numProc = 2
        self.__chunkSize = 10
        self.__documentLimit = 1000
        #
        # sample data set
        self.__updateId = "2018_23"
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testLoadExchangeStatus(self):
        """ Test case - load data exchange status objects.

        [data_exchange]
        DATABASE_NAME=data_exchange
        DATABASE_VERSION_STRING=v5
        COLLECTION_UPDATE_STATUS=rcsb_data_exchange_status
        COLLECTION_VERSION_STRING=v0_1

        """
        try:
            for ii in range(1, 100):
                collectionName = "my_collection_" + str(ii)
                dList = []
                desp = DataExchangeStatus()
                tS = desp.setStartTime()
                self.assertGreaterEqual(len(tS), 15)
                ok = desp.setObject("my_database", collectionName)
                self.assertTrue(ok)
                ok = desp.setStatus(updateId=None, successFlag="Y")
                self.assertTrue(ok)
                #
                tS = desp.setEndTime()
                self.assertGreaterEqual(len(tS), 15)
                dList.append(desp.getStatus())
                #
                self.assertEqual(len(dList), 1)
                logger.debug("Status record %r", dList[0])

                sectionName = "data_exchange_configuration"
                dl = DocumentLoader(
                    self.__cfgOb,
                    self.__cachePath,
                    self.__resourceName,
                    numProc=self.__numProc,
                    chunkSize=self.__chunkSize,
                    documentLimit=self.__documentLimit,
                    verbose=self.__verbose,
                    readBackCheck=self.__readBackCheck,
                )
                #
                databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
                # collectionVersion = self.__cfgOb.get('COLLECTION_VERSION_STRING', sectionName=sectionName)
                collectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=sectionName)
                if ii == 1:
                    loadType = "full"
                else:
                    loadType = "append"
                ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=dList, indexAttributeList=["update_id", "database_name", "object_name"], keyNames=None)
                self.assertTrue(ok)
                #

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
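
The loop above assembles each status record through a fixed sequence of calls. Condensed to just the DataExchangeStatus calls exercised in testLoadExchangeStatus() (no new API assumed):

desp = DataExchangeStatus()
desp.setStartTime()                              # stamp the start of the update
desp.setObject("my_database", "my_collection")   # database/collection being tracked
desp.setStatus(updateId=None, successFlag="Y")   # record the outcome
desp.setEndTime()                                # stamp the end of the update
statusDoc = desp.getStatus()                     # dict ready for DocumentLoader.load()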
Example #19
    def testReadYamlConfig(self):
        try:
            cfgOb = ConfigUtil(configPath=self.__inpPathConfigYaml,
                               configFormat="yaml",
                               mockTopPath=self.__mockTopPath)
            ok = cfgOb.appendConfig(self.__inpPathConfigAppendYaml,
                                    configFormat="yaml")
            self.assertTrue(ok)
            #
            sName = "DEFAULT"
            pathBird = cfgOb.getPath("BIRD_REPO_PATH", sectionName=sName)
            pathPdbx = cfgOb.getPath("PDBX_REPO_PATH", sectionName=sName)
            #
            self.assertEqual(
                pathBird, os.path.join(self.__mockTopPath, "MOCK_BIRD_REPO"))
            self.assertEqual(
                pathPdbx, os.path.join(self.__mockTopPath,
                                       "MOCK_PDBX_SANDBOX"))

            pathBird = cfgOb.get("BIRD_REPO_PATH", sectionName=sName)
            pathPdbx = cfgOb.get("PDBX_REPO_PATH", sectionName=sName)

            self.assertEqual(pathBird, "MOCK_BIRD_REPO")
            self.assertEqual(pathPdbx, "MOCK_PDBX_SANDBOX")
            sName = "Section1"
            #
            helperMethod = cfgOb.getHelper("DICT_METHOD_HELPER_MODULE",
                                           sectionName=sName)

            tv = helperMethod.echo("test_value")
            self.assertEqual(tv, "test_value")
            #
            tEnv = "TEST_ENV_VAR"
            tVal = "TEST_ENV_VAR_VALUE"
            os.environ[tEnv] = tVal
            eVal = cfgOb.getEnvValue("ENV_OPTION_A", sectionName=sName)
            self.assertEqual(tVal, eVal)

            ky = "42d13dfc9eb689e48c774aa5af8a7e15dbabcd5041939bef213eb37aed882fd6"
            os.environ["CONFIG_SUPPORT_TOKEN_ENV"] = ky
            #
            un = cfgOb.getSecret("SECRET_TEST_USERNAME",
                                 default=None,
                                 sectionName=sName,
                                 tokenName="CONFIG_SUPPORT_TOKEN")
            pw = cfgOb.getSecret("SECRET_TEST_PASSWORD",
                                 default=None,
                                 sectionName=sName,
                                 tokenName="CONFIG_SUPPORT_TOKEN")
            self.assertEqual(un, "testuser")
            self.assertEqual(pw, "testuserpassword")
            #
            un = cfgOb.get("_TEST_USERNAME",
                           default=None,
                           sectionName=sName,
                           tokenName="CONFIG_SUPPORT_TOKEN")
            pw = cfgOb.get("_TEST_PASSWORD",
                           default=None,
                           sectionName=sName,
                           tokenName="CONFIG_SUPPORT_TOKEN")
            self.assertEqual(un, "testuser")
            self.assertEqual(pw, "testuserpassword")
            #
            un = cfgOb.getSecret("_TEST_USERNAME",
                                 default=None,
                                 sectionName=sName,
                                 tokenName="CONFIG_SUPPORT_TOKEN")
            pw = cfgOb.getSecret("_TEST_PASSWORD",
                                 default=None,
                                 sectionName=sName,
                                 tokenName="CONFIG_SUPPORT_TOKEN")
            self.assertEqual(un, "testuser")
            self.assertEqual(pw, "testuserpassword")
            #
            sName = "Section2"
            un = cfgOb.getSecret("_TEST_USERNAME",
                                 default=None,
                                 sectionName=sName,
                                 tokenName="CONFIG_SUPPORT_TOKEN")
            pw = cfgOb.getSecret("_TEST_PASSWORD",
                                 default=None,
                                 sectionName=sName,
                                 tokenName="CONFIG_SUPPORT_TOKEN")
            self.assertEqual(un, "testuser")
            self.assertEqual(pw, "testuserpassword")
            # test fallback
            # CLEAR_TEXT_USERNAME: testuser2
            # CLEAR_TEXT_PASSWORD: changeme2
            un = cfgOb.get("_CLEAR_TEXT_USERNAME",
                           default=None,
                           sectionName=sName,
                           tokenName="CONFIG_SUPPORT_TOKEN")
            pw = cfgOb.get("_CLEAR_TEXT_PASSWORD",
                           default=None,
                           sectionName=sName,
                           tokenName="CONFIG_SUPPORT_TOKEN")
            self.assertEqual(un, "testuser2")
            self.assertEqual(pw, "changeme2")
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
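
One convention in the fragment above deserves a note: option names with a leading underscore are resolved using the token named by tokenName (whose value is read from the <tokenName>_ENV environment variable), and get() appears to fall back to the matching clear-text option when no encrypted value is stored. A condensed sketch of that fallback, reusing only calls from the test above (cfgOb and ky as defined there):

import os

os.environ["CONFIG_SUPPORT_TOKEN_ENV"] = ky  # token material, as in the test
un = cfgOb.get("_CLEAR_TEXT_USERNAME", default=None, sectionName="Section2",
               tokenName="CONFIG_SUPPORT_TOKEN")  # no secret stored -> clear-text value
assert un == "testuser2"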
Example #20
class PdbxLoaderFixture(unittest.TestCase):

    def __init__(self, methodName="runTest"):
        super(PdbxLoaderFixture, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        #
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
        # configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example-local.yml")
        # To Do: Investigate why GitUtil sometimes gives a divergence error when using 'DISCOVERY_MODE: remote', but not with 'local':
        #            stderr: 'fatal: Need to specify how to reconcile divergent branches.'
        #        The behavior isn't entirely predictable, since it happens sometimes but not all the time.
        #        To fully debug, more logging statements will need to be added to GitUtil, StashableBase, and StashUtil (in rcsb.utils.io).
        #        Alternatively, the error may be resolvable directly by specifying how to reconcile divergent branches in the git.Repo class.
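        #        An untested sketch of that knob, via GitPython's git.Repo (assumption: GitUtil drives GitPython underneath):
        #            repo = git.Repo(localRepoPath)
        #            repo.config_writer().set_value("pull", "rebase", "false").release()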
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        #
        self.__resourceName = "MONGO_DB"
        self.__failedFilePath = os.path.join(HERE, "test-output", "failed-list.txt")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__readBackCheck = True
        self.__numProc = 2
        self.__chunkSize = 10
        self.__fileLimit = 38
        self.__documentStyle = "rowwise_by_name_with_cardinality"
        #
        self.__birdChemCompCoreIdList = [
            "PRD_000010",
            "PRD_000060",
            "PRD_000220",
            "PRD_000882",
            "PRD_000154",
            "PRD_000877",
            "PRD_000198",
            "PRD_000009",
            "PRD_000979",
            "PRDCC_000010",
            "PRDCC_000220",
            "PRDCC_000882",
            "PRDCC_000154",
            "PRDCC_000198",
            "PRDCC_000009",
            "FAM_000010",
            "FAM_000210",
            "FAM_000220",
            "FAM_000001",
            "FAM_000391",
            "FAM_000093",
            "FAM_000084",
            "FAM_000016",
            "FAM_000336",
            "1G1",
            "2RT",
            "2XL",
            "2XN",
            "ATP",
            "BJA",
            "BM3",
            "CNC",
            "DAL",
            "DDZ",
            "DHA",
            "DSN",
            "GTP",
            "HKL",
            "NAC",
            "NAG",
            "NND",
            "PTR",
            "SEP",
            "SMJ",
            "STL",
            "UNK",
            "UNX",
            "UVL",
        ]
        #
        self.__pdbIdList = [
            "1ah1",
            "1b5f",
            "1bmv",
            "1c58",
            "1dsr",
            "1dul",
            "1kqe",
            "1o3q",
            "1sfo",
            "2hw3",
            "2hyv",
            "2osl",
            "2voo",
            "2wmg",
            "3ad7",
            "3hya",
            "3iyd",
            "3mbg",
            "3rer",
            "3vd8",
            "3vfj",
            "3x11",
            "3ztj",
            "4e2o",
            "4en8",
            "4mey",
            "5eu8",
            "5kds",
            "5tm0",
            "5vh4",
            "5vp2",
            "6fsz",
            "6lu7",
            "6nn7",
            "6q20",
            "6rfk",
            "6rku",
            "6yrq",
        ]
        self.__ldList = [
            {
                "databaseName": "bird_chem_comp_core",
                "collectionNameList": None,
                "loadType": "full",
                "mergeContentTypes": None,
                "validationLevel": "min",
                "inputIdCodeList": self.__birdChemCompCoreIdList
            },
            {
                "databaseName": "pdbx_core",
                "collectionNameList": None,
                "loadType": "full",
                "mergeContentTypes": ["vrpt"],
                "validationLevel": "min",
                "inputIdCodeList": self.__pdbIdList
            },
            {
                "databaseName": "pdbx_comp_model_core",
                "collectionNameList": None,
                "loadType": "full",
                "mergeContentTypes": None,
                "validationLevel": "min",
                "inputIdCodeList": None
            },
        ]
        #
        self.__modelFixture()
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 10 ** 6, unitS)
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def __modelFixture(self):
        fU = FileUtil()
        modelSourcePath = os.path.join(self.__mockTopPath, "AF")
        for iPath in glob.iglob(os.path.join(modelSourcePath, "*.cif.gz")):
            fn = os.path.basename(iPath)
            uId = fn.split("-")[1]
            h3 = uId[-2:]
            h2 = uId[-4:-2]
            h1 = uId[-6:-4]
            oPath = os.path.join(self.__cachePath, "computed-models", h1, h2, h3, fn)
            fU.put(iPath, oPath)

    def testPdbxLoader(self):
        #
        for ld in self.__ldList:
            self.__pdbxLoaderWrapper(**ld)

    def __pdbxLoaderWrapper(self, **kwargs):
        """Wrapper for the PDBx loader module"""
        try:
            logger.info("Loading %s", kwargs["databaseName"])
            mw = PdbxLoader(
                self.__cfgOb,
                cachePath=self.__cachePath,
                resourceName=self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                fileLimit=kwargs.get("fileLimit", self.__fileLimit),
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
                maxStepLength=2000,
                useSchemaCache=True,
                rebuildSchemaFlag=False,
            )
            ok = mw.load(
                kwargs["databaseName"],
                collectionLoadList=kwargs["collectionNameList"],
                loadType=kwargs["loadType"],
                inputPathList=None,
                inputIdCodeList=kwargs["inputIdCodeList"],
                styleType=self.__documentStyle,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=self.__failedFilePath,
                saveInputFileListPath=None,
                pruneDocumentSize=None,
                logSize=False,
                validationLevel=kwargs["validationLevel"],
                mergeContentTypes=kwargs["mergeContentTypes"],
                useNameFlag=False,
                providerTypeExclude=self.__excludeType,
                restoreUseGit=True,
                restoreUseStash=False,
            )
            self.assertTrue(ok)
            ok = self.__loadStatus(mw.getLoadStatus())
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __loadStatus(self, statusList):
        sectionName = "data_exchange_configuration"
        dl = DocumentLoader(
            self.__cfgOb,
            self.__cachePath,
            resourceName=self.__resourceName,
            numProc=self.__numProc,
            chunkSize=self.__chunkSize,
            documentLimit=None,
            verbose=self.__verbose,
            readBackCheck=self.__readBackCheck,
        )
        #
        databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
        collectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=sectionName)
        ok = dl.load(databaseName, collectionName, loadType="append", documentList=statusList, indexAttributeList=["update_id", "database_name", "object_name"], keyNames=None)
        return ok
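
For orientation, the __modelFixture() method above shards computed-model files into a three-level directory tree keyed on the last six characters of the accession embedded in the file name. A worked example, assuming the AlphaFold "AF-<accession>-F1-model_vN.cif.gz" naming convention (the file name below is illustrative):

fn = "AF-Q9Y223-F1-model_v4.cif.gz"            # illustrative model file name
uId = fn.split("-")[1]                          # "Q9Y223"
h1, h2, h3 = uId[-6:-4], uId[-4:-2], uId[-2:]   # "Q9", "Y2", "23"
# destination: <cachePath>/computed-models/Q9/Y2/23/AF-Q9Y223-F1-model_v4.cif.gz
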
class EntityPolymerExtractorTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(EntityPolymerExtractorTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config",
                                  "dbload-setup-example.yml")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__workPath = os.path.join(HERE, "test-output")
        self.__taxonomyDataPath = os.path.join(
            self.__cachePath,
            self.__cfgOb.get("NCBI_TAXONOMY_CACHE_DIR",
                             sectionName=configName))
        #
        self.__cacheKwargs = {"fmt": "json", "indent": 3}
        self.__exdbCacheDirPath = os.path.join(
            self.__cachePath,
            self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
        #
        self.__mU = MarshalUtil()
        self.__entryLimitTest = 18
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def testExtractEntityPolymers(self):
        """Test case - extract entity polymer info"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb,
                                         exdbDirPath=self.__exdbCacheDirPath,
                                         useCache=False,
                                         cacheKwargs=self.__cacheKwargs,
                                         entryLimit=self.__entryLimitTest)
            eCount = epe.getEntryCount()
            self.assertGreaterEqual(eCount, self.__entryLimitTest)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testAccessEntityPolymerFeatures(self):
        """Test case - access cached entity polymer info from test cache"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb,
                                         exdbDirPath=self.__exdbCacheDirPath,
                                         useCache=False,
                                         cacheKwargs=self.__cacheKwargs)
            eCount = epe.getEntryCount()
            logger.info("Entry count %d", eCount)
            self.assertGreaterEqual(eCount, self.__entryLimitTest)
            #
            unpL = epe.getRefSeqAccessions("UNP")
            logger.info("Ref seq count %d", len(unpL))
            self.assertGreaterEqual(len(unpL), 1)
            #
            for entryId in ["3RER"]:
                for entityId in ["1"]:
                    uL = epe.getEntityRefSeqAccessions("UNP", entryId,
                                                       entityId)
                    logger.info("UNP for %s %s %r", entryId, entityId, uL)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testTaxonomyReadCache(self):
        """Test case - access cached entity polymer info from test cache"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb,
                                         exdbDirPath=self.__exdbCacheDirPath,
                                         useCache=False,
                                         cacheKwargs=self.__cacheKwargs)
            logger.info("Cache entry count %d", epe.getEntryCount())
            #
            obsL = []
            tD = epe.getOrigTaxons()
            logger.info("Taxons %d", len(tD))

            tU = TaxonomyProvider(taxDirPath=self.__taxonomyDataPath,
                                  useCache=True)
            #
            for entryId, taxIdL in tD.items():
                for entityId, iTaxId in taxIdL:
                    # logger.info("entryId %r entityId %r taxId %r" % (entryId, entityId, taxId))
                    mTaxId = tU.getMergedTaxId(iTaxId)
                    if iTaxId != mTaxId:
                        obsL.append({
                            "entryId": entryId,
                            "entityId": entityId,
                            "taxId": iTaxId,
                            "replaceTaxId": mTaxId
                        })
            logger.info("Obsolete list length %d", len(obsL))
            self.__mU.doExport(os.path.join(self.__workPath,
                                            "obsolete-taxons.json"),
                               obsL,
                               fmt="json",
                               indent=3)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testAccessEntityPolymerReadCache(self):
        """Test case - access cached entity polymer info from test cache"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb,
                                         exdbDirPath=self.__exdbCacheDirPath,
                                         useCache=False,
                                         cacheKwargs=self.__cacheKwargs)
            logger.info("Cache entry count %d", epe.getEntryCount())
            cD = epe.countRefSeqAccessions("UNP")
            self.assertGreaterEqual(len(cD), 2)
            logger.info("UNP reference sequences per entity %r",
                        dict(sorted(cD.items())))
            logger.info("Reference sequences per entity %r",
                        dict(sorted(epe.countRefSeqAccessionAny().items())))
            logger.info("Reference sequences per ref db %r",
                        dict(sorted(epe.countRefSeqAccessionDbType().items())))
            #
            ok = epe.checkRefSeqAlignRange("UNP")
            self.assertTrue(ok)
            unpL = epe.getRefSeqAccessions("UNP")
            logger.info("Unique UNP reference sequences %d", len(unpL))
            self.assertTrue(ok)
            tD = epe.getUniqueTaxons()
            logger.info("Unique taxons %d", len(tD))
            tD = epe.countRefSeqAccessionByTaxon("UNP")
            logger.info("Unique taxons %d", len(tD))
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
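
The obsolete-taxon scan in testTaxonomyReadCache() above reduces to one lookup per assignment: getMergedTaxId() yields the current identifier for a merged taxon and, by implication of the comparison used there, the input identifier when nothing has changed. A minimal sketch, assuming a TaxonomyProvider instance tU built as in setUp() (the taxon id is illustrative):

iTaxId = 6239                        # illustrative original taxonomy assignment
mTaxId = tU.getMergedTaxId(iTaxId)   # current id; equals iTaxId unless the taxon was merged
if iTaxId != mTaxId:
    print("taxId %d superseded by %d" % (iTaxId, mTaxId))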
Example #22
class ReferenceSequenceUtilsTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(ReferenceSequenceUtilsTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config",
                                  "dbload-setup-example.yml")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        #
        self.__cacheKwargs = {"fmt": "json", "indent": 3}
        self.__exdbCacheDirPath = os.path.join(
            self.__cachePath,
            self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
        #
        # Reference sequence test data cache -
        #
        self.__refDbCachePath = os.path.join(HERE, "test-output",
                                             "unp-data-test-cache.json")
        self.__cacheKwargs = {"fmt": "json", "indent": 3}
        self.__useCache = False
        self.__fetchLimit = None
        #
        # Entity polymer extracted data ...
        #
        self.__entryLimit = 500
        #
        self.__mU = MarshalUtil()
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def testFetchUnp(self):
        """Test case - extract entity polymer info -"""
        try:
            refDbName = "UNP"
            rsu = ReferenceSequenceUtils(
                self.__cfgOb,
                refDbName,
                exdbDirPath=self.__exdbCacheDirPath,
                cacheKwargs=self.__cacheKwargs,
                useCache=self.__useCache,
                entryLimit=self.__entryLimit,
                fetchLimit=self.__fetchLimit,
            )
            numPrimary, numSecondary, numNone = rsu.getReferenceAccessionAlignSummary()
            self.assertGreaterEqual(numPrimary, 70)
            logger.info("For %r matched primary:  %d secondary: %d none %d",
                        refDbName, numPrimary, numSecondary, numNone)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

class SchemaDataPrepValidateTests(unittest.TestCase):
    def setUp(self):
        self.__numProc = 2
        # self.__fileLimit = 200
        self.__fileLimit = None
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example-ihm.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=self.__configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)

        #self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=False, rebuildFlag=True)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath)
        #
        self.__birdRepoPath = self.__cfgOb.getPath("BIRD_REPO_PATH", sectionName=configName)
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__verbose = True
        #
        self.__modulePathMap = self.__cfgOb.get("DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        self.__testDirPath = os.path.join(HERE, "test-output", "pdbx-files")
        self.__testIhmDirPath = os.path.join(HERE, "test-output", "ihm-files")
        self.__export = True
        #
        #self.__extraOpts = None
        # The following for extended parent/child info -
        self.__extraOpts = 'addParentRefs|addPrimaryKey'
        #
        self.__alldatabaseNameD = {
            "ihm_dev": ["ihm_dev"],
            "pdbx": ["pdbx", "pdbx_ext"],
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird": ["bird"],
            "bird_family": ["family"],
            "chem_comp": ["chem_comp"],
            "bird_chem_comp": ["bird_chem_comp"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }

        self.__databaseNameD = {
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }
        self.__mergeContentTypeD = {"pdbx_core": ["vrpt"]}
        # self.__databaseNameD = {"chem_comp_core": ["chem_comp_core"], "bird_chem_comp_core": ["bird_chem_comp_core"]}
        # self.__databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_instance_validation"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_monomer"]}
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testValidateOptsRepo(self):
        # schemaLevel = "min"

        schemaLevel = "full"
        inputPathList = None
        eCount = self.__testValidateOpts(databaseNameD=self.__databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        self.assertLessEqual(eCount, 1)

    @unittest.skip("Disable troubleshooting test")
    def testValidateOptsList(self):
        schemaLevel = "min"
        inputPathList = self.__mU.doImport(os.path.join(HERE, "test-output", "failed-path.list"), "list")
        # inputPathList = glob.glob(self.__testDirPath + "/*.cif")
        if not inputPathList:
            return True
        databaseNameD = {"pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"]}
        for ii, subList in enumerate(chunkList(inputPathList[::-1], 40)):
            if ii < 5:
                continue
            eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=subList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
            logger.info("Chunk %d total validation errors schema level %s : %d", ii, schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)

    #@unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmRepo(self):
        schemaLevel = "min"
        inputPathList = None
        self.__export = True

        databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)
        #

    #@unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmList(self):
        #schemaLevel = "full"
        schemaLevel = "min"

        inputPathList = glob.glob(self.__testIhmDirPath + "/*.cif")
        if not inputPathList:
            return True
        #databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)
        #

    def __testValidateOpts(self, databaseNameD, inputPathList=None, schemaLevel="full", mergeContentTypeD=None):
        #
        eCount = 0
        for databaseName in databaseNameD:
            mergeContentTypes = mergeContentTypeD.get(databaseName) if mergeContentTypeD else None
            _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
            pthList = inputPathList if inputPathList else self.__rpP.getLocatorObjList(databaseName, mergeContentTypes=mergeContentTypes)
            for collectionName in databaseNameD[databaseName]:
                cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True, extraOpts=self.__extraOpts)
                #
                dL, cnL = self.__testPrepDocumentsFromContainers(
                    pthList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=mergeContentTypes
                )
                # Raises an exception if the schema itself is not compliant.
                try:
                    Draft4Validator.check_schema(cD)
                except Exception as e:
                    logger.error("%s %s schema validation fails with %s", databaseName, collectionName, str(e))
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                logger.info("Validating %d documents from %s %s", len(dL), databaseName, collectionName)
                for ii, dD in enumerate(dL):
                    logger.debug("Schema %s collection %s document %d", databaseName, collectionName, ii)
                    try:
                        cCount = 0
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info("schema %s collection %s (%s) path %s error: %s", databaseName, collectionName, cnL[ii], error.path, error.message)
                            logger.debug("Failing document %d : %r", ii, list(dD.items()))
                            eCount += 1
                            cCount += 1
                        if cCount > 0:
                            logger.info("schema %s collection %s container %s error count %d", databaseName, collectionName, cnL[ii], cCount)
                    except Exception as e:
                        logger.exception("Validation processing error %s", str(e))

        return eCount

    def __testPrepDocumentsFromContainers(self, inputPathList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=None):
        """Test case -  create loadable PDBx data from repository files
        """
        try:

            sd, _, _, _ = self.__schP.getSchemaInfo(databaseName)
            #
            dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=False)
            dictApi = dP.getApiByName(databaseName)
            rP = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP)
            #
            dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=self.__fTypeRow)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose)
            containerList = self.__rpP.getContainerList(inputPathList)
            for container in containerList:
                cName = container.getName()
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output", cName + "-with-method.cif")
                    self.__mU.doExport(savePath, [container], fmt="mmcif")
            #
            tableIdExcludeList = sd.getCollectionExcluded(collectionName)
            tableIdIncludeList = sd.getCollectionSelected(collectionName)
            sliceFilter = sd.getCollectionSliceFilter(collectionName)
            sdp.setSchemaIdExcludeList(tableIdExcludeList)
            sdp.setSchemaIdIncludeList(tableIdIncludeList)
            #
            docList, containerNameList, _ = sdp.processDocuments(
                containerList, styleType=styleType, filterType=self.__fTypeRow, dataSelectors=["PUBLIC_RELEASE"], sliceFilter=sliceFilter, collectionName=collectionName
            )

            docList = sdp.addDocumentPrivateAttributes(docList, collectionName)
            docList = sdp.addDocumentSubCategoryAggregates(docList, collectionName)
            #
            mergeS = "-".join(mergeContentTypes) if mergeContentTypes else ""
            if self.__export and docList:
                # for ii, doc in enumerate(docList[:1]):
                for ii, doc in enumerate(docList):
                    cn = containerNameList[ii]
                    fp = os.path.join(HERE, "test-output", "prep-%s-%s-%s-%s.json" % (cn, databaseName, collectionName, mergeS))
                    self.__mU.doExport(fp, [doc], fmt="json", indent=3)
                    logger.debug("Exported %r", fp)
            #
            return docList, containerNameList

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
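
The prep helper above applies a fixed ordering that is easy to lose in the detail. Reduced to its essential calls, all taken verbatim from __testPrepDocumentsFromContainers():

for container in containerList:
    dmh.apply(container)                      # run dictionary-method helpers on each container
docList, containerNameList, _ = sdp.processDocuments(
    containerList, styleType=styleType, filterType=self.__fTypeRow,
    dataSelectors=["PUBLIC_RELEASE"], sliceFilter=sliceFilter, collectionName=collectionName
)
docList = sdp.addDocumentPrivateAttributes(docList, collectionName)      # add private document attributes
docList = sdp.addDocumentSubCategoryAggregates(docList, collectionName)  # add sub-category aggregates
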
Example #24
class PdbxLoaderTests(unittest.TestCase):
    loadLocal = False
    loadModels = True

    def __init__(self, methodName="runTest"):
        super(PdbxLoaderTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        #
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)
        #
        self.__resourceName = "MONGO_DB"
        self.__failedFilePath = os.path.join(HERE, "test-output",
                                             "failed-list.txt")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__readBackCheck = True
        self.__numProc = 2
        self.__chunkSize = 10
        self.__fileLimit = None
        self.__documentStyle = "rowwise_by_name_with_cardinality"
        self.__ldList = [
            # {"databaseName": "chem_comp_core", "collectionNameList": None, "loadType": "full", "mergeContentTypes": None, "validationLevel": "min"},
            {
                "databaseName": "bird_chem_comp_core",
                "collectionNameList": None,
                "loadType": "full",
                "mergeContentTypes": None,
                "validationLevel": "full",
                "updateSchemaOnReplace": False,
                "status": True,
            },
            {
                "databaseName": "bird_chem_comp_core",
                "collectionNameList": None,
                "loadType": "replace",
                "mergeContentTypes": None,
                "validationLevel": "full",
                "updateSchemaOnReplace": True,
                "status": True,
            },
            {
                "databaseName": "pdbx_core",
                "collectionNameList": None,
                "loadType": "full",
                "mergeContentTypes": ["vrpt"],
                "validationLevel": "full",
                "updateSchemaOnReplace": False,
                "status": True,
            },
        ]
        self.__ldModelList = [
            {
                "databaseName": "pdbx_comp_model_core",
                "collectionNameList": None,
                "loadType": "full",
                "mergeContentTypes": None,
                "validationLevel": "full",
                "updateSchemaOnReplace": False,
                "status": True,
            },
        ]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 1.0e6,
                    unitS)
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", self.id(),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def __modelFixture(self):
        fU = FileUtil()
        modelSourcePath = os.path.join(self.__mockTopPath, "AF")
        for iPath in glob.iglob(os.path.join(modelSourcePath, "*.cif.gz")):
            fn = os.path.basename(iPath)
            uId = fn.split("-")[1]
            h3 = uId[-2:]
            h2 = uId[-4:-2]
            h1 = uId[-6:-4]
            oPath = os.path.join(self.__cachePath, "computed-models", h1, h2,
                                 h3, fn)
            fU.put(iPath, oPath)

    @unittest.skipUnless(loadLocal, "Skip local load test")
    def testPdbxLoader(self):
        for ld in self.__ldList:
            self.__pdbxLoaderWrapper(**ld)

    @unittest.skipUnless(loadModels, "Skip model load test")
    def testPdbxCompModelLoader(self):
        self.__modelFixture()  # Comment out for manual testing
        for ld in self.__ldModelList:
            self.__pdbxLoaderWrapper(**ld)

    def __pdbxLoaderWrapper(self, **kwargs):
        """Wrapper for PDBx loader module"""
        try:
            logger.info("Loading %s", kwargs["databaseName"])
            mw = PdbxLoader(
                self.__cfgOb,
                cachePath=self.__cachePath,
                resourceName=self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                fileLimit=None,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
                maxStepLength=2000,
                useSchemaCache=True,
                # rebuildSchemaFlag=False,  # This doesn't work for testing, I think because it's probably copying old schema files from remote repo and using those
                rebuildSchemaFlag=True,
            )
            ok = mw.load(
                kwargs["databaseName"],
                collectionLoadList=kwargs["collectionNameList"],
                loadType=kwargs["loadType"],
                inputPathList=None,
                inputIdCodeList=None,
                styleType=self.__documentStyle,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=self.__failedFilePath,
                saveInputFileListPath=None,
                pruneDocumentSize=None,
                logSize=False,
                validationLevel=kwargs["validationLevel"],
                mergeContentTypes=kwargs["mergeContentTypes"],
                useNameFlag=False,
                updateSchemaOnReplace=kwargs["updateSchemaOnReplace"],
                restoreUseStash=False,
                restoreUseGit=True,
                providerTypeExclude=self.__excludeType,
            )
            self.assertEqual(ok, kwargs["status"])
            ok = self.__loadStatus(mw.getLoadStatus())
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __loadStatus(self, statusList):
        sectionName = "data_exchange_configuration"
        dl = DocumentLoader(
            self.__cfgOb,
            self.__cachePath,
            resourceName=self.__resourceName,
            numProc=self.__numProc,
            chunkSize=self.__chunkSize,
            documentLimit=None,
            verbose=self.__verbose,
            readBackCheck=self.__readBackCheck,
        )
        #
        databaseName = self.__cfgOb.get("DATABASE_NAME",
                                        sectionName=sectionName)
        collectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS",
                                          sectionName=sectionName)
        ok = dl.load(
            databaseName,
            collectionName,
            loadType="append",
            documentList=statusList,
            indexAttributeList=["update_id", "database_name", "object_name"],
            keyNames=None)
        return ok