Example #1
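# Loader CLI: parses load options and populates MongoDB collections
# (chem_comp, bird, pdbx, pdbx_core, ihm_dev, ...) using PdbxLoader.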
def main():
    parser = argparse.ArgumentParser()
    #
    defaultConfigName = "site_info_configuration"
    #
    parser.add_argument("--full",
                        default=False,
                        action="store_true",
                        help="Fresh full load in a new tables/collections")
    parser.add_argument(
        "--replace",
        default=False,
        action="store_true",
        help="Load with replacement in an existing table/collection (default)")
    #
    parser.add_argument(
        "--load_chem_comp_ref",
        default=False,
        action="store_true",
        help="Load Chemical Component reference definitions (public subset)")
    parser.add_argument(
        "--load_chem_comp_core_ref",
        default=False,
        action="store_true",
        help=
        "Load Chemical Component Core reference definitions (public subset)")
    parser.add_argument(
        "--load_bird_chem_comp_ref",
        default=False,
        action="store_true",
        help=
        "Load Bird Chemical Component reference definitions (public subset)")
    parser.add_argument(
        "--load_bird_chem_comp_core_ref",
        default=False,
        action="store_true",
        help=
        "Load Bird Chemical Component Core reference definitions (public subset)"
    )
    parser.add_argument("--load_bird_ref",
                        default=False,
                        action="store_true",
                        help="Load Bird reference definitions (public subset)")
    parser.add_argument(
        "--load_bird_family_ref",
        default=False,
        action="store_true",
        help="Load Bird Family reference definitions (public subset)")
    parser.add_argument("--load_entry_data",
                        default=False,
                        action="store_true",
                        help="Load PDBx entry data (current released subset)")
    parser.add_argument(
        "--load_pdbx_core",
        default=False,
        action="store_true",
        help="Load all PDBx core collections (current released subset)")
    parser.add_argument(
        "--load_pdbx_core_merge",
        default=False,
        action="store_true",
        help=
        "Load all PDBx core collections with merged content (current released subset)"
    )
    #
    parser.add_argument("--load_pdbx_core_entry",
                        default=False,
                        action="store_true",
                        help="Load PDBx core entry (current released subset)")
    parser.add_argument("--load_pdbx_core_entity",
                        default=False,
                        action="store_true",
                        help="Load PDBx core entity (current released subset)")
    parser.add_argument(
        "--load_pdbx_core_entity_monomer",
        default=False,
        action="store_true",
        help="Load PDBx core entity monomer (current released subset)")
    parser.add_argument(
        "--load_pdbx_core_assembly",
        default=False,
        action="store_true",
        help="Load PDBx core assembly (current released subset)")
    parser.add_argument(
        "--load_ihm_dev",
        default=False,
        action="store_true",
        help="Load I/HM DEV model data (current released subset)")
    #
    parser.add_argument("--config_path",
                        default=None,
                        help="Path to configuration options file")
    parser.add_argument("--config_name",
                        default=defaultConfigName,
                        help="Configuration section name")

    parser.add_argument("--db_type",
                        default="mongo",
                        help="Database server type (default=mongo)")

    parser.add_argument(
        "--document_style",
        default="rowwise_by_name_with_cardinality",
        help=
        "Document organization (rowwise_by_name_with_cardinality|rowwise_by_name|columnwise_by_name|rowwise_by_id|rowwise_no_name",
    )
    parser.add_argument("--read_back_check",
                        default=False,
                        action="store_true",
                        help="Perform read back check on all documents")
    parser.add_argument("--schema_level",
                        default=None,
                        help="Schema validation level (full|min default=None)")
    #
    parser.add_argument(
        "--load_file_list_path",
        default=None,
        help=
        "Input file containing load file path list (override automatic repository scan)"
    )
    parser.add_argument(
        "--fail_file_list_path",
        default=None,
        help="Output file containing file paths that fail to load")
    parser.add_argument(
        "--save_file_list_path",
        default=None,
        help="Save repo file paths from automatic file system scan in this path"
    )

    parser.add_argument("--num_proc",
                        default=2,
                        help="Number of processes to execute (default=2)")
    parser.add_argument("--chunk_size",
                        default=10,
                        help="Number of files loaded per process")
    parser.add_argument("--file_limit",
                        default=None,
                        help="Load file limit for testing")
    parser.add_argument("--prune_document_size",
                        default=None,
                        help="Prune large documents to this size limit (MB)")
    parser.add_argument("--debug",
                        default=False,
                        action="store_true",
                        help="Turn on verbose logging")
    parser.add_argument("--mock",
                        default=False,
                        action="store_true",
                        help="Use MOCK repository configuration for testing")
    parser.add_argument("--cache_path",
                        default=None,
                        help="Cache path for resource files")
    parser.add_argument("--rebuild_cache",
                        default=False,
                        action="store_true",
                        help="Rebuild cached resource files")
    parser.add_argument("--rebuild_schema",
                        default=False,
                        action="store_true",
                        help="Rebuild schema on-the-fly if not cached")
    parser.add_argument("--vrpt_repo_path",
                        default=None,
                        help="Path to validation report repository")
    args = parser.parse_args()
    #
    debugFlag = args.debug
    if debugFlag:
        logger.setLevel(logging.DEBUG)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #                                       Configuration Details
    configPath = args.config_path
    configName = args.config_name
    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuation path %s (%s)", configPath,
                        configName)
        else:
            logger.error("Missing or access issue with config file %r",
                         configPath)
            exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb",
                                   "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath,
                           defaultSectionName=defaultConfigName,
                           mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)

        #
        if args.vrpt_repo_path:
            vrptPath = args.vrpt_repo_path
            if not os.access(vrptPath, os.R_OK):
                logger.error("Unreadable validation report repository path %r",
                             vrptPath)
            envName = cfgOb.get("VRPT_REPO_PATH_ENV", sectionName=configName)
            os.environ[envName] = vrptPath
            logger.info("Using alternate validation report path %s",
                        os.getenv(envName))

    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s",
                     configPath, str(e))
        exit(1)

    #
    try:
        readBackCheck = args.read_back_check
        numProc = int(args.num_proc)
        chunkSize = int(args.chunk_size)
        fileLimit = int(args.file_limit) if args.file_limit else None
        failedFilePath = args.fail_file_list_path
        fPath = args.load_file_list_path
        schemaLevel = args.schema_level if args.schema_level in [
            "min", "full", "minimum"
        ] else None
        loadType = "full" if args.full else "replace"
        loadType = "replace" if args.replace else "full"
        saveInputFileListPath = args.save_file_list_path
        pruneDocumentSize = float(
            args.prune_document_size) if args.prune_document_size else None
        cachePath = args.cache_path if args.cache_path else "."
        cachePath = os.path.abspath(cachePath)
        rebuildCache = args.rebuild_cache if args.rebuild_cache else False
        rebuildSchemaFlag = args.rebuild_schema if args.rebuild_schema else False
        if args.document_style not in [
                "rowwise_by_name", "rowwise_by_name_with_cardinality",
                "columnwise_by_name", "rowwise_by_id", "rowwise_no_name"
        ]:
            logger.error("Unsupported document style %s", args.document_style)
        if args.db_type != "mongo":
            logger.error("Unsupported database server type %s", args.db_type)
    except Exception as e:
        logger.exception("Argument processing problem %s", str(e))
        parser.print_help(sys.stderr)
        exit(1)

    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #  Rebuild or check resource cache
    okS = True
    ok = buildResourceCache(cfgOb,
                            configName,
                            cachePath,
                            rebuildCache=rebuildCache)
    if not ok:
        logger.error("Cache rebuild or check failure (rebuild %r) %r",
                     rebuildCache, cachePath)
        exit(1)

    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    # Read any input path lists -
    #
    inputPathList = None
    if fPath:
        mu = MarshalUtil(workPath=cachePath)
        inputPathList = mu.doImport(fPath, fmt="list")
        if not inputPathList:
            logger.error("Missing or empty input file path list %s", fPath)
            exit(1)
    #
    ##
    if args.db_type == "mongo":
        mw = PdbxLoader(
            cfgOb,
            cachePath,
            resourceName="MONGO_DB",
            numProc=numProc,
            chunkSize=chunkSize,
            fileLimit=fileLimit,
            verbose=debugFlag,
            readBackCheck=readBackCheck,
            rebuildSchemaFlag=rebuildSchemaFlag,
        )
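        # Each option below follows the same pattern: run mw.load() for the
        # selected content type, then record the load status for bookkeeping.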

        if args.load_chem_comp_ref:
            ok = mw.load(
                "chem_comp",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_chem_comp_core_ref:
            ok = mw.load(
                "chem_comp_core",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_bird_chem_comp_ref:
            ok = mw.load(
                "bird_chem_comp",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_bird_chem_comp_core_ref:
            ok = mw.load(
                "bird_chem_comp_core",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_bird_ref:
            ok = mw.load(
                "bird",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_bird_family_ref:
            ok = mw.load(
                "bird_family",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["BIRD_FAMILY_PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_entry_data:
            ok = mw.load(
                "pdbx",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_pdbx_core:
            ok = mw.load(
                "pdbx_core",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)
        #
        if args.load_pdbx_core_merge:
            ok = mw.load(
                "pdbx_core",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
                mergeContentTypes=["vrpt"],
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)
        #
        if args.load_pdbx_core_entity:
            ok = mw.load(
                "pdbx_core",
                collectionLoadList=["pdbx_core_entity"],
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)
        #
        if args.load_pdbx_core_entity_monomer:
            ok = mw.load(
                "pdbx_core",
                collectionLoadList=["pdbx_core_entity_monomer"],
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)
        #
        if args.load_pdbx_core_entry:
            ok = mw.load(
                "pdbx_core",
                collectionLoadList=["pdbx_core_entry"],
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_pdbx_core_assembly:
            ok = mw.load(
                "pdbx_core",
                collectionLoadList=["pdbx_core_assembly"],
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.load_ihm_dev:
            ok = mw.load(
                "ihm_dev",
                loadType=loadType,
                inputPathList=inputPathList,
                styleType=args.document_style,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=failedFilePath,
                saveInputFileListPath=saveInputFileListPath,
                pruneDocumentSize=pruneDocumentSize,
                validationLevel=schemaLevel,
            )
            okS = loadStatus(mw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)
        #
        logger.info("Operation completed with status %r " % ok and okS)
Example #2
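# ETL CLI: runs the entity sequence cluster and repository holdings ETL
# workers against MongoDB.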
def main():
    parser = argparse.ArgumentParser()
    #
    defaultConfigName = "site_info_configuration"
    #
    parser.add_argument(
        "--full",
        default=True,
        action="store_true",
        help="Fresh full load in a new tables/collections (Default)")
    #
    parser.add_argument("--etl_entity_sequence_clusters",
                        default=False,
                        action="store_true",
                        help="ETL entity sequence clusters")
    parser.add_argument("--etl_repository_holdings",
                        default=False,
                        action="store_true",
                        help="ETL repository holdings")
    # parser.add_argument("--etl_chemref", default=False, action="store_true", help="ETL integrated chemical reference data")
    # parser.add_argument("--etl_tree_node_lists", default=False, action='store_true', help="ETL tree node lists")

    parser.add_argument(
        "--data_set_id",
        default=None,
        help="Data set identifier (default= 2018_14 for current week)")
    #
    parser.add_argument(
        "--sequence_cluster_data_path",
        default=None,
        help="Sequence cluster data path (default set by configuration")
    parser.add_argument(
        "--sandbox_data_path",
        default=None,
        help="Date exchange sandboxPath data path (default set by configuration"
    )

    #
    parser.add_argument("--config_path",
                        default=None,
                        help="Path to configuration options file")
    parser.add_argument("--config_name",
                        default=defaultConfigName,
                        help="Configuration section name")

    parser.add_argument("--db_type",
                        default="mongo",
                        help="Database server type (default=mongo)")

    # parser.add_argument("--document_style", default="rowwise_by_name_with_cardinality",
    #                    help="Document organization (rowwise_by_name_with_cardinality|rowwise_by_name|columnwise_by_name|rowwise_by_id|rowwise_no_name")
    parser.add_argument("--read_back_check",
                        default=False,
                        action="store_true",
                        help="Perform read back check on all documents")
    #
    parser.add_argument("--num_proc",
                        default=2,
                        help="Number of processes to execute (default=2)")
    parser.add_argument("--chunk_size",
                        default=10,
                        help="Number of files loaded per process")
    parser.add_argument("--document_limit",
                        default=None,
                        help="Load document limit for testing")
    parser.add_argument("--prune_document_size",
                        default=None,
                        help="Prune large documents to this size limit (MB)")
    parser.add_argument("--debug",
                        default=False,
                        action="store_true",
                        help="Turn on verbose logging")
    parser.add_argument("--mock",
                        default=False,
                        action="store_true",
                        help="Use MOCK repository configuration for testing")
    parser.add_argument("--cache_path",
                        default=None,
                        help="Path containing cache directories")
    # parser.add_argument("--use_cache", default=False, action="store_true", help="Use cache files from remote resources")
    parser.add_argument("--rebuild_cache",
                        default=False,
                        action="store_true",
                        help="Rebuild cached resource files")
    # parser.add_argument("--rebuild_schema", default=False, action="store_true", help="Rebuild schema on-the-fly if not cached")
    #
    #
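    # Example invocation (script name and paths are illustrative only):
    #   python EtlExec.py --etl_repository_holdings --config_path ./exdb-config.yml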
    args = parser.parse_args()
    #
    debugFlag = args.debug
    if debugFlag:
        logger.setLevel(logging.DEBUG)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #                                       Configuration Details
    configPath = args.config_path
    configName = args.config_name
    # useCache = args.use_cache
    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuation path %s (%s)", configPath,
                        configName)
        else:
            logger.error("Missing or access issue with config file %r",
                         configPath)
            exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb",
                                   "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath,
                           defaultSectionName=defaultConfigName,
                           mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)
        #
    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s",
                     configPath, str(e))
        exit(1)

    #
    try:
        readBackCheck = args.read_back_check
        tU = TimeUtil()
        dataSetId = args.data_set_id if args.data_set_id else tU.getCurrentWeekSignature()
        seqDataLocator = args.sequence_cluster_data_path if args.sequence_cluster_data_path else cfgOb.getPath(
            "RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=configName)
        sandboxPath = args.sandbox_data_path if args.sandbox_data_path else cfgOb.getPath(
            "RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
        numProc = int(args.num_proc)
        chunkSize = int(args.chunk_size)
        documentLimit = int(
            args.document_limit) if args.document_limit else None

        loadType = "full" if args.full else "replace"
        # loadType = 'replace' if args.replace else 'full'

        cachePath = args.cache_path if args.cache_path else "."
        rebuildCache = args.rebuild_cache if args.rebuild_cache else False
        # rebuildSchemaFlag = args.rebuild_schema if args.rebuild_schema else False
        #
        # if args.document_style not in ['rowwise_by_name', 'rowwise_by_name_with_cardinality', 'columnwise_by_name', 'rowwise_by_id', 'rowwise_no_name']:
        #    logger.error("Unsupported document style %s" % args.document_style)

        if args.db_type != "mongo":
            logger.error("Unsupported database server type %s", args.db_type)
    except Exception as e:
        logger.exception("Argument processing problem %s", str(e))
        parser.print_help(sys.stderr)
        exit(1)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #  Rebuild or check resource cache
    ok = buildResourceCache(cfgOb,
                            configName,
                            cachePath,
                            rebuildCache=rebuildCache)
    if not ok:
        logger.error("Cache rebuild or check failure (rebuild %r) %r",
                     rebuildCache, cachePath)
        exit(1)
    ##
    if args.db_type == "mongo":
        if args.etl_entity_sequence_clusters:
            cw = SequenceClustersEtlWorker(cfgOb,
                                           numProc=numProc,
                                           chunkSize=chunkSize,
                                           documentLimit=documentLimit,
                                           verbose=debugFlag,
                                           readBackCheck=readBackCheck,
                                           workPath=cachePath)
            ok = cw.etl(dataSetId, seqDataLocator, loadType=loadType)
            okS = loadStatus(cw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.etl_repository_holdings:
            rhw = RepoHoldingsEtlWorker(cfgOb,
                                        sandboxPath,
                                        cachePath,
                                        numProc=numProc,
                                        chunkSize=chunkSize,
                                        documentLimit=documentLimit,
                                        verbose=debugFlag,
                                        readBackCheck=readBackCheck)
            ok = rhw.load(dataSetId, loadType=loadType)
            okS = loadStatus(rhw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        logger.info("Operation completed with status %r " % ok and okS)
Example #3
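# Schema update CLI: rebuilds or compares cached schema definitions and
# JSON/BSON schemas for the configured databases and collections.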
def main():
    parser = argparse.ArgumentParser()
    #
    defaultConfigName = "site_info_configuration"
    #
    parser.add_argument(
        "--update_chem_comp_ref",
        default=False,
        action="store_true",
        help="Update schema for Chemical Component reference definitions")
    parser.add_argument(
        "--update_chem_comp_core_ref",
        default=False,
        action="store_true",
        help="Update core schema for Chemical Component reference definitions")
    parser.add_argument(
        "--update_bird_chem_comp_ref",
        default=False,
        action="store_true",
        help="Update schema for Bird Chemical Component reference definitions")
    parser.add_argument(
        "--update_bird_chem_comp_core_ref",
        default=False,
        action="store_true",
        help=
        "Update core schema for Bird Chemical Component reference definitions")

    parser.add_argument("--update_bird_ref",
                        default=False,
                        action="store_true",
                        help="Update schema for Bird reference definitions")
    parser.add_argument(
        "--update_bird_family_ref",
        default=False,
        action="store_true",
        help="Update schema for Bird Family reference definitions")

    parser.add_argument("--update_pdbx",
                        default=False,
                        action="store_true",
                        help="Update schema for PDBx entry data")
    parser.add_argument("--update_pdbx_core",
                        default=False,
                        action="store_true",
                        help="Update schema for PDBx core entry/entity data")
    parser.add_argument(
        "--update_pdbx_comp_model_core",
        default=False,
        action="store_true",
        help="Update schema for PDBx computational model core entry/entity data"
    )
    #
    parser.add_argument("--update_repository_holdings",
                        default=False,
                        action="store_true",
                        help="Update schema for repository holdings")
    parser.add_argument("--update_entity_sequence_clusters",
                        default=False,
                        action="store_true",
                        help="Update schema for entity sequence clusters")
    parser.add_argument("--update_data_exchange",
                        default=False,
                        action="store_true",
                        help="Update schema for data exchange status")
    parser.add_argument("--update_ihm_dev",
                        default=False,
                        action="store_true",
                        help="Update schema for I/HM dev entry data")
    parser.add_argument("--update_drugbank_core",
                        default=False,
                        action="store_true",
                        help="Update DrugBank schema")
    #
    parser.add_argument(
        "--update_config_all",
        default=False,
        action="store_true",
        help="Update using configuration settings (e.g. DATABASE_NAMES_ALL)")
    parser.add_argument(
        "--update_config_deployed",
        default=False,
        action="store_true",
        help=
        "Update using configuration settings (e.g. DATABASE_NAMES_DEPLOYED)")
    parser.add_argument(
        "--update_config_test",
        default=False,
        action="store_true",
        help="Update using configuration settings (e.g. DATABASE_NAMES_TEST)")
    #
    parser.add_argument("--config_path",
                        default=None,
                        help="Path to configuration options file")
    parser.add_argument("--config_name",
                        default=defaultConfigName,
                        help="Configuration section name")
    #
    parser.add_argument("--cache_path",
                        default=None,
                        help="Schema cache directory path")
    parser.add_argument(
        "--encoding_types",
        default=None,
        help="Schema encoding (rcsb|json|bson) (comma separated)")
    parser.add_argument(
        "--validation_levels",
        default=None,
        help="Schema validation level (full|min) (comma separated)")
    parser.add_argument("--compare_only",
                        default=False,
                        action="store_true",
                        help="Perform comparison with cached schema")
    #
    parser.add_argument("--debug",
                        default=False,
                        action="store_true",
                        help="Turn on verbose logging")
    parser.add_argument(
        "--mock",
        default=False,
        action="store_true",
        help="Use MOCK repository configuration for dependencies and testing")
    # parser.add_argument("--working_path", default=None, help="Working/alternative path for temporary and schema files")
    args = parser.parse_args()
    #
    debugFlag = args.debug
    if debugFlag:
        logger.setLevel(logging.DEBUG)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #                                       Configuration Details
    configPath = args.config_path
    configName = args.config_name
    cachePath = args.cache_path
    compareOnly = args.compare_only
    #
    encodingTypes = args.encoding_types.split(
        ",") if args.encoding_types else []
    validationLevels = args.validation_levels.split(
        ",") if args.validation_levels else []
    dataTypingList = ["ANY", "SQL"]

    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuation path %s (%s)", configPath,
                        configName)
        else:
            logger.error("Missing or access issue with config file %r",
                         configPath)
            exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb",
                                   "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath,
                           defaultSectionName=defaultConfigName,
                           mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)
    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s",
                     configPath, str(e))
        exit(1)
    #
    databaseNameList = []
    if args.update_chem_comp_ref:
        databaseNameList.append("chem_comp")

    if args.update_bird_chem_comp_ref:
        databaseNameList.append("bird_chem_comp")

    if args.update_chem_comp_core_ref:
        databaseNameList.append("chem_comp_core")

    if args.update_bird_chem_comp_core_ref:
        databaseNameList.append("bird_chem_comp_core")

    if args.update_bird_ref:
        databaseNameList.append("bird")

    if args.update_bird_family_ref:
        databaseNameList.append("bird_family")

    if args.update_pdbx:
        databaseNameList.append("pdbx")

    if args.update_pdbx_core:
        databaseNameList.append("pdbx_core")

    if args.update_pdbx_comp_model_core:
        databaseNameList.append("pdbx_comp_model_core")

    if args.update_repository_holdings:
        databaseNameList.append("repository_holdings")

    if args.update_entity_sequence_clusters:
        databaseNameList.append("sequence_clusters")

    if args.update_data_exchange:
        databaseNameList.append("data_exchange")

    if args.update_ihm_dev:
        databaseNameList.append("ihm_dev")

    if args.update_drugbank_core:
        databaseNameList.append("drugbank_core")

    if args.update_config_deployed:
        databaseNameList = cfgOb.getList(
            "DATABASE_NAMES_DEPLOYED",
            sectionName="database_catalog_configuration")
        dataTypingList = cfgOb.getList(
            "DATATYPING_DEPLOYED",
            sectionName="database_catalog_configuration")
        validationLevels = cfgOb.getList(
            "VALIDATION_LEVELS_DEPLOYED",
            sectionName="database_catalog_configuration")
        encodingTypes = cfgOb.getList(
            "ENCODING_TYPES_DEPLOYED",
            sectionName="database_catalog_configuration")

    if args.update_config_all:
        databaseNameList = cfgOb.getList(
            "DATABASE_NAMES_ALL", sectionName="database_catalog_configuration")
        dataTypingList = cfgOb.getList(
            "DATATYPING_ALL", sectionName="database_catalog_configuration")
        validationLevels = cfgOb.getList(
            "VALIDATION_LEVELS_ALL",
            sectionName="database_catalog_configuration")
        encodingTypes = cfgOb.getList(
            "ENCODING_TYPES_ALL", sectionName="database_catalog_configuration")

    if args.update_config_test:
        databaseNameList = cfgOb.getList(
            "DATABASE_NAMES_TEST",
            sectionName="database_catalog_configuration")
        dataTypingList = cfgOb.getList(
            "DATATYPING_TEST", sectionName="database_catalog_configuration")
        validationLevels = cfgOb.getList(
            "VALIDATION_LEVELS_TEST",
            sectionName="database_catalog_configuration")
        encodingTypes = cfgOb.getList(
            "ENCODING_TYPES_TEST",
            sectionName="database_catalog_configuration")
    #
    scnD = cfgOb.get("document_collection_names",
                     sectionName="document_helper_configuration")
    #
    databaseNameList = list(set(databaseNameList))
    logger.debug("Collections %s", list(scnD.items()))
    logger.debug("databaseNameList %s", databaseNameList)

    if compareOnly:
        schP = SchemaProvider(cfgOb, cachePath, useCache=True)
        difPathList = []
        for databaseName in databaseNameList:
            for dataTyping in dataTypingList:
                logger.debug("Building schema %s with types %s", databaseName,
                             dataTyping)
                pth = schP.schemaDefCompare(databaseName, dataTyping)
                if pth:
                    difPathList.append(pth)
        if difPathList:
            logger.info("Schema definition difference path list %r",
                        difPathList)
        difPathList = []
        for databaseName in databaseNameList:
            dD = schP.makeSchemaDef(databaseName,
                                    dataTyping="ANY",
                                    saveSchema=False)
            sD = SchemaDefAccess(dD)
            for cd in sD.getCollectionInfo():
                collectionName = cd["NAME"]
                for encodingType in encodingTypes:
                    if encodingType.lower() != "json":
                        continue
                    for level in validationLevels:
                        pth = schP.jsonSchemaCompare(databaseName,
                                                     collectionName,
                                                     encodingType, level)
                        if pth:
                            difPathList.append(pth)
        if difPathList:
            logger.info("JSON schema difference path list %r", difPathList)

    else:
        schP = SchemaProvider(cfgOb, cachePath, useCache=False)
        for databaseName in databaseNameList:
            for encodingType in encodingTypes:
                if encodingType == "rcsb":
                    for dataTyping in dataTypingList:
                        logger.info(
                            "Creating schema definition for content type %s data typing %s",
                            databaseName, dataTyping)
                        schP.makeSchemaDef(databaseName,
                                           dataTyping=dataTyping,
                                           saveSchema=True)
                else:
                    if databaseName in scnD:
                        for dD in scnD[databaseName]:
                            collectionName = dD["NAME"]
                            for validationLevel in validationLevels:
                                logger.info(
                                    "Creating %r schema for content type %s collection %s",
                                    encodingType, databaseName, collectionName)
                                schP.makeSchema(databaseName,
                                                collectionName,
                                                encodingType=encodingType,
                                                level=validationLevel,
                                                saveSchema=True)
Example #4
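# Repository scan CLI: scans repository data files and writes type map and
# coverage summaries.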
def main():
    parser = argparse.ArgumentParser()
    defaultConfigName = "site_info_configuration"
    #
    parser.add_argument("--scanType", default="full", help="Repository scan type (full|incr)")
    #
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--scan_chem_comp_ref", default=False, action="store_true", help="Scan Chemical Component reference definitions (public subset)")
    group.add_argument("--scan_chem_comp_core_ref", default=False, action="store_true", help="Scan Chemical Component Core reference definitions (public subset)")
    group.add_argument("--scan_bird_chem_comp_ref", default=False, action="store_true", help="Scan Bird Chemical Component reference definitions (public subset)")
    group.add_argument("--scan_bird_chem_comp_core_ref", default=False, action="store_true", help="Scan Bird Chemical Component Core reference definitions (public subset)")
    group.add_argument("--scan_bird_ref", default=False, action="store_true", help="Scan Bird reference definitions (public subset)")
    group.add_argument("--scan_bird_family_ref", default=False, action="store_true", help="Scan Bird Family reference definitions (public subset)")
    group.add_argument("--scan_entry_data", default=False, action="store_true", help="Scan PDB entry data (current released subset)")
    group.add_argument("--scan_obsolete_entry_data", default=False, action="store_true", help="Scan obsolete PDB entry data")
    group.add_argument("--scan_comp_model_data", default=False, action="store_true", help="Scan computational model files (mock-data subset)")
    group.add_argument("--scan_ihm_dev", default=False, action="store_true", help="Scan PDBDEV I/HM entry data (current released subset)")
    #
    parser.add_argument("--config_path", default=None, help="Path to configuration options file")
    parser.add_argument("--config_name", default=defaultConfigName, help="Configuration section name")

    parser.add_argument("--input_file_list_path", default=None, help="Input file containing file paths to scan")
    parser.add_argument("--output_file_list_path", default=None, help="Output file containing file paths scanned")
    parser.add_argument("--fail_file_list_path", default=None, help="Output file containing file paths that fail scan")
    parser.add_argument("--scan_data_file_path", default=None, help="Output working file storing scan data (Pickle)")
    parser.add_argument("--coverage_file_path", default=None, help="Coverage map (JSON) output path")
    parser.add_argument("--coverage_item_file_path", default=None, help="Coverage by item (tdd) output path")
    parser.add_argument("--type_map_file_path", default=None, help="Type map (JSON) output path")

    parser.add_argument("--num_proc", default=2, help="Number of processes to execute (default=2)")
    parser.add_argument("--chunk_size", default=10, help="Number of files loaded per process")
    parser.add_argument("--file_limit", default=None, help="Load file limit for testing")
    parser.add_argument("--debug", default=False, action="store_true", help="Turn on verbose logging")
    parser.add_argument("--mock", default=False, action="store_true", help="Use MOCK repository configuration for testing")
    parser.add_argument("--cache_path", default=None, help="Cache path and working direcory for temporary files")
    args = parser.parse_args()
    #
    debugFlag = args.debug
    if debugFlag:
        logger.setLevel(logging.DEBUG)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #                                       Configuration Details
    configPath = args.config_path
    configName = args.config_name
    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuration path %s (%s)", configPath, configName)
        else:
            logger.error("Missing or access issue with config file %r", configPath)
            exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=defaultConfigName, mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)
    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s", configPath, str(e))
        exit(1)

    #
    try:
        numProc = int(args.num_proc)
        chunkSize = int(args.chunk_size)
        fileLimit = int(args.file_limit) if args.file_limit else None
        #
        failedFilePath = args.fail_file_list_path

        scanType = args.scanType
        #
        inputFileListPath = args.input_file_list_path
        outputFileListPath = args.output_file_list_path
        scanDataFilePath = args.scan_data_file_path
        dataCoverageFilePath = args.coverage_file_path
        dataCoverageItemFilePath = args.coverage_item_file_path
        dataTypeFilePath = args.type_map_file_path
        cachePath = args.cache_path if args.cache_path else "."
    except Exception as e:
        logger.exception("Argument processing problem %s", str(e))
        parser.print_help(sys.stderr)
        exit(1)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #
    # Read any input path lists -
    #
    inputPathList = None
    if inputFileListPath:
        mu = MarshalUtil(workPath=cachePath)
        inputPathList = mu.doImport(inputFileListPath, fmt="list")
    #
    ##

    if args.scan_chem_comp_ref:
        contentType = "chem_comp"

    elif args.scan_chem_comp_core_ref:
        contentType = "chem_comp_core"

    elif args.scan_bird_chem_comp_ref:
        contentType = "bird_chem_comp"

    elif args.scan_bird_chem_comp_core_ref:
        contentType = "bird_chem_comp_core"

    elif args.scan_bird_ref:
        contentType = "bird"

    elif args.scan_bird_family_ref:
        contentType = "bird_family"

    elif args.scan_entry_data:
        contentType = "pdbx"

    elif args.scan_obsolete_entry_data:
        contentType = "pdbx_obsolete"

    elif args.scan_comp_model_data:
        contentType = "pdbx_comp_model_core"

    elif args.scan_ihm_dev:
        contentType = "ihm_dev"

    else:
        logger.error("No scan content type option specified")
        exit(1)

    ok = scanRepo(
        cfgOb,
        contentType,
        scanDataFilePath,
        numProc,
        chunkSize,
        fileLimit,
        scanType=scanType,
        inputPathList=inputPathList,
        pathListFilePath=outputFileListPath,
        dataCoverageFilePath=dataCoverageFilePath,
        dataCoverageItemFilePath=dataCoverageItemFilePath,
        dataTypeFilePath=dataTypeFilePath,
        failedFilePath=failedFilePath,
        cachePath=cachePath,
    )

    logger.info("Operation completed with status %r", ok)
class ReferenceSequenceUtilsTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(ReferenceSequenceUtilsTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config",
                                  "dbload-setup-example.yml")
        #
        # Caution: this is a very site-specific setting!
        configName = "site_info_remote"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)
        if configName != "site_info_configuration":
            self.__cfgOb.replaceSectionName("site_info_configuration",
                                            configName)
        #
        self.__workPath = os.path.join(HERE, "test-cache-preserve")
        #
        self.__entityPolymerCachePath = os.path.join(
            self.__workPath, "entity-polymer-data-cache.pic")
        self.__entityPolymerCacheKwargs = {"fmt": "pickle"}
        self.__useEntityPolymerCache = True
        #
        self.__refDbCachePath = os.path.join(self.__workPath,
                                             "unp-data-test-cache.json")
        self.__refDbCacheKwargs = {"fmt": "json", "indent": 3}
        #
        self.__refDbUseCache = True
        self.__fetchLimit = 500
        #
        self.__mU = MarshalUtil()
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def testUpdateUniProtCache(self):
        """Test case - extract entity polymer info and update reference sequence cache"""
        try:
            refDbName = "UNP"
            rsu = ReferenceSequenceUtils(
                self.__cfgOb,
                refDbName,
                referenceCachePath=self.__refDbCachePath,
                referenceCacheKwargs=self.__refDbCacheKwargs,
                useReferenceCache=self.__refDbUseCache,
                entityPolymerCachePath=self.__entityPolymerCachePath,
                entityPolymerCacheKwargs=self.__entityPolymerCacheKwargs,
                useEntityPolymerCache=self.__useEntityPolymerCache,
                fetchLimit=self.__fetchLimit,
            )
            numPrimary, numSecondary, numNone = rsu.getReferenceAccessionAlignSummary()
            self.assertGreaterEqual(numPrimary, 70)
            logger.info("For %r matched primary:  %d secondary: %d none %d",
                        refDbName, numPrimary, numSecondary, numNone)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()


class EntityPolymerExtractorFullTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(EntityPolymerExtractorFullTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config",
                                  "dbload-setup-example.yml")
        #
        # Caution: this is a very site-specific setting
        #
        configName = "site_info_remote"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)
        if configName != "site_info_configuration":
            self.__cfgOb.replaceSectionName("site_info_configuration",
                                            configName)
        #
        #
        self.__workPath = os.path.join(HERE, "test-cache-preserve")
        #
        self.__fullCacheKwargs = {"fmt": "pickle"}
        self.__fullEntitySaveCachePath = os.path.join(
            self.__workPath, "entity-polymer-data-cache.pic")
        #
        self.__mU = MarshalUtil()
        self.__entryLimitFull = 50
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    @unittest.skip("rebuild cache")
    def testRebuildCache(self):
        """Test case - extract entity polymer info - rebuild full cache of extracted entity polymer data -"""
        try:
            epe = EntityPolymerExtractor(
                self.__cfgOb,
                saveCachePath=self.__fullEntitySaveCachePath,
                useCache=False,
                saveCacheKwargs=self.__fullCacheKwargs,
                entryLimit=self.__entryLimitFull)
            eCount = epe.getEntryCount()
            if self.__entryLimitFull is not None:
                self.assertGreaterEqual(eCount, self.__entryLimitFull)
            else:
                self.assertGreaterEqual(eCount, 10)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testAccessEntityPolymerFeatures(self):
        """Test case - access cached entity polymer info from full cache"""
        try:
            epe = EntityPolymerExtractor(
                self.__cfgOb,
                saveCachePath=self.__fullEntitySaveCachePath,
                useCache=True,
                saveCacheKwargs=self.__fullCacheKwargs)
            eCount = epe.getEntryCount()
            logger.info("Entry count %d", eCount)
            self.assertGreaterEqual(eCount, self.__entryLimitFull)
            #
            unpL = epe.getRefSeqAccessions("UNP")
            logger.info("Ref seq count %d", len(unpL))
            self.assertGreaterEqual(len(unpL), 1)
            #
            testOp = False
            if testOp:
                for entryId in ["1CP9"]:
                    for entityId in ["1", "2"]:
                        uL = epe.getEntityRefSeqAccessions(
                            "UNP", entryId, entityId)
                        logger.debug("UNP for %s %s %r", entryId, entityId, uL)
                #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testAccessEntityPolymerReadCache(self):
        """Test case - access cached entity polymer info from full cache"""
        try:
            epe = EntityPolymerExtractor(
                self.__cfgOb,
                saveCachePath=self.__fullEntitySaveCachePath,
                useCache=True,
                saveCacheKwargs=self.__fullCacheKwargs)
            logger.info("Cache entry count %d", epe.getEntryCount())
            cD = epe.countRefSeqAccessions("UNP")
            self.assertGreaterEqual(len(cD), 2)
            #
            logger.info("UNP reference sequences per entity %r",
                        dict(sorted(cD.items())))
            logger.info("Reference sequences per entity %r",
                        dict(sorted(epe.countRefSeqAccessionAny().items())))
            logger.info("Reference sequences per ref db %r",
                        dict(sorted(epe.countRefSeqAccessionDbType().items())))
            #
            ok = epe.checkRefSeqAlignRange("UNP")
            self.assertTrue(ok)
            unpL = epe.getRefSeqAccessions("UNP")
            logger.info("Unique UNP reference sequences %d", len(unpL))
            self.assertGreaterEqual(len(unpL), 1)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testTaxonomyEntityPolymerReadCache(self):
        """Test case - evaluate taxonomy - from full cache"""
        try:
            taxIdList = [562, 9606, 3701]
            for taxId in taxIdList:
                tU = TaxonomyUtils(taxDirPath=self.__workPath)
                tL = tU.getLineage(taxId)
                logger.info("Taxonomy lineage for %d %r", taxId, tL)
                #
                #
                epe = EntityPolymerExtractor(
                    self.__cfgOb,
                    saveCachePath=self.__fullEntitySaveCachePath,
                    useCache=True,
                    saveCacheKwargs=self.__fullCacheKwargs)
                logger.info("Cache entry count %d", epe.getEntryCount())
                logger.info(
                    "Reference sequences per ref db %r",
                    dict(sorted(epe.countRefSeqAccessionDbType().items())))
                rD = epe.countRefSeqAccessionByTaxon(dbNameList=["UNP"])
                logger.info("Unique taxons %d", len(list(rD.keys())))
                #
                numT = 0
                for tId, aL in rD.items():
                    tL = tU.getLineage(tId)
                    if taxId in tL:
                        tc = len(set(aL))
                        logger.info("Matched %5d %s (%r)", tc,
                                    tU.getScientificName(tId), tId)
                        numT += tc
                logger.info("Total matched accessions %d ", numT)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()