Example #1
 def test_extract_bag_archive_zip_with_relocate_existing(self):
     logger.info(self.getTestHeader('extract bag zip format, relocate existing'))
     try:
         bag_path = bdb.extract_bag(ospj(self.test_archive_dir, 'test-bag.zip'), temp=False)
         self.assertTrue(ospe(bag_path))
         self.assertTrue(bdb.is_bag(bag_path))
         bag_path = bdb.extract_bag(ospj(self.test_archive_dir, 'test-bag.zip'), temp=False)
         self.assertTrue(ospe(bag_path))
         self.assertTrue(bdb.is_bag(bag_path))
         bdb.cleanup_bag(os.path.dirname(bag_path))
         output = self.stream.getvalue()
         self.assertExpectedMessages(["moving existing directory"], output)
     except Exception as e:
         self.fail(get_typed_exception(e))
Example #2
    def parse(self, bag_archive, output_path="out"):
        """ Analyze the bag, consuming BagIt-RO metadata into a structure downstream code emitters can use. """
        manifest = {}
        """ Extract the bag. """
        bag_path = bdbag_api.extract_bag(bag_archive, output_path=output_path)
        if bdbag_api.is_bag(bag_path):

            logger.debug("Initializing metadata datasets")
            manifest['path'] = bag_path
            manifest['datasets'] = {}
            datasets = manifest['datasets']
            data_path = os.path.join(bag_path, "data")
            """ Extract tarred files. """
            tar_data_files = glob.glob(os.path.join(data_path, "*.csv.gz"))
            for f in tar_data_files:
                with gzip.open(f, 'rb') as zipped:
                    extracted = f.replace(".gz", "")
                    with open(extracted, "wb") as stream:
                        file_content = zipped.read()
                        stream.write(file_content)
            """ Collect metadata for each file. """
            data_files = glob.glob(os.path.join(data_path, "*.csv"))
            csv_filter = CSVFilter()
            for f in data_files:
                csv_filter.filter_data(f)
                logger.debug(f"  --collecting metadata for: {f}")
                jsonld_context = self._get_jsonld_context(f)
                datasets[f] = jsonld_context
                context = datasets[f]['@context']
                datasets[f]['columns'] = {
                    k: None
                    for k in context if isinstance(context[k], dict)
                }
        return manifest
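
The manifest layout produced by parse() is only implied by the code above, so here is a minimal, hypothetical sketch of how a caller might consume the returned structure; the owning class name (BagParser) and the archive filename are assumptions for illustration only.

# Hypothetical usage of the parse() method shown above.
parser = BagParser()  # assumed owner class of parse()
manifest = parser.parse("example-bag.zip", output_path="out")
for csv_path, meta in manifest.get("datasets", {}).items():
    # Each dataset entry holds the JSON-LD @context plus a map of column names.
    print(csv_path, sorted(meta["columns"]))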
Example #3
def ts_validate(data_path, schema=None):
    """Validate a given TableSchema using frictionless.

    Arguments:
        data_path (str): Path to the TableSchema JSON or BDBag directory
                or BDBag archive to validate.
        schema (str): The schema to validate against. If not provided,
                the data is only validated against the defined TableSchema.
                Default None.

    Returns:
        dict: The validation results.
            is_valid (bool): Is the TableSchema valid?
            raw_errors (list): The raw Exceptions generated from any validation errors.
            error (str): A formatted error message about any validation errors.
    """
    if os.path.isfile(data_path):
        archive_file = data_path
        try:
            data_path = bdbag_api.extract_bag(data_path, temp=True)
        except Exception as e:
            raise InvalidInput("Error extracting %s: %s" % (archive_file, e))
        if not bdbag_api.is_bag(data_path):
            raise InvalidInput(
                "Input %s does not appear to be a valid BDBag. This tool requires a"
                " prepared BDBag archive when invoked on an existing archive file."
                % archive_file)

    # If data_path is a directory, find JSON
    if os.path.isdir(data_path):
        if "data" in os.listdir(data_path):
            data_path = os.path.join(data_path, "data")
        desc_file_list = [
            filename for filename in os.listdir(data_path)
            if filename.endswith(".json") and not filename.startswith(".")
        ]
        if len(desc_file_list) < 1:
            raise ValidationException("No TableSchema JSON file found")
        elif len(desc_file_list) > 1:
            raise ValidationException("Mutiple JSON files found in directory")
        else:
            data_path = os.path.join(data_path, desc_file_list[0])

    # Read into Package
    try:
        pkg = Package(data_path)
        report = validate(pkg, schema=schema)
    except FrictionlessException as e:
        raise ValidationException("Validation error\n%s" % e.error.message)

    if not report.valid:
        if report.errors:
            msg = report.errors[0]['message']
        else:
            for task in report['tasks']:
                if not task.valid:
                    msg = task['resource']['path'] + "\n"
                    msg += task['errors'][0]['message']
        raise ValidationException("Validation error in %s" % msg)
Example #4
 def test_extract_bag_archive_tar(self):
     logger.info(self.getTestHeader('extract bag tar format'))
     try:
         bag_path = bdb.extract_bag(ospj(self.test_archive_dir, 'test-bag.tar'), temp=True)
         self.assertTrue(ospe(bag_path))
         self.assertTrue(bdb.is_bag(bag_path))
         bdb.cleanup_bag(os.path.dirname(bag_path))
     except Exception as e:
         self.fail(get_typed_exception(e))
Example #5
 def test_extract_bag_archive_tar(self):
     logger.info(self.getTestHeader('extract bag tar format'))
     try:
         bag_path = bdb.extract_bag(ospj(self.test_archive_dir, 'test-bag.tar'), temp=True)
         self.assertTrue(ospe(bag_path))
         self.assertTrue(bdb.is_bag(bag_path))
         bdb.cleanup_bag(os.path.dirname(bag_path))
     except Exception as e:
         self.fail(bdbag.get_named_exception(e))
Example #6
 def test_materialize_from_dir(self):
     logger.info(self.getTestHeader('test materialize from dir'))
     curdir = os.getcwd()
     os.chdir(self.tmpdir)
     try:
         bag_path = bdb.materialize(self.test_bag_fetch_http_dir)
         self.assertTrue(bdb.is_bag(bag_path))
     except Exception as e:
         self.fail(bdbag.get_typed_exception(e))
     finally:
         os.chdir(curdir)
Example #7
 def test_materialize_non_bag(self):
     logger.info(self.getTestHeader('test materialize non-bag'))
     curdir = os.getcwd()
     os.chdir(self.tmpdir)
     try:
         bag_path = bdb.materialize(self.test_data_dir)
         self.assertFalse(bdb.is_bag(bag_path))
     except Exception as e:
         self.fail(bdbag.get_typed_exception(e))
     finally:
         os.chdir(curdir)
Example #8
 def test_materialize_from_file(self):
     logger.info(self.getTestHeader('test materialize from file'))
     curdir = os.getcwd()
     os.chdir(self.tmpdir)
     try:
         bag_path = bdb.materialize(
             ospj(self.test_archive_dir, 'test-bag-fetch-http.zip'))
         self.assertTrue(bdb.is_bag(bag_path))
     except Exception as e:
         self.fail(bdbag.get_typed_exception(e))
     finally:
         os.chdir(curdir)
Example #9
 def test_materialize_from_url(self):
     logger.info(self.getTestHeader('test materialize from URL'))
     curdir = os.getcwd()
     os.chdir(self.tmpdir)
     try:
         bag_path = bdb.materialize(
             "https://github.com/fair-research/bdbag/raw/master/test/test-data/test-archives/"
             "test-bag.zip")
         self.assertTrue(bdb.is_bag(bag_path))
     except Exception as e:
         self.fail(bdbag.get_typed_exception(e))
     finally:
         os.chdir(curdir)
Example #10
 def create_file(self) -> Tuple[str, Optional[str]]:
     with TemporaryDirectory() as temp_path:
         bag_path = os.path.join(temp_path, 'manifest')
         os.makedirs(bag_path)
         bdbag_api.make_bag(bag_path)
         with open(os.path.join(bag_path, 'data', 'participants.tsv'), 'w') as samples_tsv:
             self._samples_tsv(samples_tsv)
         bag = bdbag_api.make_bag(bag_path, update=True)  # update TSV checksums
         assert bdbag_api.is_bag(bag_path)
         bdbag_api.validate_bag(bag_path)
         assert bdbag_api.check_payload_consistency(bag)
         temp, temp_path = mkstemp()
         os.close(temp)
         archive_path = bdbag_api.archive_bag(bag_path, 'zip')
         # Moves the bdbag archive out of the temporary directory. This prevents
         # the archive from being deleted when the temporary directory self-destructs.
         os.rename(archive_path, temp_path)
         return temp_path, None
Example #11
 def checkIfBag(self):
     if not self.currentPath:
         self.isBag = False
     else:
         if os.path.isdir(self.currentPath):
             QApplication.setOverrideCursor(Qt.WaitCursor)
             self.isBag = bdb.is_bag(self.currentPath)
             QApplication.restoreOverrideCursor()
             if self.isBag:
                 self.updateStatus(
                     "The directory [%s] is a bag." % self.currentPath,
                     True)
             else:
                 self.updateStatus(
                     "The directory [%s] is NOT a bag." % self.currentPath,
                     False)
         else:
             self.isBag = False
Example #12
def validate_user_submission(data_path,
                             schema,
                             output_dir=None,
                             delete_dir=False,
                             handle_git_repos=True,
                             bdbag_kwargs=None):
    """
    Arguments:
        data_path (str): The path to the data to ingest into DERIVA. The path can be:
                1) A directory to be formatted into a BDBag
                2) A Git repository to be copied into a BDBag
                3) A premade BDBag directory
                4) A premade BDBag in an archive file
        schema (str): The named schema or schema file link to validate data against.
                Default None, to only validate against the declared TableSchema.
        output_dir (str): The path to create an output directory in. The resulting
                BDBag archive will be named after this directory.
                If not set, the directory will be turned into a BDBag in-place.
                For Git repositories, this is automatically set, but can be overridden.
                If data_path is a file, this has no effect.
                This dir MUST NOT be in the `data_path` directory or any subdirectories.
                Default None.
        delete_dir (bool): Should the output_dir be deleted after submission?
                Has no effect if output_dir is not specified.
                For Git repositories, this is always True.
                Default False.
        handle_git_repos (bool): Should Git repositories be detected and handled?
                When this is False, Git repositories are handled as simple directories
                instead of Git repositories.
                Default True.
        bdbag_kwargs (dict): Extra keyword arguments to pass to bdbag's make_bag().
    """
    bdbag_kwargs = bdbag_kwargs or {}
    data_path = os.path.abspath(data_path)
    if not os.path.exists(data_path):
        raise FileNotFoundError("Path '{}' does not exist".format(data_path))

    if handle_git_repos:
        logger.debug("Checking for a Git repository")
        # If Git repo, set output_dir appropriately
        try:
            repo = git.Repo(data_path, search_parent_directories=True)
        # Not Git repo
        except git.InvalidGitRepositoryError:
            logger.debug("Not a Git repo")
        # Path not found, turn into standard FileNotFoundError
        except git.NoSuchPathError:
            raise FileNotFoundError(
                "Path '{}' does not exist".format(data_path))
        # Is Git repo
        else:
            logger.debug("Git repo found, collecting metadata")
            # Needs to not have slash at end - is known Git repo already, slash
            # interferes with os.path.basename/dirname
            if data_path.endswith("/"):
                data_path = data_path[:-1]
            # Set output_dir to new dir named with HEAD commit hash
            new_dir_name = "{}_{}".format(os.path.basename(data_path),
                                          str(repo.head.commit))
            output_dir = os.path.join(os.path.dirname(data_path), new_dir_name)
            # Delete temp dir after archival
            delete_dir = True

    # If dir and not already BDBag, make BDBag
    if os.path.isdir(data_path) and not bdbag_api.is_bag(data_path):
        logger.debug("Creating BDBag out of directory '{}'".format(data_path))
        # If output_dir specified, copy data to output dir first
        if output_dir:
            logger.debug("Copying data to '{}' before creating BDBag".format(
                output_dir))
            output_dir = os.path.abspath(output_dir)
            # If shutil.copytree is called when the destination dir is inside the source dir
            # by more than one layer, it will recurse infinitely.
            # (e.g. /source => /source/dir/dest)
            # Exactly one layer is technically okay (e.g. /source => /source/dest),
            # but it's easier to forbid all parent/child dir cases.
            # Check for this error condition by determining if output_dir is a child
            # of data_path.
            if os.path.commonpath([data_path]) == os.path.commonpath(
                [data_path, output_dir]):
                raise ValueError(
                    "The output_dir ('{}') must not be in data_path ('{}')".
                    format(output_dir, data_path))
            try:
                shutil.copytree(data_path, output_dir)
            except FileExistsError:
                raise FileExistsError(
                    ("The output directory must not exist. "
                     "Delete '{}' to submit.\nYou can set delete_dir=True "
                     "to avoid this issue in the future.").format(output_dir))
            # Process new dir instead of old path
            data_path = output_dir
        # If output_dir not specified, never delete data dir
        else:
            delete_dir = False
        # Make bag
        bdbag_api.make_bag(data_path, **bdbag_kwargs)
        if not bdbag_api.is_bag(data_path):
            raise ValueError(
                "Failed to create BDBag from {}".format(data_path))
        logger.debug("BDBag created at '{}'".format(data_path))

    # If dir (must be BDBag at this point), archive
    if os.path.isdir(data_path):
        logger.debug("Archiving BDBag at '{}' using '{}'".format(
            data_path, CONFIG["ARCHIVE_FORMAT"]))
        new_data_path = bdbag_api.archive_bag(data_path,
                                              CONFIG["ARCHIVE_FORMAT"])
        logger.debug("BDBag archived to file '{}'".format(new_data_path))
        # If requested (e.g. Git repo copied dir), delete data dir
        if delete_dir:
            logger.debug("Removing old directory '{}'".format(data_path))
            shutil.rmtree(data_path)
        # Overwrite data_path - don't care about dir for uploading
        data_path = new_data_path

    # Validate TableSchema in BDBag
    logger.debug("Validating TableSchema in BDBag '{}'".format(data_path))
    validation_res = ts_validate(data_path, schema=schema)
    if not validation_res["is_valid"]:
        raise exc.ValidationException(
            "TableSchema invalid due to the following errors: "
            "\n{}\n".format(validation_res["error"]))

    logger.debug("Validation successful")
    return data_path
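
A minimal, hypothetical invocation of validate_user_submission() following its docstring; both paths are placeholders, and the named schema is left as None so the data is validated only against its declared TableSchema.

# Hypothetical call; both paths are placeholders.
archive_path = validate_user_submission(
    "/data/my_submission",                # directory to be packaged into a BDBag
    schema=None,                          # validate only against the declared TableSchema
    output_dir="/tmp/my_submission_bag",  # must not live inside data_path
    delete_dir=True,
    handle_git_repos=False,
)
print("BDBag archive ready for submission:", archive_path)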
Example #13
def main():

    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.path)

    archive = None
    temp_path = None
    error = None
    result = 0

    if not args.quiet:
        sys.stderr.write('\n')

    try:
        if not is_file:
            # do not try to create or update the bag if the user just wants to validate or complete an existing bag
            if not (
                (args.validate or args.validate_profile or args.resolve_fetch)
                    and not (args.update and bdb.is_bag(path))):
                if args.checksum and 'all' in args.checksum:
                    args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
                # create or update the bag depending on the input arguments
                bdb.make_bag(path,
                             algs=args.checksum,
                             update=args.update,
                             save_manifests=not args.skip_manifests,
                             prune_manifests=args.prune_manifests,
                             metadata=BAG_METADATA if BAG_METADATA else None,
                             metadata_file=args.metadata_file,
                             remote_file_manifest=args.remote_file_manifest,
                             config_file=args.config_file,
                             ro_metadata_file=args.ro_metadata_file)

        # otherwise just extract the bag if it is an archive and no other conflicting options specified
        elif not (args.validate or args.validate_profile
                  or args.resolve_fetch):
            bdb.extract_bag(path)
            if not args.quiet:
                sys.stderr.write('\n')
            return result

        if args.ro_manifest_generate:
            bdb.generate_ro_manifest(
                path,
                True if args.ro_manifest_generate == "overwrite" else False,
                config_file=args.config_file)

        if args.resolve_fetch:
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(
                path,
                force=True if args.resolve_fetch == 'all' else False,
                keychain_file=args.keychain_file,
                config_file=args.config_file,
                filter_expr=args.fetch_filter)

        if args.validate:
            if is_file:
                temp_path = bdb.extract_bag(path, temp=True)
            if args.validate == 'structure':
                bdb.validate_bag_structure(temp_path if temp_path else path)
            else:
                bdb.validate_bag(
                    temp_path if temp_path else path,
                    fast=True if args.validate == 'fast' else False,
                    config_file=args.config_file)

        if args.archiver:
            archive = bdb.archive_bag(path, args.archiver)

        if archive is None and is_file:
            archive = path

        if args.validate_profile:
            if is_file:
                if not temp_path:
                    temp_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(
                temp_path if temp_path else path)
            bdb.validate_bag_serialization(archive if archive else path,
                                           profile)

        if args.revert:
            bdb.revert_bag(path)

    except Exception as e:
        result = 1
        error = "Error: %s" % get_typed_exception(e)

    finally:
        if temp_path:
            bdb.cleanup_bag(os.path.dirname(temp_path))
        if result != 0:
            sys.stderr.write("\n%s" % error)

    if not args.quiet:
        sys.stderr.write('\n')

    return result
Example #14
def parse_cli():
    description = 'BDBag utility for working with Bagit/RO archives'

    parser = argparse.ArgumentParser(
        description=description,
        epilog="For more information see: http://github.com/fair-research/bdbag"
    )

    parser.add_argument('--version', action='version', version=VERSION)

    standard_args = parser.add_argument_group('Bag arguments')

    update_arg = "--update"
    standard_args.add_argument(
        update_arg,
        action="store_true",
        help=
        "Update an existing bag dir, regenerating manifests and fetch.txt if necessary."
    )

    revert_arg = "--revert"
    standard_args.add_argument(
        revert_arg,
        action="store_true",
        help=
        "Revert an existing bag directory back to a normal directory, deleting all bag metadata files. "
        "Payload files in the \'data\' directory will be moved back to the directory root, and the \'data\' "
        "directory will be deleted.")

    archiver_arg = "--archiver"
    standard_args.add_argument(
        archiver_arg,
        choices=['zip', 'tar', 'tgz'],
        help="Archive a bag using the specified format.")

    checksum_arg = "--checksum"
    standard_args.add_argument(
        checksum_arg,
        action='append',
        choices=['md5', 'sha1', 'sha256', 'sha512', 'all'],
        help=
        "Checksum algorithm to use: can be specified multiple times with different values. "
        "If \'all\' is specified, every supported checksum will be generated")

    skip_manifests_arg = "--skip-manifests"
    standard_args.add_argument(
        skip_manifests_arg,
        action='store_true',
        help=str(
            "If \'skip-manifests\' is specified in conjunction with %s, only tagfile manifests will be "
            "regenerated, with payload manifests and fetch.txt (if any) left as is. This argument should be used "
            "when only bag metadata has changed." % update_arg))

    prune_manifests_arg = "--prune-manifests"
    standard_args.add_argument(
        prune_manifests_arg,
        action='store_true',
        help=
        "If specified, any existing checksum manifests not explicitly configured via either"
        " the \"checksum\" argument(s) or configuration file will be deleted from the bag during an update."
    )

    fetch_arg = "--resolve-fetch"
    standard_args.add_argument(
        fetch_arg,
        "--fetch",
        choices=['all', 'missing'],
        help="Download remote files listed in the bag's fetch.txt file. "
        "The \"missing\" option only attempts to fetch files that do not "
        "already exist in the bag payload directory. "
        "The \"all\" option causes all fetch files to be re-acquired,"
        " even if they already exist in the bag payload directory.")

    fetch_filter_arg = "--fetch-filter"
    standard_args.add_argument(
        fetch_filter_arg,
        metavar="<column><operator><value>",
        help=
        "A simple expression of the form <column><operator><value> where: <column> is the name of a column in "
        "the bag's fetch.txt to be filtered on, <operator> is one of the following tokens; %s, and <value> is a "
        "string pattern or integer to be filtered against." % FILTER_DOCSTRING)

    validate_arg = "--validate"
    standard_args.add_argument(
        validate_arg,
        choices=['fast', 'full', 'structure'],
        help=
        "Validate a bag directory or bag archive. If \"fast\" is specified, Payload-Oxum (if present) will be "
        "used to check that the payload files are present and accounted for. If \"full\" is specified, "
        "all checksums will be regenerated and compared to the corresponding entries in the manifest. "
        "If \"structure\" is specified, the bag will be checked for structural validity only."
    )

    validate_profile_arg = "--validate-profile"
    standard_args.add_argument(
        validate_profile_arg,
        action="store_true",
        help="Validate a bag against the profile specified by the bag's "
        "\"BagIt-Profile-Identifier\" metadata field, if present.")

    config_file_arg = "--config-file"
    standard_args.add_argument(
        config_file_arg,
        default=DEFAULT_CONFIG_FILE,
        metavar='<file>',
        help=
        "Optional path to a configuration file. If this argument is not specified, the configuration file "
        "defaults to: %s " % DEFAULT_CONFIG_FILE)

    keychain_file_arg = "--keychain-file"
    standard_args.add_argument(
        keychain_file_arg,
        default=DEFAULT_KEYCHAIN_FILE,
        metavar='<file>',
        help=
        "Optional path to a keychain file. If this argument is not specified, the keychain file "
        "defaults to: %s " % DEFAULT_KEYCHAIN_FILE)

    metadata_file_arg = "--metadata-file"
    standard_args.add_argument(
        metadata_file_arg,
        metavar='<file>',
        help="Optional path to a JSON formatted metadata file")

    ro_metadata_file_arg = "--ro-metadata-file"
    standard_args.add_argument(
        ro_metadata_file_arg,
        metavar='<file>',
        help="Optional path to a JSON formatted RO metadata file")

    ro_manifest_generate_arg = "--ro-manifest-generate"
    standard_args.add_argument(
        ro_manifest_generate_arg,
        choices=['overwrite', 'update'],
        help=
        "Automatically generate a basic RO metadata manifest.json file by introspecting a bag's metadata and "
        "structure.")

    remote_file_manifest_arg = "--remote-file-manifest"
    standard_args.add_argument(
        remote_file_manifest_arg,
        metavar='<file>',
        help=
        "Optional path to a JSON formatted remote file manifest configuration file used to add remote file entries"
        " to the bag manifest(s) and create the bag fetch.txt file.")

    standard_args.add_argument('--quiet',
                               action="store_true",
                               help="Suppress logging output.")

    standard_args.add_argument('--debug',
                               action="store_true",
                               help="Enable debug logging output.")

    standard_args.add_argument(
        'path',
        metavar="<path>",
        help="Path to a bag directory or bag archive file.")

    metadata_args = parser.add_argument_group('Bag metadata arguments')
    headers = list(bagit.STANDARD_BAG_INFO_HEADERS)
    headers.append("Contact-Orcid")
    for header in sorted(headers):
        metadata_args.add_argument('--%s' % header.lower(),
                                   action=AddMetadataAction)

    args = parser.parse_args()

    bdb.configure_logging(level=logging.ERROR if args.quiet else (
        logging.DEBUG if args.debug else logging.INFO))

    path = os.path.abspath(args.path)
    if not os.path.exists(path):
        sys.stderr.write("Error: file or directory not found: %s\n\n" % path)
        sys.exit(2)

    is_file = os.path.isfile(path)
    if args.archiver and is_file:
        sys.stderr.write(
            "Error: A bag archive cannot be created from an existing bag archive.\n\n"
        )
        sys.exit(2)

    if args.checksum and is_file:
        sys.stderr.write(
            "Error: A checksum manifest cannot be added to an existing bag archive. "
            "The bag must be extracted, updated, and re-archived.\n\n")
        sys.exit(2)

    if args.update and is_file:
        sys.stderr.write(
            "Error: An existing bag archive cannot be updated in-place. "
            "The bag must first be extracted and then updated.\n\n")
        sys.exit(2)

    if args.revert and is_file:
        sys.stderr.write(
            "Error: An existing bag archive cannot be reverted in-place. "
            "The bag must first be extracted and then reverted.\n\n")
        sys.exit(2)

    if args.fetch_filter and not args.resolve_fetch:
        sys.stderr.write(
            "Error: The %s argument can only be used with the %s argument.\n\n"
            % (fetch_filter_arg, fetch_arg))
        sys.exit(2)

    if args.resolve_fetch and is_file:
        sys.stderr.write(
            "Error: It is not possible to resolve remote files directly into a bag archive. "
            "The bag must first be extracted before the %s argument can be specified.\n\n"
            % fetch_arg)
        sys.exit(2)

    if args.update and args.resolve_fetch:
        sys.stderr.write(
            "Error: The %s argument is not compatible with the %s argument.\n\n"
            % (update_arg, fetch_arg))
        sys.exit(2)

    if args.remote_file_manifest and args.resolve_fetch:
        sys.stderr.write(
            "Error: The %s argument is not compatible with the %s argument.\n\n"
            % (remote_file_manifest_arg, fetch_arg))
        sys.exit(2)

    is_bag = bdb.is_bag(path)
    if args.checksum and not args.update and is_bag:
        sys.stderr.write(
            "Error: Specifying %s for an existing bag requires the %s argument in order "
            "to apply any changes.\n\n" % (checksum_arg, update_arg))
        sys.exit(2)

    if args.remote_file_manifest and not args.update and is_bag:
        sys.stderr.write(
            "Error: Specifying %s for an existing bag requires the %s argument in order "
            "to apply any changes.\n\n" %
            (remote_file_manifest_arg, update_arg))
        sys.exit(2)

    if args.metadata_file and not args.update and is_bag:
        sys.stderr.write(
            "Error: Specifying %s for an existing bag requires the %s argument in order "
            "to apply any changes.\n\n" % (metadata_file_arg, update_arg))
        sys.exit(2)

    if args.ro_metadata_file and not args.update and is_bag:
        sys.stderr.write(
            "Error: Specifying %s for an existing bag requires the %s argument in order "
            "to apply any changes.\n\n" % (ro_metadata_file_arg, update_arg))
        sys.exit(2)

    if args.prune_manifests and not args.update and is_bag:
        sys.stderr.write(
            "Error: Specifying %s for an existing bag requires the %s argument in order "
            "to apply any changes.\n\n" % (prune_manifests_arg, update_arg))
        sys.exit(2)

    if args.skip_manifests and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s requires the %s argument.\n\n" %
                         (skip_manifests_arg, update_arg))
        sys.exit(2)

    if BAG_METADATA and not args.update and is_bag:
        sys.stderr.write(
            "Error: Adding or modifying metadata %s for an existing bag requires the %s argument "
            "in order to apply any changes.\n\n" % (BAG_METADATA, update_arg))
        sys.exit(2)

    if args.revert and not is_bag:
        sys.stderr.write(
            "Error: The directory %s is not a bag and therefore cannot be reverted.\n\n"
            % path)
        sys.exit(2)

    if args.revert and args.update and is_bag:
        sys.stderr.write(
            "Error: The %s argument is not compatible with the %s argument.\n\n"
            % (revert_arg, update_arg))
        sys.exit(2)

    return args, is_bag, is_file
Example #15
def locate_bag(root_dir):
    for dir_path, _subdirs, _files in os.walk(root_dir):
        if bdbag_api.is_bag(dir_path):
            return dir_path
Example #16
    def restore(self, **kwargs):
        """
        Perform the catalog restore operation. The restore process is broken up into six phases:

        1. Pre-process the input path.
            - If the input path is a file, it is assumed that it is a compressed archive file that can be extracted
            into an input directory via a supported codec: `tar`, `tgz`, `bz2`, or `zip`.
            - If the input directory is a valid _bag_ directory structure, the bag will be materialized.
        2. The catalog schema will be restored first. The schema is restored from an ERMrest JSON schema document file.
            The schema document file must be named `catalog-schema.json` and must appear at the root of the input
            directory. The restore process can be configured to exclude an enumerated set of both schemas and tables
            from restoration.
        3. The catalog table data will be restored, if present. The table data restoration process is resilient to
            interruption and may be restarted. However, if the catalog schema or data is mutated outside of the scope of
            the restore function in between such restarts, the restored catalog's consistency cannot be guaranteed.
            The restore process can be configured to exclude the restoration of table data for a set of tables.
        4. The catalog foreign keys will be restored.
        5. The catalog assets will be restored, if present.
        6. On success, the restore state marker annotations will be deleted and the catalog history will be truncated.

        :param kwargs:
        :return:
        """
        success = True
        start = datetime.datetime.now()

        # pre-process input
        logging.info("Processing input path: %s" % self.input_path)
        is_file, is_dir, is_uri = bdb.inspect_path(self.input_path)
        if not (is_file or is_dir or is_uri):
            raise DerivaRestoreError(
                "Invalid input path [%s]. If the specified input path refers to a locally mounted "
                "file or directory, it does not exist or cannot be accessed. If the specified "
                "path is a URI, the scheme component of the URI could not be determined."
                % self.input_path)
        if is_file or is_dir:
            self.input_path = os.path.abspath(self.input_path)
        if is_file:
            logging.info(
                "The input path [%s] is a file. Assuming input file is a directory archive and extracting..."
                % self.input_path)
            self.input_path = bdb.extract_bag(self.input_path)

        try:
            if not self.no_bag_materialize:
                self.input_path = bdb.materialize(self.input_path)
        except bdb.bdbagit.BagValidationError as e:
            if self.strict_bag_validation:
                raise DerivaRestoreError(format_exception(e))
            else:
                logging.warning(
                    "Input bag validation failed and strict validation mode is disabled. %s"
                    % format_exception(e))
        is_bag = bdb.is_bag(self.input_path)

        src_schema_file = os.path.abspath(
            os.path.join(self.input_path, "data" if is_bag else "",
                         "catalog-schema.json"))
        # the src_catalog_stub created below will never be "connected" in any kind of network sense,
        # but we need an instance of ErmrestCatalog in order to get a working Model from the schema file.
        src_catalog_stub = ErmrestCatalog("file", src_schema_file, "1")
        src_model = Model.fromfile(src_catalog_stub, src_schema_file)

        # initialize/connect to destination catalog
        if not self.catalog_id:
            self.catalog_id = self.server.create_ermrest_catalog().catalog_id
            self.server_args["catalog_id"] = self.catalog_id
            logging.info("Created new target catalog with ID: %s" %
                         self.catalog_id)
        self.dst_catalog = self.server.connect_ermrest(self.catalog_id)

        # init dcctx cid to a default
        self.dst_catalog.dcctx['cid'] = self.__class__.__name__

        # build up the model content we will copy to destination
        dst_model = self.dst_catalog.getCatalogModel()

        logging.info("Restoring %s to catalog: %s" %
                     (self.input_path, self.dst_catalog.get_server_uri()))
        # set top-level config right away and find fatal usage errors...
        if self.restore_policy:
            logging.info("Restoring top-level catalog ACLs...")
            if not src_model.acls:
                logging.info("Source schema does not contain any ACLs.")
            else:
                src_model.acls.owner.extend(dst_model.acls.owner)
                self.dst_catalog.put('/acl', json=src_model.acls)

        if self.restore_annotations:
            logging.info("Restoring top-level catalog annotations...")
            self.dst_catalog.put('/annotation', json=src_model.annotations)

        # build up the model content we will copy to destination
        dst_model = self.dst_catalog.getCatalogModel()

        new_model = []
        new_columns = []  # ERMrest does not currently allow bulk column creation
        new_keys = []  # ERMrest does not currently allow bulk key creation
        restore_states = {}
        fkeys_deferred = {}
        exclude_schemas = [] if self.exclude_schemas is None else self.exclude_schemas

        try:
            for sname, schema in src_model.schemas.items():
                if sname in exclude_schemas:
                    continue
                if sname not in dst_model.schemas:
                    new_model.append(self.copy_sdef(schema))

                for tname, table in schema.tables.items():
                    if table.kind != 'table':
                        logging.warning('Skipping restore of %s %s:%s' %
                                        (table.kind, sname, tname))
                        continue

                    if 'RID' not in table.column_definitions.elements:
                        raise DerivaRestoreError(
                            "Source table %s.%s lacks system-columns and cannot be restored."
                            % (sname, tname))

                    # make sure the source table is pruned of any existing restore state markers
                    if table.annotations.get(CLONE_STATE_URL) is not None:
                        del table.annotations[CLONE_STATE_URL]
                    if table.annotations.get(
                            self.RESTORE_STATE_URL) is not None:
                        del table.annotations[self.RESTORE_STATE_URL]

                    if sname not in dst_model.schemas or tname not in dst_model.schemas[
                            sname].tables:
                        new_model.append(self.copy_tdef_core(table))
                        restore_states[(
                            sname, tname)] = 1 if self.restore_data else None
                        fkeys_deferred[(sname,
                                        tname)] = self.copy_tdef_fkeys(table)
                    else:
                        src_columns = {
                            c.name: c
                            for c in table.column_definitions
                        }
                        dst_columns = {
                            c.name: c
                            for c in dst_model.schemas[sname].tables[tname].
                            column_definitions
                        }

                        for cname in src_columns:
                            if cname not in dst_columns:
                                new_columns.append(
                                    self.copy_cdef(src_columns[cname]))
                            else:
                                self.check_column_compatibility(
                                    src_columns[cname], dst_columns[cname])

                        for cname in dst_columns:
                            if cname not in src_columns:
                                raise DerivaRestoreError(
                                    "Destination column %s.%s.%s does not exist in source catalog."
                                    % (sname, tname, cname))

                        src_keys = {
                            tuple(sorted(c.name
                                         for c in key.unique_columns)): key
                            for key in table.keys
                        }
                        dst_keys = {
                            tuple(sorted(c.name
                                         for c in key.unique_columns)): key
                            for key in
                            dst_model.schemas[sname].tables[tname].keys
                        }

                        for utuple in src_keys:
                            if utuple not in dst_keys:
                                new_keys.append(
                                    self.copy_kdef(src_keys[utuple]))

                        for utuple in dst_keys:
                            if utuple not in src_keys:
                                raise DerivaRestoreError(
                                    "Destination key %s.%s(%s) does not exist in source catalog."
                                    % (sname, tname, ', '.join(utuple)))

                        restore_states[(sname, tname)] = \
                            dst_model.schemas[sname].tables[tname].annotations.get(self.RESTORE_STATE_URL)
                        if dst_model.schemas[sname].tables[tname].foreign_keys:
                            # assume that presence of any destination foreign keys means we already completed
                            if self.restore_assets:
                                self.upload_assets()
                            return
                        else:
                            fkeys_deferred[(
                                sname, tname)] = self.copy_tdef_fkeys(table)

            # apply the stage 1 model to the destination in bulk
            logging.info("Restoring catalog schema...")
            if new_model:
                self.dst_catalog.post("/schema",
                                      json=new_model).raise_for_status()

            for sname, tname, cdef in new_columns:
                self.dst_catalog.post("/schema/%s/table/%s/column" %
                                      (urlquote(sname), urlquote(tname)),
                                      json=cdef).raise_for_status()

            for sname, tname, kdef in new_keys:
                self.dst_catalog.post("/schema/%s/table/%s/key" %
                                      (urlquote(sname), urlquote(tname)),
                                      json=kdef).raise_for_status()

            # copy data in stage 2
            if self.restore_data:
                logging.info("Restoring catalog data...")
                for sname, tname in restore_states.keys():
                    tname_uri = "%s:%s" % (urlquote(sname), urlquote(tname))
                    if restore_states[(sname, tname)] == 1:
                        # determine current position in (partial?) copy
                        row = self.dst_catalog.get(
                            "/entity/%s@sort(RID::desc::)?limit=1" %
                            tname_uri).json()
                        if row:
                            last = row[0]['RID']
                            logging.info(
                                "Existing data detected in table [%s] -- will attempt partial restore of "
                                "remaining records following last known RID: %s"
                                % (tname_uri, last))
                        else:
                            last = None

                        table = self.get_json_recordset(
                            self.open_json_stream_file(
                                self.get_table_path(sname, tname, is_bag)),
                            self.data_chunk_size,
                            after=last)

                        total = 0
                        table_success = True
                        try:
                            for chunk in table:
                                if chunk:
                                    self.dst_catalog.post(
                                        "/entity/%s?nondefaults=RID,RCT,RCB" %
                                        tname_uri,
                                        json=chunk)
                                    total += len(chunk)
                                else:
                                    break
                        except:
                            table_success = False
                        finally:
                            table.close()
                            if table_success:
                                logging.info(
                                    "Restoration of table data [%s] successful. %s rows restored."
                                    % (tname_uri, total))
                            else:
                                logging.warning(
                                    "Restoration of table data [%s] failed. %s rows restored."
                                    % (tname_uri, total))

                        # record our progress on catalog in case we fail part way through
                        self.dst_catalog.put(
                            "/schema/%s/table/%s/annotation/%s" % (
                                urlquote(sname),
                                urlquote(tname),
                                urlquote(self.RESTORE_STATE_URL),
                            ),
                            json=2)
                    elif restore_states[(sname, tname)] is None and (
                            sname, tname) in {
                                ('public', 'ERMrest_Client'),
                                ('public', 'ERMrest_Group'),
                            }:
                        # special sync behavior for magic ermrest tables
                        # HACK: these are assumed small enough to join via local merge of arrays
                        want = sorted(self.load_json_file(
                            self.get_table_path(sname, tname, is_bag)),
                                      key=lambda r: r['ID'])
                        have = sorted(self.dst_catalog.get(
                            "/entity/%s?limit=none" % tname_uri).json(),
                                      key=lambda r: r['ID'])
                        create = []
                        update = []

                        pos_want = 0
                        pos_have = 0
                        while pos_want < len(want):
                            while pos_have < len(have) and have[pos_have][
                                    'ID'] < want[pos_want]['ID']:
                                # dst-only rows will be retained as is
                                pos_have += 1
                            if pos_have >= len(have) or have[pos_have][
                                    'ID'] > want[pos_want]['ID']:
                                # src-only rows will be inserted
                                create.append(want[pos_want])
                                pos_want += 1
                            else:
                                # overlapping rows will be updated
                                update.append(want[pos_want])
                                pos_want += 1

                        self.dst_catalog.post(
                            "/entity/%s?nondefaults=RCT,RCB" % tname_uri,
                            json=create)
                        self.dst_catalog.put(
                            "/attributegroup/%s/ID;%s" % (tname_uri, ",".join([
                                urlquote(c.name) for c in src_model.
                                schemas[sname].tables[tname].column_definitions
                                if c.name not in {'RID', 'RMT', 'RMB', 'ID'}
                            ])),
                            json=update)

                        # record our progress on catalog in case we fail part way through
                        self.dst_catalog.put(
                            "/schema/%s/table/%s/annotation/%s" % (
                                urlquote(sname),
                                urlquote(tname),
                                urlquote(self.RESTORE_STATE_URL),
                            ),
                            json=2)

            # apply stage 2 model in bulk only... we won't get here unless preceding succeeded
            logging.info("Restoring foreign keys...")
            new_fkeys = []
            for fkeys in fkeys_deferred.values():
                new_fkeys.extend(fkeys)

            # restore fkeys
            if new_fkeys:
                self.dst_catalog.post("/schema", json=new_fkeys)

            # restore assets
            if self.restore_assets:
                self.upload_assets()

            # cleanup
            self.cleanup_restored_catalog()
        except:
            success = False
            raise
        finally:
            elapsed_time = datetime.datetime.now() - start
            total_secs = elapsed_time.total_seconds()
            elapsed = time.strftime('%H:%M:%S', time.gmtime(total_secs))
            logging.info("Restore of catalog %s %s. %s" %
                         (self.dst_catalog.get_server_uri(),
                          "completed successfully" if success else "failed",
                          ("Elapsed time: %s" % elapsed) if
                          (total_secs > 0) else ""))
Example #17
    def start_deriva_flow(self,
                          data_path,
                          dcc_id,
                          catalog_id=None,
                          schema=None,
                          server=None,
                          dataset_acls=None,
                          output_dir=None,
                          delete_dir=False,
                          handle_git_repos=True,
                          dry_run=False,
                          test_sub=False,
                          verbose=False,
                          **kwargs):
        """Start the Globus Automate Flow to ingest CFDE data into DERIVA.

        Arguments:
            data_path (str): The path to the data to ingest into DERIVA. The path can be:
                    1) A directory to be formatted into a BDBag
                    2) A Git repository to be copied into a BDBag
                    3) A premade BDBag directory
                    4) A premade BDBag in an archive file
            dcc_id (str): The CFDE-recognized DCC ID for this submission.
            catalog_id (int or str): The ID of the DERIVA catalog to ingest into.
                    Default None, to create a new catalog.
            schema (str): The named schema or schema file link to validate data against.
                    Default None, to only validate against the declared TableSchema.
            server (str): The DERIVA server to ingest to.
                    Default None, to use the Action Provider-set default.
            dataset_acls (dict): The DERIVA ACL(s) to set on the final dataset.
                    Default None, to use the CFDE default ACLs.
            output_dir (str): The path to create an output directory in. The resulting
                    BDBag archive will be named after this directory.
                    If not set, the directory will be turned into a BDBag in-place.
                    For Git repositories, this is automatically set, but can be overridden.
                    If data_path is a file, this has no effect.
                    This dir MUST NOT be in the `data_path` directory or any subdirectories.
                    Default None.
            delete_dir (bool): Should the output_dir be deleted after submission?
                    Has no effect if output_dir is not specified.
                    For Git repositories, this is always True.
                    Default False.
            handle_git_repos (bool): Should Git repositories be detected and handled?
                    When this is False, Git repositories are handled as simple directories
                    instead of Git repositories.
                    Default True.
            dry_run (bool): Should the data be validated and bagged without starting the Flow?
                    When True, does not ingest into DERIVA or start the Globus Automate Flow,
                    and the return value will not have valid DERIVA Flow information.
                    Default False.
            test_sub (bool): Should the submission be run in "test mode" where
                    the submission will be ingested into DERIVA and immediately deleted?
                    When True, the data will not remain in DERIVA to be viewed and the
                    Flow will terminate before any curation step.
            verbose (bool): Should intermediate status messages be printed out?
                    Default False.

        Keyword Arguments:
            force_http (bool): Should the data be sent using HTTP instead of Globus Transfer,
                    even if Globus Transfer is available? Because Globus Transfer is more
                    robust than HTTP, it is highly recommended to leave this False.
                    Default False.

        Other keyword arguments are passed directly to the ``make_bag()`` function of the
        BDBag API (see https://github.com/fair-research/bdbag for details).
        """
        if verbose:
            print("Startup: Validating input")
        data_path = os.path.abspath(data_path)
        if not os.path.exists(data_path):
            raise FileNotFoundError(
                "Path '{}' does not exist".format(data_path))

        if catalog_id in self.catalogs.keys():
            if schema:
                raise ValueError(
                    "You may not specify a schema ('{}') when ingesting to "
                    "a named catalog ('{}'). Retry without specifying "
                    "a schema.".format(schema, catalog_id))
            schema = self.catalogs[catalog_id]
        # Pull out known kwargs
        force_http = kwargs.pop("force_http", False)

        if handle_git_repos:
            if verbose:
                print("Checking for a Git repository")
            # If Git repo, set output_dir appropriately
            try:
                repo = git.Repo(data_path, search_parent_directories=True)
            # Not Git repo
            except git.InvalidGitRepositoryError:
                if verbose:
                    print("Not a Git repo")
            # Path not found, turn into standard FileNotFoundError
            except git.NoSuchPathError:
                raise FileNotFoundError(
                    "Path '{}' does not exist".format(data_path))
            # Is Git repo
            else:
                if verbose:
                    print("Git repo found, collecting metadata")
                # Needs to not have slash at end - is known Git repo already, slash
                # interferes with os.path.basename/dirname
                if data_path.endswith("/"):
                    data_path = data_path[:-1]
                # Set output_dir to new dir named with HEAD commit hash
                new_dir_name = "{}_{}".format(os.path.basename(data_path),
                                              str(repo.head.commit))
                output_dir = os.path.join(os.path.dirname(data_path),
                                          new_dir_name)
                # Delete temp dir after archival
                delete_dir = True

        # If dir and not already BDBag, make BDBag
        if os.path.isdir(data_path) and not bdbag_api.is_bag(data_path):
            if verbose:
                print("Creating BDBag out of directory '{}'".format(data_path))
            # If output_dir specified, copy data to output dir first
            if output_dir:
                if verbose:
                    print("Copying data to '{}' before creating BDBag".format(
                        output_dir))
                output_dir = os.path.abspath(output_dir)
                # If shutil.copytree is called when the destination dir is inside the source dir
                # by more than one layer, it will recurse infinitely.
                # (e.g. /source => /source/dir/dest)
                # Exactly one layer is technically okay (e.g. /source => /source/dest),
                # but it's easier to forbid all parent/child dir cases.
                # Check for this error condition by determining if output_dir is a child
                # of data_path.
                if os.path.commonpath([data_path]) == os.path.commonpath(
                    [data_path, output_dir]):
                    raise ValueError(
                        "The output_dir ('{}') must not be in data_path ('{}')"
                        .format(output_dir, data_path))
                try:
                    shutil.copytree(data_path, output_dir)
                except FileExistsError:
                    raise FileExistsError(
                        ("The output directory must not exist. "
                         "Delete '{}' to submit.\nYou can set delete_dir=True "
                         "to avoid this issue in the future."
                         ).format(output_dir))
                # Process new dir instead of old path
                data_path = output_dir
            # If output_dir not specified, never delete data dir
            else:
                delete_dir = False
            # Make bag
            bdbag_api.make_bag(data_path, **kwargs)
            if not bdbag_api.is_bag(data_path):
                raise ValueError(
                    "Failed to create BDBag from {}".format(data_path))
            elif verbose:
                print("BDBag created at '{}'".format(data_path))

        # If dir (must be BDBag at this point), archive
        if os.path.isdir(data_path):
            if verbose:
                print("Archiving BDBag at '{}' using '{}'".format(
                    data_path, CONFIG["ARCHIVE_FORMAT"]))
            new_data_path = bdbag_api.archive_bag(data_path,
                                                  CONFIG["ARCHIVE_FORMAT"])
            if verbose:
                print("BDBag archived to file '{}'".format(new_data_path))
            # If requested (e.g. Git repo copied dir), delete data dir
            if delete_dir:
                if verbose:
                    print("Removing old directory '{}'".format(data_path))
                shutil.rmtree(data_path)
            # Overwrite data_path - the unarchived directory is no longer needed for uploading
            data_path = new_data_path

        # Validate TableSchema in BDBag
        if verbose:
            print("Validating TableSchema in BDBag '{}'".format(data_path))
        validation_res = ts_validate(data_path, schema=schema)
        if not validation_res["is_valid"]:
            return {
                "success": False,
                "error": "TableSchema invalid due to the following errors: \n{}\n".format(
                    validation_res["error"])
            }
        elif verbose:
            print("Validation successful")

        # The BDBag is now an archive file
        # Build the path on the destination endpoint
        dest_path = "{}{}".format(self.flow_info["cfde_ep_path"],
                                  os.path.basename(data_path))

        # If doing dry run, stop here before making Flow input
        if dry_run:
            return {
                "success": True,
                "message": "Dry run validated successfully. No data was transferred."
            }

        # Set up Flow
        if verbose:
            print("Creating input for Flow")
        # If a local endpoint exists (and force_http is not set), Transfer can be used
        # The local endpoint is looked up now in case Globus Connect Personal was
        # started after Client creation
        local_endpoint = globus_sdk.LocalGlobusConnectPersonal().endpoint_id
        if local_endpoint and not force_http:
            if verbose:
                print(
                    "Using local Globus Connect Personal Endpoint '{}'".format(
                        local_endpoint))
            # Populate Transfer fields in Flow
            flow_id = self.flow_info["flow_id"]
            flow_input = {
                "source_endpoint_id": local_endpoint,
                "source_path": data_path,
                "cfde_ep_id": self.flow_info["cfde_ep_id"],
                "cfde_ep_path": dest_path,
                "cfde_ep_url": self.flow_info["cfde_ep_url"],
                "is_directory": False,
                "test_sub": test_sub,
                "dcc_id": dcc_id
            }
            if catalog_id:
                flow_input["catalog_id"] = str(catalog_id)
            if server:
                flow_input["server"] = server
        # Otherwise, we must PUT the BDBag on the server
        else:
            if verbose:
                print("No Globus Endpoint detected; using HTTP upload instead")
            headers = {}
            self.__https_authorizer.set_authorization_header(headers)
            data_url = "{}{}".format(self.flow_info["cfde_ep_url"], dest_path)

            with open(data_path, 'rb') as bag_file:
                bag_data = bag_file.read()

            put_res = requests.put(data_url, data=bag_data, headers=headers)

            # Regenerate headers on 401
            if put_res.status_code == 401:
                self.__https_authorizer.handle_missing_authorization()
                self.__https_authorizer.set_authorization_header(headers)
                put_res = requests.put(data_url,
                                       data=bag_data,
                                       headers=headers)

            # Error message on failed PUT or any unexpected response
            if put_res.status_code >= 300:
                return {
                    "success": False,
                    "error": "Could not upload BDBag to server (error {}):\n{}".format(
                        put_res.status_code, put_res.content)
                }
            elif put_res.status_code != 200:
                print(
                    "Warning: HTTP upload returned status code {}, which was unexpected."
                    .format(put_res.status_code))

            if verbose:
                print("Upload successful to '{}': {} {}".format(
                    data_url, put_res.status_code, put_res.content))

            flow_id = self.flow_info["flow_id"]
            flow_input = {
                "source_endpoint_id": False,
                "data_url": data_url,
                "test_sub": test_sub,
                "dcc_id": dcc_id
            }
            if catalog_id:
                flow_input["catalog_id"] = str(catalog_id)
            if server:
                flow_input["server"] = server

        if verbose:
            print("Flow input populated:\n{}".format(
                json.dumps(flow_input, indent=4, sort_keys=True)))
        # Get Flow scope
        flow_def = self.flow_client.get_flow(flow_id)
        flow_scope = flow_def["globus_auth_scope"]
        # Start Flow
        if verbose:
            print("Starting Flow - Submitting data")
        try:
            flow_res = self.flow_client.run_flow(flow_id, flow_scope,
                                                 flow_input)
        except globus_sdk.GlobusAPIError as e:
            if e.http_status == 404:
                return {
                    "success": False,
                    "error": ("Could not access ingest Flow. Are you in the CFDE DERIVA "
                              "Demo Globus Group? Check your membership or apply for access "
                              "here: https://app.globus.org/groups/a437abe3-c9a4-11e9-b441-"
                              "0efb3ba9a670/about")
                }
            else:
                raise
        self.last_flow_run = {
            "flow_id": flow_id,
            "flow_instance_id": flow_res["action_id"]
        }
        if verbose:
            print("Flow started successfully.")

        return {
            "success": True,
            "message": "Started DERIVA ingest Flow\nFlow ID: {}\nFlow Instance ID: {}".format(
                flow_id, flow_res["action_id"]),
            "flow_id": flow_id,
            "flow_instance_id": flow_res["action_id"],
            "cfde_dest_path": dest_path,
            "http_link": "{}{}".format(self.flow_info["cfde_ep_url"], dest_path),
            "globus_web_link": ("https://app.globus.org/file-manager?origin_id={}&origin_path={}"
                                .format(self.flow_info["cfde_ep_id"], os.path.dirname(dest_path)))
        }
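
For reference, the core BDBag steps performed by the submission method above can be reproduced directly with bdbag_api. A minimal sketch, assuming a local ./my_dataset directory exists and using "zip" as an illustrative archive format (the TableSchema validation and Flow submission steps are omitted):

from bdbag import bdbag_api

data_path = "./my_dataset"                 # assumed local data directory
if not bdbag_api.is_bag(data_path):
    bdbag_api.make_bag(data_path)          # turn the directory into a BDBag in place
archive_path = bdbag_api.archive_bag(data_path, "zip")  # format chosen for illustration
print("BDBag archived to", archive_path)
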
Example #18
0
def parse_cli():
    description = 'BD2K BDBag utility for working with Bagit/RO archives'

    parser = argparse.ArgumentParser(
        description=description, epilog="For more information see: http://github.com/ini-bdds/bdbag")
    standard_args = parser.add_argument_group('Standard arguments')

    update_arg = standard_args.add_argument(
        '--update', action="store_true",
        help="Update an existing bag dir, regenerating manifests and fetch.txt if necessary.")

    standard_args.add_argument(
        "--archiver", choices=['zip', 'tar', 'tgz'], help="Archive a bag using the specified format.")

    checksum_arg = standard_args.add_argument(
        "--checksum", action='append', choices=['md5', 'sha1', 'sha256', 'sha512', 'all'],
        help="Checksum algorithm to use: can be specified multiple times with different values. "
             "If \'all\' is specified, every supported checksum will be generated")

    skip_manifests_arg = standard_args.add_argument(
        "--skip-manifests", action='store_true',
        help=str("If \'skip-manifests\' is specified in conjunction with %s, only tagfile manifests will be "
                 "regenerated, with payload manifests and fetch.txt (if any) left as is. This argument should be used "
                 "when only bag metadata has changed." % update_arg.option_strings))

    prune_manifests_arg = standard_args.add_argument(
        "--prune-manifests", action='store_true',
        help="If specified, any existing checksum manifests not explicitly configured via either"
             " the \"checksum\" argument(s) or configuration file will be deleted from the bag during an update.")

    fetch_arg = standard_args.add_argument(
        '--resolve-fetch', choices=['all', 'missing'],
        help="Download remote files listed in the bag's fetch.txt file. "
             "The \"missing\" option only attempts to fetch files that do not "
             "already exist in the bag payload directory. "
             "The \"all\" option causes all fetch files to be re-acquired,"
             " even if they already exist in the bag payload directory.")

    standard_args.add_argument(
        '--validate', choices=['fast', 'full'],
        help="Validate a bag directory or bag archive. If \"fast\" is specified, Payload-Oxum (if present) will be "
             "used to check that the payload files are present and accounted for. Otherwise if \"full\" is specified, "
             "all checksums will be regenerated and compared to the corresponding entries in the manifest")

    standard_args.add_argument(
        '--validate-profile', action="store_true",
        help="Validate a bag against the profile specified by the bag's "
             "\"BagIt-Profile-Identifier\" metadata field, if present.")

    standard_args.add_argument(
        '--config-file', default=DEFAULT_CONFIG_FILE, metavar='<file>',
        help="Optional path to a configuration file. If this argument is not specified, the configuration file "
             "defaults to: %s " % DEFAULT_CONFIG_FILE)

    metadata_file_arg = standard_args.add_argument(
        '--metadata-file', metavar='<file>', help="Optional path to a JSON formatted metadata file")

    remote_file_manifest_arg = standard_args.add_argument(
        '--remote-file-manifest', metavar='<file>',
        help="Optional path to a JSON formatted remote file manifest configuration file used to add remote file entries"
             " to the bag manifest(s) and create the bag fetch.txt file.")

    standard_args.add_argument(
        '--quiet', action="store_true", help="Suppress logging output.")

    standard_args.add_argument(
        '--debug', action="store_true", help="Enable debug logging output.")

    standard_args.add_argument(
        '--bag-path', metavar="<path>", required=True,
        help="Path to a bag directory or bag archive file.")

    metadata_args = parser.add_argument_group('Bag metadata arguments')
    for header in bagit.STANDARD_BAG_INFO_HEADERS:
        metadata_args.add_argument('--%s' % header.lower(), action=AddMetadataAction)

    args = parser.parse_args()

    bdb.configure_logging(level=logging.ERROR if args.quiet else (logging.DEBUG if args.debug else logging.INFO))

    path = os.path.abspath(args.bag_path)
    if not os.path.exists(path):
        sys.stderr.write("Error: file or directory not found: %s\n\n" % path)
        sys.exit(2)

    is_file = os.path.isfile(path)
    if args.archiver and is_file:
        sys.stderr.write("Error: A bag archive cannot be created from an existing bag archive.\n\n")
        sys.exit(2)

    if args.checksum and is_file:
        sys.stderr.write("Error: A checksum manifest cannot be added to an existing bag archive. "
                         "The bag must be extracted, updated, and re-archived.\n\n")
        sys.exit(2)

    if args.update and is_file:
        sys.stderr.write("Error: An existing bag archive cannot be updated in-place. "
                         "The bag must first be extracted and then updated.\n\n")
        sys.exit(2)

    if args.update and args.resolve_fetch:
        sys.stderr.write("Error: The %s argument is not compatible with the %s argument.\n\n" %
                         (update_arg.option_strings, fetch_arg.option_strings))
        sys.exit(2)

    if args.remote_file_manifest and args.resolve_fetch:
        sys.stderr.write("Error: The %s argument is not compatible with the %s argument.\n\n" %
                         (remote_file_manifest_arg.option_strings, fetch_arg.option_strings))
        sys.exit(2)

    is_bag = bdb.is_bag(path)
    if args.checksum and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s for an existing bag requires the %s argument in order "
                         "to apply any changes.\n\n" % (checksum_arg.option_strings, update_arg.option_strings))
        sys.exit(2)

    if args.remote_file_manifest and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s for an existing bag requires the %s argument in order "
                         "to apply any changes.\n\n" % (remote_file_manifest_arg.option_strings, update_arg.option_strings))
        sys.exit(2)

    if args.metadata_file and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s for an existing bag requires the %s argument in order "
                         "to apply any changes.\n\n" % (metadata_file_arg.option_strings, update_arg.option_strings))
        sys.exit(2)

    if args.prune_manifests and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s for an existing bag requires the %s argument in order "
                         "to apply any changes.\n\n" % (prune_manifests_arg.option_strings, update_arg.option_strings))
        sys.exit(2)

    if args.skip_manifests and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s requires the %s argument.\n\n" %
                         (skip_manifests_arg.option_strings, update_arg.option_strings))
        sys.exit(2)

    if BAG_METADATA and not args.update and is_bag:
        sys.stderr.write("Error: Adding or modifying metadata %s for an existing bag requires the %s argument "
                         "in order to apply any changes.\n\n" % (BAG_METADATA, update_arg.option_strings))
        sys.exit(2)

    return args, is_bag, is_file
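
A minimal sketch of driving this parser programmatically, assuming the module above is importable (its helpers such as bdb, bagit, and DEFAULT_CONFIG_FILE are defined elsewhere in the example) and that the illustrative path points at an existing bag directory:

import sys

# Simulate a typical invocation: update an existing bag and regenerate md5 manifests.
sys.argv = ["bdbag", "--bag-path", "/path/to/existing/bag", "--update", "--checksum", "md5"]
args, is_bag, is_file = parse_cli()
print(args.update, args.checksum, is_bag, is_file)
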
Example #19
0
def main():

    sys.stderr.write('\n')

    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.bag_path)

    archive = None
    temp_path = None
    error = None
    result = 0

    try:
        if not is_file:
            # do not try to create or update the bag if the user just wants to validate or complete an existing bag
            if not ((args.validate or args.validate_profile or args.resolve_fetch) and
                    not (args.update and bdb.is_bag(path))):
                if args.checksum and 'all' in args.checksum:
                    args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
                # create or update the bag depending on the input arguments
                bdb.make_bag(path,
                             args.checksum,
                             args.update,
                             args.skip_manifests,
                             args.prune_manifests,
                             BAG_METADATA if BAG_METADATA else None,
                             args.metadata_file,
                             args.remote_file_manifest,
                             args.config_file)

        # otherwise just extract the bag if it is an archive and no other conflicting options specified
        elif not (args.validate or args.validate_profile or args.resolve_fetch):
            bdb.extract_bag(path)
            sys.stderr.write('\n')
            return result

        if args.resolve_fetch:
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(path, True if args.resolve_fetch == 'all' else False)

        if args.validate:
            if is_file:
                temp_path = bdb.extract_bag(path, temp=True)
            bdb.validate_bag(temp_path if temp_path else path,
                             True if args.validate == 'fast' else False,
                             args.config_file)

        if args.archiver:
            archive = bdb.archive_bag(path, args.archiver)

        if archive is None and is_file:
            archive = path

        if args.validate_profile:
            if is_file:
                if not temp_path:
                    temp_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(temp_path if temp_path else path)
            bdb.validate_bag_serialization(archive if archive else path, profile)

    except Exception as e:
        result = 1
        error = "Error: %s" % bdbag.get_named_exception(e)

    finally:
        if temp_path:
            bdb.cleanup_bag(os.path.dirname(temp_path))
        if result != 0:
            sys.stderr.write("\n%s" % error)

    sys.stderr.write('\n')

    return result
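
The functions above leave out an entry point; a conventional guard, assuming parse_cli and main live in the same module, would be:

if __name__ == "__main__":
    sys.exit(main())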