Exemplo n.º 1
0
def main(argv):
    """Create a BDBag whose payload is a set of Minid-referenced remote items.

    Parses the command line, creates the bag directory, writes a README,
    generates a temporary remote-file-manifest, builds the bag from it, adds
    Research Object metadata, refreshes the bag's checksums and, when
    ``--verify`` is given, fetches the remote content and validates the bag.

    Args:
        argv: Accepted for the caller's convenience but not consulted:
            ``parse_args()`` is called without arguments and therefore reads
            ``sys.argv[1:]``.
    """
    # Local import: only needed here, for removing the temporary directory.
    import shutil

    parser = argparse.ArgumentParser(description='Program to create a BDBag containing a set of Minids for remote content')
    parser.add_argument('-m', '--minids', metavar='<minid file>',
                        help='File listing Minids for new bag', required=True)
    parser.add_argument('-b', '--bagname', metavar='<bag name>',
                        help='Name of directory for new bag.', required=True)
    parser.add_argument('-v', '--verify', action='store_true',
                        help='Validate bag after building it.', required=False)
    parser.add_argument('-q', '--quiet', action="store_true", help="Suppress logging output.")
    parser.add_argument('-d', '--debug', action="store_true", help="Enable debug logging output.")
    parser.add_argument('-n', '--author-name', metavar="<person or entity name>",
        help="Optional name of the person or entity responsible for the creation of this bag, "
             "for inclusion in the bag metadata.")
    parser.add_argument('-o', '--author-orcid', metavar="<orcid>",
        help="Optional ORCID identifier of the bag creator, for inclusion in the bag metadata.")
    args = parser.parse_args()

    # --quiet wins over --debug; INFO is the default level.
    if args.quiet:
        log_level = logging.ERROR
    elif args.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    bdb.configure_logging(level=log_level)

    # Create the directory that will hold the new BDBag
    bdb.ensure_bag_path_exists(args.bagname)

    # For each supplied minid, fetch sub-bag to determine its properties
    minid_fields = extract_fields(args.minids)

    # Create 'README' file in the newly created bag directory. (moved to 'data' when bag is created)
    write_readme(args.bagname, minid_fields)

    # The remote-file-manifest is only an input to the first make_bag call, so
    # it lives in a scratch directory that is removed as soon as that call
    # returns (or fails) instead of being left behind.
    working_dir = tempfile.mkdtemp(prefix='encode2bag_')
    try:
        # Create remote_file_manifest_file, to be used by make_bag
        remote_file_manifest_file = osp.abspath(osp.join(working_dir, 'remote-file-manifest.json'))
        generate_remote_manifest_file(minid_fields, remote_file_manifest_file)

        # Create the new bag based on the supplied remote manifest file
        bdb.make_bag(args.bagname,
                     algs=['md5', 'sha256'],
                     remote_file_manifest=remote_file_manifest_file)
    finally:
        shutil.rmtree(working_dir, ignore_errors=True)

    # Create metadata/manifest.json file with Research Object JSON object
    ro_manifest = ro.init_ro_manifest(author_name=args.author_name, author_orcid=args.author_orcid,
        creator_name = 'bagofbags using BDBag version: %s (Bagit version: %s)' % (VERSION, BAGIT_VERSION),
        creator_uri='https://github.com/fair-research/bdbag/examples/bagofbags/')
    add_remote_file_manifest_to_ro(ro_manifest, minid_fields)
    ro.write_bag_ro_metadata(ro_manifest, args.bagname, 'manifest.json')

    # Run make_bag again to include manifest.json in the checksums etc.
    bdb.make_bag(args.bagname, update=True)

    if args.verify:
        # Fetch the remote content first so the full (non-fast) validation
        # has payload files to check.
        bdb.resolve_fetch(args.bagname, force=True)
        bdb.validate_bag(args.bagname, fast=False, callback=None)
Exemplo n.º 2
0
def generate_ro_manifest(bag_path,
                         overwrite=False,
                         config_file=DEFAULT_CONFIG_FILE):
    """Generate or update the Research Object manifest of an existing bag.

    If ``metadata/manifest.json`` already exists under ``bag_path`` and
    ``overwrite`` is false, the existing RO metadata is read and updated;
    otherwise a new manifest is initialized using the bag's ``Contact-Name``
    and ``Contact-Orcid`` bag-info values. An entry is then added (or updated)
    for every fetch entry and every local payload file, the metadata is
    written back, and the bag is saved.

    Args:
        bag_path: Path of the bag directory.
        overwrite: When true, discard an existing manifest and start fresh.
        config_file: Configuration file from which the identifier resolver
            list is read.
    """
    bag = bdbagit.BDBag(bag_path)
    bag_ro_metadata_path = os.path.abspath(
        os.path.join(bag_path, "metadata", "manifest.json"))
    exists = os.path.isfile(bag_ro_metadata_path)
    if exists and not overwrite:
        logger.info("Auto-generating RO manifest: update existing file.")
        ro_metadata = bdbro.read_bag_ro_metadata(bag_path)
    else:
        # Pick the action text first: '%' binds tighter than a conditional
        # expression, so formatting inline would drop the message prefix in
        # the overwrite case.
        action = "creating new file" if not exists else "overwrite existing file"
        logger.info("Auto-generating RO manifest: %s.", action)
        ro_metadata = bdbro.init_ro_manifest(
            author_name=bag.info.get("Contact-Name"),
            author_orcid=bag.info.get("Contact-Orcid"),
            creator_name=bdbro.BAG_CREATOR_NAME,
            creator_uri=bdbro.BAG_CREATOR_URI)

    config = read_config(config_file)
    resolvers = config.get(
        ID_RESOLVER_TAG,
        DEFAULT_ID_RESOLVERS) if config else DEFAULT_ID_RESOLVERS
    fetched = bag.fetch_entries()
    local = bag.payload_files()

    for url, length, filename in fetched:
        # minid:/ark: identifiers are rewritten as an HTTP URL on the first
        # configured resolver.
        if url.startswith(("minid:", "ark:")):
            url = "".join(["http://", resolvers[0], "/", url])
        bdbro.add_file_metadata(ro_metadata,
                                source_url=url,
                                bundled_as=bdbro.make_bundled_as(
                                    folder=os.path.dirname(filename),
                                    filename=os.path.basename(filename)),
                                update_existing=True)

    for path in local:
        # Backslashes are converted so the recorded path uses forward slashes.
        bdbro.add_file_metadata(ro_metadata,
                                local_path=path.replace("\\", "/"),
                                bundled_as=bdbro.make_bundled_as(),
                                update_existing=True)

    bdbro.write_bag_ro_metadata(ro_metadata, bag_path)
    # A bag declaring the plain BDBag profile is switched to the RO profile.
    profile = bag.info.get(BAG_PROFILE_TAG)
    if profile == BDBAG_PROFILE_ID:
        bag.info.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID})
    bag.save()
Exemplo n.º 3
0
    def download(self, **kwargs):
        """Run the configured download pipeline and return its outputs.

        Stages, in order: resolve the client identity, optionally initialize
        a bag, run the catalog query processors, run any transform
        processors, finalize (and optionally archive) the bag, then run any
        post processors.

        Keyword Args:
            identity: Client identity mapping. When absent or falsy it is
                looked up from the catalog's authn session.
            wallet: Passed unchanged to every processor; defaults to ``{}``.

        Returns:
            The ``outputs`` value of the last stage that ran. When a bag is
            created this is a single-entry dict keyed by the base name of the
            bag (or of its archive) mapping ``LOCAL_PATH_KEY`` to its path,
            unless post processors replace it.

        Raises:
            DerivaDownloadConfigurationError: If there is no configuration or
                it has no ``catalog`` section.
            DerivaDownloadAuthenticationError: If credential validation fails
                with anything other than an ``HTTPError``.
            Exception: Any processor or bag error is logged and re-raised.
        """
        if not self.config:
            raise DerivaDownloadConfigurationError(
                "No configuration specified!")

        if self.config.get("catalog") is None:
            raise DerivaDownloadConfigurationError(
                "Catalog configuration error!")

        ro_manifest = None
        ro_author_name = None
        ro_author_orcid = None
        # Per-invocation manifest path (uuid suffix) inside the output dir.
        remote_file_manifest = os.path.abspath(''.join([
            os.path.join(self.output_dir, 'remote-file-manifest_'),
            str(uuid.uuid4()), ".json"
        ]))

        catalog_config = self.config['catalog']
        self.envars.update(self.config.get('env', dict()))
        self.envars.update({"hostname": self.hostname})

        # 1. If we don't have a client identity, we need to authenticate
        identity = kwargs.get("identity")
        if not identity:
            try:
                if not self.credentials:
                    self.set_credentials(get_credential(self.hostname))
                logging.info("Validating credentials for host: %s" %
                             self.hostname)
                attributes = self.catalog.get_authn_session().json()
                identity = attributes["client"]
            except HTTPError as he:
                # HTTP errors are non-fatal: the download proceeds without an
                # identity. Anything other than "no session" is still logged
                # rather than discarded silently.
                if he.response.status_code == 404:
                    logging.info(
                        "No existing login session found for host: %s" %
                        self.hostname)
                else:
                    logging.warning(
                        "Unable to validate credentials for host: %s: %s" %
                        (self.hostname, format_exception(he)))
            except Exception as e:
                raise DerivaDownloadAuthenticationError(
                    "Unable to validate credentials: %s" % format_exception(e))
        wallet = kwargs.get("wallet", {})

        # 2. Check for bagging config and initialize bag related variables
        bag_path = None
        bag_archiver = None
        bag_algorithms = None
        bag_config = self.config.get('bag')
        create_bag = bool(bag_config)
        if create_bag:
            # The (default or configured) bag name is a format template that
            # is expanded with the environment variables.
            bag_name = bag_config.get(
                'bag_name', ''.join([
                    "deriva_bag", '_',
                    time.strftime("%Y-%m-%d_%H.%M.%S")
                ])).format(**self.envars)
            bag_path = os.path.abspath(os.path.join(self.output_dir, bag_name))
            bag_archiver = bag_config.get('bag_archiver')
            bag_algorithms = bag_config.get('bag_algorithms', ['sha256'])
            bag_metadata = bag_config.get(
                'bag_metadata',
                {"Internal-Sender-Identifier": "deriva@%s" % self.server_url})
            bag_ro = stob(bag_config.get('bag_ro', "True"))
            bdb.ensure_bag_path_exists(bag_path)
            bag = bdb.make_bag(bag_path,
                               algs=bag_algorithms,
                               metadata=bag_metadata)
            if bag_ro:
                # Author name: the bag's Contact-Name, else the first of
                # full_name / display_name / id from the identity, if any.
                ro_author_name = bag.info.get(
                    "Contact-Name", None if not identity else identity.get(
                        'full_name',
                        identity.get('display_name',
                                     identity.get('id', None))))
                ro_author_orcid = bag.info.get("Contact-Orcid")
                ro_manifest = ro.init_ro_manifest(
                    author_name=ro_author_name,
                    author_orcid=ro_author_orcid)
                bag_metadata.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID})

        # 3. Process the set of queries by locating, instantiating, and invoking the specified processor(s)
        outputs = dict()
        base_path = bag_path if bag_path else self.output_dir
        for processor_config in catalog_config['query_processors']:
            processor_name = processor_config["processor"]
            processor_type = processor_config.get('processor_type')
            processor_params = processor_config.get('processor_params')

            try:
                query_processor = find_query_processor(processor_name,
                                                       processor_type)
                # Each processor receives the previous stage's outputs.
                processor = query_processor(
                    self.envars,
                    inputs=outputs,
                    bag=create_bag,
                    catalog=self.catalog,
                    store=self.store,
                    base_path=base_path,
                    processor_params=processor_params,
                    remote_file_manifest=remote_file_manifest,
                    ro_manifest=ro_manifest,
                    ro_author_name=ro_author_name,
                    ro_author_orcid=ro_author_orcid,
                    identity=identity,
                    wallet=wallet)
                outputs = processor.process()
            except Exception as e:
                logging.error(format_exception(e))
                if create_bag:
                    bdb.cleanup_bag(bag_path)
                raise

        # 4. Execute anything in the transform processing pipeline, if configured
        transform_processors = self.config.get('transform_processors', [])
        if transform_processors:
            for processor_config in transform_processors:
                processor_name = processor_config["processor"]
                processor_type = processor_config.get('processor_type')
                processor_params = processor_config.get('processor_params')
                try:
                    transform_processor = find_transform_processor(
                        processor_name, processor_type)
                    processor = transform_processor(
                        self.envars,
                        inputs=outputs,
                        processor_params=processor_params,
                        base_path=base_path,
                        bag=create_bag,
                        ro_manifest=ro_manifest,
                        ro_author_name=ro_author_name,
                        ro_author_orcid=ro_author_orcid,
                        identity=identity,
                        wallet=wallet)
                    outputs = processor.process()
                except Exception as e:
                    logging.error(format_exception(e))
                    raise

        # 5. Create the bag, and archive (serialize) if necessary
        if create_bag:
            try:
                if ro_manifest:
                    ro.write_bag_ro_metadata(ro_manifest, bag_path)
                if not os.path.isfile(remote_file_manifest):
                    remote_file_manifest = None
                # Only hand a manifest to make_bag if it exists and is
                # non-empty.
                bdb.make_bag(
                    bag_path,
                    algs=bag_algorithms,
                    remote_file_manifest=remote_file_manifest if
                    (remote_file_manifest
                     and os.path.getsize(remote_file_manifest) > 0) else None,
                    update=True)
            except Exception as e:
                logging.fatal("Exception while updating bag manifests: %s" %
                              format_exception(e))
                bdb.cleanup_bag(bag_path)
                raise
            finally:
                # The manifest file is removed whether or not the update
                # succeeded.
                if remote_file_manifest and os.path.isfile(
                        remote_file_manifest):
                    os.remove(remote_file_manifest)

            logging.info('Created bag: %s' % bag_path)

            if bag_archiver is not None:
                try:
                    archive = bdb.archive_bag(bag_path, bag_archiver.lower())
                    # Once archived, the bag directory is cleaned up and
                    # only the archive is reported.
                    bdb.cleanup_bag(bag_path)
                    outputs = {
                        os.path.basename(archive): {
                            LOCAL_PATH_KEY: archive
                        }
                    }
                except Exception as e:
                    logging.error(
                        "Exception while creating data bag archive: %s" %
                        format_exception(e))
                    raise
            else:
                outputs = {
                    os.path.basename(bag_path): {
                        LOCAL_PATH_KEY: bag_path
                    }
                }

        # 6. Execute anything in the post processing pipeline, if configured
        post_processors = self.config.get('post_processors', [])
        if post_processors:
            for processor_config in post_processors:
                processor_name = processor_config["processor"]
                processor_type = processor_config.get('processor_type')
                processor_params = processor_config.get('processor_params')
                try:
                    post_processor = find_post_processor(
                        processor_name, processor_type)
                    processor = post_processor(
                        self.envars,
                        inputs=outputs,
                        processor_params=processor_params,
                        identity=identity,
                        wallet=wallet)
                    outputs = processor.process()
                except Exception as e:
                    logging.error(format_exception(e))
                    raise

        return outputs
Exemplo n.º 4
0
    def download(self, identity=None):
        """Run the configured catalog queries and return the resulting paths.

        Validates credentials when no identity is supplied, optionally
        initializes a bag, runs one download processor per configured query,
        and, when bagging is configured, finalizes the bag and optionally
        archives it.

        Args:
            identity: Client identity mapping. When falsy it is read from the
                catalog's authn session.

        Returns:
            ``[archive]`` when a bag was created and archived, ``[bag_path]``
            when a bag was created without archiving, otherwise the combined
            list of results returned by the download processors.

        Raises:
            RuntimeError: If the configuration is missing, has no ``catalog``
                section, or the credentials cannot be validated.
            Exception: Any processor or bag error is logged and re-raised.
        """
        if not self.config:
            raise RuntimeError("No configuration specified!")

        if self.config.get("catalog") is None:
            raise RuntimeError("Catalog configuration error!")

        if not identity:
            logging.info("Validating credentials")
            try:
                if not self.credentials:
                    self.setCredentials(get_credential(self.hostname))
                attributes = self.catalog.get_authn_session().json()
                identity = attributes["client"]
            except Exception as e:
                raise RuntimeError("Unable to validate credentials: %s" % format_exception(e))

        ro_manifest = None
        ro_author_name = None
        ro_author_orcid = None
        # Per-invocation manifest path (uuid suffix) inside the output dir.
        remote_file_manifest = os.path.abspath(
            ''.join([os.path.join(self.output_dir, 'remote-file-manifest_'), str(uuid.uuid4()), ".json"]))

        catalog_config = self.config['catalog']
        self.envars.update(self.config.get('env', dict()))

        bag_path = None
        bag_archiver = None
        bag_algorithms = None
        bag_config = self.config.get('bag')
        create_bag = bool(bag_config)
        if create_bag:
            bag_name = bag_config.get('bag_name', ''.join(["deriva_bag", '_', time.strftime("%Y-%m-%d_%H.%M.%S")]))
            bag_path = os.path.abspath(os.path.join(self.output_dir, bag_name))
            bag_archiver = bag_config.get('bag_archiver')
            bag_algorithms = bag_config.get('bag_algorithms', ['sha256'])
            bag_metadata = bag_config.get('bag_metadata', {"Internal-Sender-Identifier":
                                                           "deriva@%s" % self.server_url})
            bag_ro = stob(bag_config.get('bag_ro', "True"))
            bdb.ensure_bag_path_exists(bag_path)
            bag = bdb.make_bag(bag_path, algs=bag_algorithms, metadata=bag_metadata)
            if bag_ro:
                # Author name: the bag's Contact-Name, else the first of
                # full_name / display_name / id from the identity.
                ro_author_name = bag.info.get("Contact-Name",
                                              identity.get('full_name',
                                                           identity.get('display_name',
                                                                        identity.get('id', None))))
                ro_author_orcid = bag.info.get("Contact-Orcid")
                ro_manifest = ro.init_ro_manifest(author_name=ro_author_name, author_orcid=ro_author_orcid)
                bag_metadata.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID})

        file_list = list()
        base_path = bag_path if bag_path else self.output_dir
        for query in catalog_config['queries']:
            query_path = query['query_path']
            output_format = query['output_format']
            output_processor = query.get("output_format_processor")
            format_args = query.get('output_format_params', None)
            output_path = query.get('output_path', '')

            try:
                download_processor = findProcessor(output_format, output_processor)
                processor = download_processor(self.envars,
                                               bag=create_bag,
                                               catalog=self.catalog,
                                               store=self.store,
                                               query=query_path,
                                               base_path=base_path,
                                               sub_path=output_path,
                                               format_args=format_args,
                                               remote_file_manifest=remote_file_manifest,
                                               ro_manifest=ro_manifest,
                                               ro_author_name=ro_author_name,
                                               ro_author_orcid=ro_author_orcid)
                file_list.extend(processor.process())
            except Exception as e:
                logging.error(format_exception(e))
                if create_bag:
                    bdb.cleanup_bag(bag_path)
                raise

        if create_bag:
            try:
                if ro_manifest:
                    ro.write_bag_ro_metadata(ro_manifest, bag_path)
                if not os.path.isfile(remote_file_manifest):
                    remote_file_manifest = None
                bdb.make_bag(bag_path, algs=bag_algorithms, remote_file_manifest=remote_file_manifest, update=True)
            except Exception as e:
                logging.fatal("Exception while updating bag manifests: %s", format_exception(e))
                bdb.cleanup_bag(bag_path)
                raise
            finally:
                # The manifest file is removed whether or not the update
                # succeeded.
                if remote_file_manifest and os.path.isfile(remote_file_manifest):
                    os.remove(remote_file_manifest)

            logging.info('Created bag: %s' % bag_path)

            if bag_archiver is not None:
                try:
                    archive = bdb.archive_bag(bag_path, bag_archiver.lower())
                    # Once archived, the bag directory is cleaned up and
                    # only the archive is returned.
                    bdb.cleanup_bag(bag_path)
                    return [archive]
                except Exception as e:
                    # The "%s" placeholder is required: without it logging
                    # cannot merge the argument and the detail is lost.
                    logging.error("Exception while creating data bag archive: %s", format_exception(e))
                    raise
            else:
                return [bag_path]

        return file_list