class DerivaDownload(object): """ """ def __init__(self, server, **kwargs): self.server = server self.hostname = None self.catalog = None self.store = None self.cancelled = False self.output_dir = os.path.abspath(kwargs.get("output_dir", ".")) self.envars = kwargs.get("envars", dict()) self.config = kwargs.get("config") self.credentials = kwargs.get("credentials", dict()) config_file = kwargs.get("config_file") credential_file = kwargs.get("credential_file") self.metadata = dict() self.sessions = dict() info = "%s v%s [Python %s, %s]" % ( self.__class__.__name__, get_installed_version(VERSION), platform.python_version(), platform.platform(aliased=True)) logging.info("Initializing downloader: %s" % info) if not self.server: raise DerivaDownloadConfigurationError("Server not specified!") # server variable initialization self.hostname = self.server.get('host', '') if not self.hostname: raise DerivaDownloadConfigurationError("Host not specified!") protocol = self.server.get('protocol', 'https') self.server_url = protocol + "://" + self.hostname catalog_id = self.server.get("catalog_id", "1") session_config = self.server.get('session') # credential initialization token = kwargs.get("token") oauth2_token = kwargs.get("oauth2_token") username = kwargs.get("username") password = kwargs.get("password") if credential_file: self.credentials = get_credential(self.hostname, credential_file) elif token or oauth2_token or (username and password): self.credentials = format_credential(token=token, oauth2_token=oauth2_token, username=username, password=password) # catalog and file store initialization if self.catalog: del self.catalog self.catalog = ErmrestCatalog(protocol, self.hostname, catalog_id, self.credentials, session_config=session_config) if self.store: del self.store self.store = HatracStore(protocol, self.hostname, self.credentials, session_config=session_config) # init dcctx cid to a default self.set_dcctx_cid(self.__class__.__name__) # process config file if config_file: try: self.config = read_config(config_file) except Exception as e: raise DerivaDownloadConfigurationError(e) def set_dcctx_cid(self, cid): assert cid, "A dcctx cid is required" if self.catalog: self.catalog.dcctx['cid'] = cid if self.store: self.store.dcctx['cid'] = cid def set_config(self, config): self.config = config def set_credentials(self, credentials): self.catalog.set_credentials(credentials, self.hostname) self.store.set_credentials(credentials, self.hostname) self.credentials = credentials def download(self, **kwargs): if not self.config: raise DerivaDownloadConfigurationError( "No configuration specified!") if self.config.get("catalog") is None: raise DerivaDownloadConfigurationError( "Catalog configuration error!") ro_manifest = None ro_author_name = None ro_author_orcid = None remote_file_manifest = os.path.abspath(''.join([ os.path.join(self.output_dir, 'remote-file-manifest_'), str(uuid.uuid4()), ".json" ])) catalog_config = self.config['catalog'] self.envars.update(self.config.get('env', dict())) self.envars.update({"hostname": self.hostname}) # 1. If we don't have a client identity, we need to authenticate identity = kwargs.get("identity") if not identity: try: if not self.credentials: self.set_credentials(get_credential(self.hostname)) logging.info("Validating credentials for host: %s" % self.hostname) attributes = self.catalog.get_authn_session().json() identity = attributes["client"] except HTTPError as he: if he.response.status_code == 404: logging.info( "No existing login session found for host: %s" % self.hostname) except Exception as e: raise DerivaDownloadAuthenticationError( "Unable to validate credentials: %s" % format_exception(e)) wallet = kwargs.get("wallet", {}) # 2. Check for bagging config and initialize bag related variables bag_path = None bag_archiver = None bag_algorithms = None bag_config = self.config.get('bag') create_bag = True if bag_config else False if create_bag: bag_name = bag_config.get( 'bag_name', ''.join([ "deriva_bag", '_', time.strftime("%Y-%m-%d_%H.%M.%S") ])).format(**self.envars) bag_path = os.path.abspath(os.path.join(self.output_dir, bag_name)) bag_archiver = bag_config.get('bag_archiver') bag_algorithms = bag_config.get('bag_algorithms', ['sha256']) bag_metadata = bag_config.get( 'bag_metadata', {"Internal-Sender-Identifier": "deriva@%s" % self.server_url}) bag_ro = create_bag and stob(bag_config.get('bag_ro', "True")) if create_bag: bdb.ensure_bag_path_exists(bag_path) bag = bdb.make_bag(bag_path, algs=bag_algorithms, metadata=bag_metadata) if bag_ro: ro_author_name = bag.info.get( "Contact-Name", None if not identity else identity.get( 'full_name', identity.get('display_name', identity.get('id', None)))) ro_author_orcid = bag.info.get("Contact-Orcid") ro_manifest = ro.init_ro_manifest( author_name=ro_author_name, author_orcid=ro_author_orcid) bag_metadata.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID}) # 3. Process the set of queries by locating, instantiating, and invoking the specified processor(s) outputs = dict() base_path = bag_path if bag_path else self.output_dir for processor in catalog_config['query_processors']: processor_name = processor["processor"] processor_type = processor.get('processor_type') processor_params = processor.get('processor_params') try: query_processor = find_query_processor(processor_name, processor_type) processor = query_processor( self.envars, inputs=outputs, bag=create_bag, catalog=self.catalog, store=self.store, base_path=base_path, processor_params=processor_params, remote_file_manifest=remote_file_manifest, ro_manifest=ro_manifest, ro_author_name=ro_author_name, ro_author_orcid=ro_author_orcid, identity=identity, wallet=wallet) outputs = processor.process() except Exception as e: logging.error(format_exception(e)) if create_bag: bdb.cleanup_bag(bag_path) raise # 4. Execute anything in the transform processing pipeline, if configured transform_processors = self.config.get('transform_processors', []) if transform_processors: for processor in transform_processors: processor_name = processor["processor"] processor_type = processor.get('processor_type') processor_params = processor.get('processor_params') try: transform_processor = find_transform_processor( processor_name, processor_type) processor = transform_processor( self.envars, inputs=outputs, processor_params=processor_params, base_path=base_path, bag=create_bag, ro_manifest=ro_manifest, ro_author_name=ro_author_name, ro_author_orcid=ro_author_orcid, identity=identity, wallet=wallet) outputs = processor.process() except Exception as e: logging.error(format_exception(e)) raise # 5. Create the bag, and archive (serialize) if necessary if create_bag: try: if ro_manifest: ro.write_bag_ro_metadata(ro_manifest, bag_path) if not os.path.isfile(remote_file_manifest): remote_file_manifest = None bdb.make_bag( bag_path, algs=bag_algorithms, remote_file_manifest=remote_file_manifest if (remote_file_manifest and os.path.getsize(remote_file_manifest) > 0) else None, update=True) except Exception as e: logging.fatal("Exception while updating bag manifests: %s" % format_exception(e)) bdb.cleanup_bag(bag_path) raise finally: if remote_file_manifest and os.path.isfile( remote_file_manifest): os.remove(remote_file_manifest) logging.info('Created bag: %s' % bag_path) if bag_archiver is not None: try: archive = bdb.archive_bag(bag_path, bag_archiver.lower()) bdb.cleanup_bag(bag_path) outputs = { os.path.basename(archive): { LOCAL_PATH_KEY: archive } } except Exception as e: logging.error( "Exception while creating data bag archive: %s" % format_exception(e)) raise else: outputs = { os.path.basename(bag_path): { LOCAL_PATH_KEY: bag_path } } # 6. Execute anything in the post processing pipeline, if configured post_processors = self.config.get('post_processors', []) if post_processors: for processor in post_processors: processor_name = processor["processor"] processor_type = processor.get('processor_type') processor_params = processor.get('processor_params') try: post_processor = find_post_processor( processor_name, processor_type) processor = post_processor( self.envars, inputs=outputs, processor_params=processor_params, identity=identity, wallet=wallet) outputs = processor.process() except Exception as e: logging.error(format_exception(e)) raise return outputs def __del__(self): for session in self.sessions.values(): session.close()
class DerivaDownload(object): """ """ def __init__(self, server, output_dir=None, kwargs=None, config=None, config_file=None, credentials=None, credential_file=None): self.server = server self.hostname = None self.output_dir = output_dir if output_dir else "." self.envars = kwargs if kwargs else dict() self.catalog = None self.store = None self.config = config self.cancelled = False self.credentials = credentials if credentials else dict() self.metadata = dict() self.sessions = dict() info = "%s v%s [Python %s, %s]" % ( self.__class__.__name__, VERSION, platform.python_version(), platform.platform(aliased=True)) logging.info("Initializing downloader: %s" % info) if not self.server: raise RuntimeError("Server not specified!") # server variable initialization self.hostname = self.server.get('host', '') if not self.hostname: raise RuntimeError("Host not specified!") protocol = self.server.get('protocol', 'https') self.server_url = protocol + "://" + self.hostname catalog_id = self.server.get("catalog_id", "1") session_config = self.server.get('session') # credential initialization if credential_file: self.credentials = get_credential(self.hostname, credential_file) # catalog and file store initialization if self.catalog: del self.catalog self.catalog = ErmrestCatalog( protocol, self.hostname, catalog_id, self.credentials, session_config=session_config) if self.store: del self.store self.store = HatracStore( protocol, self.hostname, self.credentials, session_config=session_config) # process config file if config_file and os.path.isfile(config_file): self.config = read_config(config_file) def setConfig(self, config): self.config = config def setCredentials(self, credentials): self.catalog.set_credentials(credentials, self.hostname) self.store.set_credentials(credentials, self.hostname) self.credentials = credentials def download(self, identity=None): if not self.config: raise RuntimeError("No configuration specified!") if self.config.get("catalog") is None: raise RuntimeError("Catalog configuration error!") if not identity: logging.info("Validating credentials") try: if not self.credentials: self.setCredentials(get_credential(self.hostname)) attributes = self.catalog.get_authn_session().json() identity = attributes["client"] except Exception as e: raise RuntimeError("Unable to validate credentials: %s" % format_exception(e)) ro_manifest = None ro_author_name = None ro_author_orcid = None remote_file_manifest = os.path.abspath( ''.join([os.path.join(self.output_dir, 'remote-file-manifest_'), str(uuid.uuid4()), ".json"])) catalog_config = self.config['catalog'] self.envars.update(self.config.get('env', dict())) bag_path = None bag_archiver = None bag_algorithms = None bag_config = self.config.get('bag') create_bag = True if bag_config else False if create_bag: bag_name = bag_config.get('bag_name', ''.join(["deriva_bag", '_', time.strftime("%Y-%m-%d_%H.%M.%S")])) bag_path = os.path.abspath(os.path.join(self.output_dir, bag_name)) bag_archiver = bag_config.get('bag_archiver') bag_algorithms = bag_config.get('bag_algorithms', ['sha256']) bag_metadata = bag_config.get('bag_metadata', {"Internal-Sender-Identifier": "deriva@%s" % self.server_url}) bag_ro = create_bag and stob(bag_config.get('bag_ro', "True")) if create_bag: bdb.ensure_bag_path_exists(bag_path) bag = bdb.make_bag(bag_path, algs=bag_algorithms, metadata=bag_metadata) if bag_ro: ro_author_name = bag.info.get("Contact-Name", identity.get('full_name', identity.get('display_name', identity.get('id', None)))) ro_author_orcid = bag.info.get("Contact-Orcid") ro_manifest = ro.init_ro_manifest(author_name=ro_author_name, author_orcid=ro_author_orcid) bag_metadata.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID}) file_list = list() base_path = bag_path if bag_path else self.output_dir for query in catalog_config['queries']: query_path = query['query_path'] output_format = query['output_format'] output_processor = query.get("output_format_processor") format_args = query.get('output_format_params', None) output_path = query.get('output_path', '') try: download_processor = findProcessor(output_format, output_processor) processor = download_processor(self.envars, bag=create_bag, catalog=self.catalog, store=self.store, query=query_path, base_path=base_path, sub_path=output_path, format_args=format_args, remote_file_manifest=remote_file_manifest, ro_manifest=ro_manifest, ro_author_name=ro_author_name, ro_author_orcid=ro_author_orcid) file_list.extend(processor.process()) except Exception as e: logging.error(format_exception(e)) if create_bag: bdb.cleanup_bag(bag_path) raise if create_bag: try: if ro_manifest: ro.write_bag_ro_metadata(ro_manifest, bag_path) if not os.path.isfile(remote_file_manifest): remote_file_manifest = None bdb.make_bag(bag_path, algs=bag_algorithms, remote_file_manifest=remote_file_manifest, update=True) except Exception as e: logging.fatal("Exception while updating bag manifests: %s", format_exception(e)) bdb.cleanup_bag(bag_path) raise finally: if remote_file_manifest and os.path.isfile(remote_file_manifest): os.remove(remote_file_manifest) logging.info('Created bag: %s' % bag_path) if bag_archiver is not None: try: archive = bdb.archive_bag(bag_path, bag_archiver.lower()) bdb.cleanup_bag(bag_path) return [archive] except Exception as e: logging.error("Exception while creating data bag archive:", format_exception(e)) raise else: return [bag_path] return file_list