def __call__(self, fcrepo, args):
    if args.notransactions:
        try:
            collection = pcdm.Collection()
            collection.title = args.name
            collection.create_object(fcrepo)
            collection.update_object(fcrepo)
        except RESTAPIException as e:
            logger.error(f'Error in collection creation: {e}')
            raise FailureException()
    else:
        with Transaction(fcrepo) as txn:
            try:
                collection = pcdm.Collection()
                collection.title = args.name
                collection.create_object(fcrepo)
                collection.update_object(fcrepo)
                txn.commit()
            except RESTAPIException as e:
                logger.error(f'Error in collection creation: {e}')
                raise FailureException()

    if args.batch is not None:
        with open(args.batch, 'r') as batchconfig:
            batch = yaml.safe_load(batchconfig)
            batch['COLLECTION'] = str(collection.uri)
        with open(args.batch, 'w') as batchconfig:
            yaml.dump(batch, batchconfig, default_flow_style=False)
def __call__(self, fcrepo, args):
    fieldnames = ['uri', 'timestamp']

    # read the log of completed items
    try:
        completed = util.ItemLog('logs/annotated.csv', fieldnames, 'uri')
    except Exception as e:
        logger.error('Non-standard map file specified: {0}'.format(e))
        raise FailureException()

    logger.info('Found {0} completed items'.format(len(completed)))

    if args.ignore is not None:
        try:
            ignored = util.ItemLog(args.ignore, fieldnames, 'uri')
        except Exception as e:
            logger.error('Non-standard ignore file specified: {0}'.format(e))
            raise FailureException()
    else:
        ignored = []

    skipfile = 'logs/skipped.extractocr.{0}.csv'.format(now)
    skipped = util.ItemLog(skipfile, fieldnames, 'uri')

    with fcrepo.at_path('/annotations'):
        for line in sys.stdin:
            uri = line.rstrip('\n')
            if uri in completed:
                continue
            elif uri in ignored:
                logger.debug('Ignoring {0}'.format(uri))
                continue

            try:
                is_extracted = extract(fcrepo, uri)
            except RESTAPIException:
                logger.error("Unable to commit or rollback transaction, aborting")
                raise FailureException()

            row = {
                'uri': uri,
                'timestamp': str(datetime.utcnow())
            }
            if is_extracted:
                completed.writerow(row)
            else:
                skipped.writerow(row)
def request(self, method, url, headers=None, **kwargs):
    if headers is None:
        headers = {}

    # make sure the transaction keep-alive thread hasn't failed
    if self.in_transaction() and self.transaction.keep_alive.failed.is_set():
        raise FailureException('Transaction keep-alive failed') from self.transaction.keep_alive.exception

    target_uri = self._insert_transaction_uri(url)
    if self.is_forwarded():
        # reverse forward
        target_uri = self.undo_forward(target_uri)

    self.logger.debug("%s %s", method, target_uri)
    if self.ua_string is not None:
        headers['User-Agent'] = self.ua_string
    if self.delegated_user is not None:
        headers['On-Behalf-Of'] = self.delegated_user

    self.auth.refresh_auth(self.session)
    response = self.session.request(method, target_uri, headers=headers, **kwargs)
    self.logger.debug("%s %s", response.status_code, response.reason)
    return response
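# The convenience calls used elsewhere in this section (for example fcrepo.head(file.uri)
# and fcrepo.get_graph(uri) in the export command) presumably delegate to request() above.
# A minimal sketch of such a wrapper, assuming only the request() signature shown; this
# particular implementation is illustrative and not taken from the source:
def head(self, url, **kwargs):
    # hypothetical convenience method: issue a HEAD request through request()
    return self.request('HEAD', url, **kwargs)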
def parse_message(message):
    access = message.args.get('access')
    message.body = message.body.encode('utf-8').decode('utf-8-sig')
    if access is not None:
        try:
            access_uri = uri_or_curie(access)
        except ArgumentTypeError as e:
            raise FailureException(f'PlastronArg-access {e}')
    else:
        access_uri = None
    return Namespace(
        model=message.args.get('model'),
        limit=message.args.get('limit', None),
        percentage=message.args.get('percent', None),
        validate_only=message.args.get('validate-only', False),
        resume=message.args.get('resume', False),
        import_file=io.StringIO(message.body),
        template_file=None,
        access=access_uri,
        member_of=message.args.get('member-of'),
        binaries_location=message.args.get('binaries-location'),
        container=message.args.get('container', None),
        extract_text_types=message.args.get('extract-text', None),
        job_id=message.job_id,
        structure=message.args.get('structure', None),
        relpath=message.args.get('relpath', None)
    )
def __call__(self, message: PlastronCommandMessage, progress_topic: Destination):
    # determine which command to load to process the message
    command = self.get_command(message.command)

    if message.job_id is None:
        raise FailureException('Expecting a PlastronJobId header')

    logger.info(f'Received message to initiate job {message.job_id}')

    args = command.parse_message(message)
    cmd_repo_config = command.repo_config(self.repo_config, args)

    repo = Repository(
        config=cmd_repo_config,
        ua_string=f'plastron/{version}',
        on_behalf_of=message.args.get('on-behalf-of')
    )
    if repo.delegated_user is not None:
        logger.info(f'Running repository operations on behalf of {repo.delegated_user}')

    for status in (command.execute(repo, args) or []):
        progress_topic.send(PlastronMessage(job_id=message.job_id, body=status))

    logger.info(f'Job {message.job_id} complete')

    # default message state is "Done"
    return message.response(state=command.result.get('type', 'Done'), body=command.result)
def get_command_class(command_name: str):
    module_name = command_name
    if command_name == 'import':
        # special case for the import command, to avoid conflict
        # with the "import" keyword
        module_name += 'command'
    try:
        command_module = import_module('plastron.commands.' + module_name)
    except ModuleNotFoundError as e:
        raise FailureException(f'Unable to load a command with the name {command_name}') from e
    # use a default of None so a missing Command class is reported as a
    # FailureException instead of raising an AttributeError
    command_class = getattr(command_module, 'Command', None)
    if command_class is None:
        raise FailureException(f'Command class not found in module {command_module}')
    return command_class
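# Hedged usage sketch for get_command_class(): resolving a command name to its Command
# class before instantiating it, as get_command() does further below. The command name
# 'export' and the empty config mapping are only examples.
command_class = get_command_class('export')
command = command_class(config={})  # the config= keyword follows the pattern used in main() below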
def __call__(self, repo: Repository, args: Namespace) -> None:
    csv_file = csv.DictReader(args.source_file)
    if csv_file.fieldnames is None:
        logger.error('No fields found in the source CSV file. Exiting.')
        sys.exit(1)

    if args.output_file is not None:
        output_file = open(args.output_file, 'w')
    else:
        output_file = sys.stdout

    csv_writer = csv.DictWriter(output_file, fieldnames=csv_file.fieldnames)
    write_csv_header(csv_file, args, csv_writer)

    for n, row in enumerate(csv_file, start=1):
        identifier = row[args.identifier_column]
        source = get_source(row[args.binary_column])
        if not source:
            logger.warning(f'No source found for {identifier}; skipping')
            csv_writer.writerow(row)
            continue

        item = Item(identifier=identifier, title=f'Stub for {identifier}')
        file = File()
        file.source = source
        item.add_file(file)
        if args.member_of is not None:
            item.member_of = URIRef(args.member_of)
        if args.access is not None:
            item.rdf_type.append(args.access)
            file.rdf_type.append(args.access)

        try:
            with Transaction(repo) as txn:
                try:
                    item.create(repo, container_path=args.container_path)
                    item.update(repo)
                    # update the CSV with the new URI
                    row[args.binary_column] = file.uri
                    csv_writer.writerow(row)
                    txn.commit()
                except (RESTAPIException, FileNotFoundError) as e:
                    # if anything fails during item creation or committing the transaction,
                    # attempt to roll back the current transaction; failures here will be
                    # caught by the main loop's exception handler and should trigger a
                    # system exit
                    logger.error(f'{item.identifier} not created: {e}')
                    txn.rollback()
                except KeyboardInterrupt:
                    logger.warning("Load interrupted")
                    txn.rollback()
                    raise
        except RESTAPIException as e:
            raise FailureException(f'Transaction rollback failed: {e}') from e

    if output_file is not sys.stdout:
        output_file.close()
def model_class(self):
    if self._model_class is None:
        # retrieve the model to use for validation
        try:
            self._model_class = getattr(importlib.import_module("plastron.models"), self.model)
        except AttributeError as e:
            raise FailureException(f'Unable to load model "{self.model}"') from e
    return self._model_class
def __exit__(self, exc_type, exc_val, exc_tb):
    # when we leave the transaction context, always
    # set the stop flag on the keep-alive ping
    self.keep_alive.stop()
    # on an exception, roll back the transaction
    if exc_type is not None:
        if exc_type == TransactionError:
            raise FailureException(f'Transaction failed: {exc_val}')
        self.rollback()
        # return False to propagate the exception upward
        return False
def start(self):
    """
    Sets the timestamp for this run, and creates the log directory for it.

    :return:
    """
    if self.dir is not None:
        raise FailureException('Run completed, cannot start again')

    self.timestamp = datetimestamp()
    self.dir = self.job.dir / self.timestamp
    os.makedirs(self.dir)
    return self
def load(self, timestamp: str):
    """
    Load an existing import run by its timestamp.

    :param timestamp: should be 14 digits expressing YYYYMMDDHHMMSS
    :return:
    """
    self.timestamp = timestamp
    self.dir = self.job.dir / self.timestamp
    if not self.dir.is_dir():
        raise FailureException(f'Import run {self.timestamp} not found')
    return self
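# Hedged usage sketch for start() and load(): execute() below begins a fresh run with
# job.new_run().start(); attaching to a previous run by timestamp would use load() instead.
# The assumption that new_run() returns an un-started run object, and the example
# timestamp, are illustrative only.
import_run = job.new_run().start()                    # new run: sets timestamp, creates log dir
previous_run = job.new_run().load('20210101120000')   # existing run: must match a run directory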
def __call__(self, fcrepo, args):
    logger.warning('The "mkcol" command is deprecated and will be removed in a future release.')
    logger.warning(f'Use: plastron create --container "{fcrepo.relpath}" --collection "{args.name}"')

    if args.notransactions:
        try:
            collection = pcdm.Collection()
            collection.title = args.name
            collection.create(fcrepo, recursive=False)
            collection.update(fcrepo, recursive=False)
        except RESTAPIException as e:
            logger.error(f'Error in collection creation: {e}')
            raise FailureException()
    else:
        with Transaction(fcrepo) as txn:
            try:
                collection = pcdm.Collection()
                collection.title = args.name
                collection.create(fcrepo, recursive=False)
                collection.update(fcrepo, recursive=False)
                txn.commit()
            except RESTAPIException as e:
                logger.error(f'Error in collection creation: {e}')
                raise FailureException()

    if args.batch is not None:
        with open(args.batch, 'r') as batchconfig:
            batch = yaml.safe_load(batchconfig)
            batch['COLLECTION'] = str(collection.uri)
        with open(args.batch, 'w') as batchconfig:
            yaml.dump(batch, batchconfig, default_flow_style=False)
def process(self, method, use_transaction=True, traverse=None):
    self.use_transaction = use_transaction
    if traverse is not None:
        predicate_list = ', '.join(p.n3() for p in traverse)
        logger.info(f"{method.__name__} will traverse the following predicates: {predicate_list}")

    if use_transaction:
        # set up a temporary ItemLog that will be copied to the real item log
        # upon completion of the transaction
        self.completed_buffer = ItemLog(
            NamedTemporaryFile().name,
            ['uri', 'title', 'timestamp'],
            'uri',
            header=False
        )
        with Transaction(self.repository, keep_alive=90) as transaction:
            for resource, graph in self.get_resources(traverse=traverse):
                try:
                    method(resource, graph)
                except RESTAPIException as e:
                    logger.error(f'{method.__name__} failed for {resource}: {e}: {e.response.text}')
                    # if anything fails while processing the list of URIs, attempt to
                    # roll back the transaction; failures here will be caught by the main
                    # loop's exception handler and should trigger a system exit
                    try:
                        transaction.rollback()
                        logger.warning('Transaction rolled back.')
                        return False
                    except RESTAPIException:
                        logger.error('Unable to roll back transaction, aborting')
                        raise FailureException()
            transaction.commit()
        if self.completed and self.completed.filename:
            shutil.copyfile(self.completed_buffer.filename, self.completed.filename)
        return True
    else:
        for resource, graph in self.get_resources(traverse=traverse):
            try:
                method(resource, graph)
            except RESTAPIException as e:
                logger.error(f'{method.__name__} failed for {resource}: {e}: {e.response.text}')
                logger.warning(f'Continuing {method.__name__} with next item')
        return True
def get_command(self, command_name: str):
    if command_name not in self.commands:
        # get the configuration options for this command
        config = self.command_config.get(command_name.upper(), {})
        command_class = get_command_class(command_name)
        # use a default of None so a missing parse_message attribute is reported
        # as a FailureException instead of raising an AttributeError
        if getattr(command_class, 'parse_message', None) is None:
            raise FailureException(
                f'Command class {command_class} does not support message processing'
            )
        # cache an instance of this command
        self.commands[command_name] = command_class(config)

    return self.commands[command_name]
def get_ssh_client(sftp_uri, **kwargs):
    if isinstance(sftp_uri, str):
        sftp_uri = urlsplit(sftp_uri)
    if not isinstance(sftp_uri, urllib.parse.SplitResult):
        raise TypeError('Expects a str or a urllib.parse.SplitResult')

    ssh_client = SSHClient()
    ssh_client.load_system_host_keys()
    ssh_client.set_missing_host_key_policy(AutoAddPolicy)
    try:
        ssh_client.connect(
            hostname=sftp_uri.hostname,
            username=sftp_uri.username,
            port=sftp_uri.port or SSH_PORT,
            **kwargs
        )
        return ssh_client
    except SSHException as e:
        raise FailureException(str(e)) from e
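# Hedged usage sketch for get_ssh_client(): the export command below calls it with an
# SFTP URI and a key_filename keyword, then opens an SFTPClient over the resulting
# transport. The URI and key path here are placeholders.
from paramiko import SFTPClient

ssh_client = get_ssh_client('sftp://user@example.com/exports/bag.zip', key_filename='/path/to/key')
sftp_client = SFTPClient.from_transport(ssh_client.get_transport())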
def execute(self, fcrepo, args):
    self.repository = fcrepo
    self.repository.test_connection()
    self.dry_run = args.dry_run
    self.validate = args.validate
    self.model = args.model
    self.stats = {
        'updated': [],
        'invalid': defaultdict(list),
        'errors': defaultdict(list)
    }

    if self.validate and not self.model:
        raise FailureException("Model must be provided when performing validation")

    self.sparql_update = args.update_file.read().encode('utf-8')

    logger.debug(
        f'SPARQL Update query:\n'
        f'====BEGIN====\n'
        f'{self.sparql_update.decode()}\n'
        f'=====END====='
    )

    if self.dry_run:
        logger.info('Dry run enabled, no actual updates will take place')

    self.resources = ResourceList(
        repository=self.repository,
        uri_list=args.uris,
        file=args.file,
        completed_file=args.completed
    )
    self.resources.process(
        method=self.update_item,
        traverse=parse_predicate_list(args.recursive),
        use_transaction=args.use_transactions
    )

    if len(self.stats['errors']) == 0 and len(self.stats['invalid']) == 0:
        state = 'update_complete'
    else:
        state = 'update_incomplete'

    self.result = {'type': state, 'stats': self.stats}
    logger.debug(self.stats)
def __init__(self, repository, uri_list=None, file=None, completed_file=None):
    self.repository = repository
    self.uri_list = uri_list
    self.file = file
    self.use_transaction = True
    if completed_file is not None:
        logger.info(f'Reading the completed items log from {completed_file}')
        # read the log of completed items
        fieldnames = ['uri', 'title', 'timestamp']
        try:
            self.completed = ItemLog(completed_file, fieldnames, 'uri')
            logger.info(f'Found {len(self.completed)} completed item(s)')
        except Exception as e:
            logger.error(f"Non-standard map file specified: {e}")
            raise FailureException()
    else:
        self.completed = None
    self.completed_buffer = None
def __call__(self, fcrepo, args):
    try:
        fcrepo.test_connection()
    except Exception:
        raise FailureException()
def execute(self, repo, args):
    """
    Performs the import

    :param repo: the repository configuration
    :param args: the command-line arguments
    """
    start_time = datetime.now().timestamp()

    if args.resume and args.job_id is None:
        raise FailureException('Resuming a job requires a job id')

    if args.job_id is None:
        # TODO: generate a more unique id? add in user and hostname?
        args.job_id = f"import-{datetimestamp()}"

    job: ImportJob = Command.create_import_job(args.job_id, jobs_dir=self.jobs_dir)
    logger.debug(f'Job directory is {job.dir}')

    if args.resume and not job.dir_exists:
        raise FailureException(f'Cannot resume job {job.id}: no such job directory found in {self.jobs_dir}')

    # load or create config
    if args.resume:
        logger.info(f'Resuming saved job {job.id}')
        # load stored config from the previous run of this job
        try:
            job.load_config()
        except FileNotFoundError:
            raise FailureException(f'Cannot resume job {job.id}: no config.yml found in {job.dir}')
    else:
        if args.model is None:
            raise FailureException('A model is required unless resuming an existing job')
        job.save_config({
            'model': args.model,
            'access': args.access,
            'member_of': args.member_of,
            # use "repo.relpath" as default for "container",
            # but allow it to be overridden by args
            'container': args.container or repo.relpath,
            'binaries_location': args.binaries_location
        })

    if args.template_file is not None:
        if not hasattr(job.model_class, 'HEADER_MAP'):
            logger.error(f'{job.model_class.__name__} has no HEADER_MAP, cannot create template')
            raise FailureException()
        logger.info(f'Writing template for the {job.model_class.__name__} model to {args.template_file.name}')
        writer = csv.writer(args.template_file)
        writer.writerow(list(job.model_class.HEADER_MAP.values()) + ['FILES'])
        return

    if args.import_file is None and not args.resume:
        raise FailureException('An import file is required unless resuming an existing job')

    if args.percentage:
        logger.info(f'Loading {args.percentage}% of the total items')
    if args.validate_only:
        logger.info('Validation-only mode, skipping imports')

    # if an import file was provided, save that as the new CSV metadata file
    if args.import_file is not None:
        job.store_metadata_file(args.import_file)

    try:
        metadata = job.metadata(limit=args.limit, percentage=args.percentage)
    except ModelClassNotFoundError as e:
        raise FailureException(f'Model class {e.model_name} not found') from e
    except JobError as e:
        raise FailureException(str(e)) from e

    if metadata.has_binaries and job.binaries_location is None:
        raise ConfigError('Must specify --binaries-location if the metadata has a FILES column')

    initial_completed_item_count = len(job.completed_log)
    logger.info(f'Found {initial_completed_item_count} completed items')

    updated_uris = []
    created_uris = []
    import_run = job.new_run().start()
    for row in metadata:
        repo_changeset = create_repo_changeset(repo, metadata, row)
        item = repo_changeset.item

        # count the number of files referenced in this row
        metadata.files += len(row.filenames)

        try:
            report = validate(item)
        except ValidationError as e:
            raise FailureException(f'Unable to run validation: {e}') from e

        metadata.validation_reports.append({
            'line': row.line_reference,
            'is_valid': report.is_valid(),
            'passed': [outcome for outcome in report.passed()],
            'failed': [outcome for outcome in report.failed()]
        })

        missing_files = [
            name for name in row.filenames
            if not self.get_source(job.binaries_location, name).exists()
        ]
        if len(missing_files) > 0:
            logger.warning(f'{len(missing_files)} file(s) for "{item}" not found')

        if report.is_valid() and len(missing_files) == 0:
            metadata.valid += 1
            logger.info(f'"{item}" is valid')
        else:
            # drop invalid items
            metadata.invalid += 1
            logger.warning(f'"{item}" is invalid, skipping')
            reasons = [' '.join(str(f) for f in outcome) for outcome in report.failed()]
            if len(missing_files) > 0:
                reasons.extend(f'Missing file: {f}' for f in missing_files)
            import_run.drop_invalid(
                item=item,
                line_reference=row.line_reference,
                reason=f'Validation failures: {"; ".join(reasons)}'
            )
            continue

        if args.validate_only:
            # validation-only mode
            continue

        try:
            self.update_repo(args, job, repo, metadata, row, repo_changeset, created_uris, updated_uris)
        except FailureException as e:
            metadata.errors += 1
            logger.error(f'{item} import failed: {e}')
            import_run.drop_failed(item, row.line_reference, reason=str(e))

        # update the status
        now = datetime.now().timestamp()
        yield {
            'time': {
                'started': start_time,
                'now': now,
                'elapsed': now - start_time
            },
            'count': metadata.stats()
        }

    logger.info(f'Skipped {metadata.skipped} items')
    logger.info(f'Completed {len(job.completed_log) - initial_completed_item_count} items')
    logger.info(f'Dropped {len(import_run.invalid_items)} invalid items')
    logger.info(f'Dropped {len(import_run.failed_items)} failed items')

    logger.info(f"Found {metadata.valid} valid items")
    logger.info(f"Found {metadata.invalid} invalid items")
    logger.info(f"Found {metadata.errors} errors")
    if not args.validate_only:
        logger.info(f"{metadata.unchanged} of {metadata.total} items remained unchanged")
        logger.info(f"Created {metadata.created} of {metadata.total} items")
        logger.info(f"Updated {metadata.updated} of {metadata.total} items")

    if args.validate_only:
        # validate phase
        if metadata.invalid == 0:
            result_type = 'validate_success'
        else:
            result_type = 'validate_failed'
    else:
        # import phase
        if len(job.completed_log) == metadata.total:
            result_type = 'import_complete'
        else:
            result_type = 'import_incomplete'

    self.result = {
        'type': result_type,
        'validation': metadata.validation_reports,
        'count': metadata.stats()
    }
def __call__(self, fcrepo, args):
    # load batch configuration
    try:
        batch_config = BatchConfig(args.batch)
    except ConfigException as e:
        logger.error(e.message)
        logger.error(f'Failed to load batch configuration from {args.batch}')
        raise FailureException(e.message)

    logger.info(f'Loaded batch configuration from {args.batch}')

    if not os.path.isdir(batch_config.log_dir):
        os.makedirs(batch_config.log_dir)

    fcrepo.load_binaries = args.load_binaries

    # load the data handler module for the data being loaded
    logger.info("Initializing data handler")
    module_name = batch_config.handler
    handler = import_module('plastron.handlers.' + module_name)
    logger.info('Loaded "{0}" handler'.format(module_name))

    # "--no-binaries" implies "--no-annotations"
    if not args.load_binaries:
        logger.info("Setting --no-binaries implies --no-annotations")
        args.create_annotations = False

    try:
        batch = handler.Batch(fcrepo, batch_config)
    except (ConfigException, DataReadException) as e:
        logger.error(e.message)
        logger.error('Failed to initialize batch')
        raise FailureException(e.message)

    if not args.dry_run:
        fcrepo.test_connection()

        # read the log of completed items
        fieldnames = ['number', 'timestamp', 'title', 'path', 'uri']
        try:
            completed = ItemLog(batch_config.mapfile, fieldnames, 'path')
        except Exception as e:
            logger.error(f"Non-standard map file specified: {e}")
            raise FailureException()

        logger.info(f"Found {len(completed)} completed items")

        if args.ignore is not None:
            try:
                ignored = ItemLog(args.ignore, fieldnames, 'path')
            except Exception as e:
                logger.error(f"Non-standard ignore file specified: {e}")
                raise FailureException()
        else:
            ignored = []

        skipfile = os.path.join(batch_config.log_dir, 'skipped.load.{0}.csv'.format(now))
        skipped = ItemLog(skipfile, fieldnames, 'path')

        load_set = get_load_set(batch, args.percent)

        # create all batch objects in repository
        for n, item in enumerate(batch):
            is_loaded = False

            if n not in load_set:
                logger.info(f"Loading {args.percent}, skipping item {n}")
                continue

            # handle load limit parameter
            if args.limit is not None and n >= args.limit:
                logger.info(f"Stopping after {args.limit} item(s)")
                break
            elif item.path in completed:
                continue
            elif item.path in ignored:
                logger.debug(f"Ignoring {item.path}")
                continue

            logger.info(f"Processing item {n + 1}/{batch.length}...")

            try:
                logger.info(f"Loading item {n + 1}")
                is_loaded = load_item(fcrepo, item, args, extra=batch_config.extra)
            except RESTAPIException:
                logger.error("Unable to commit or rollback transaction, aborting")
                raise FailureException()
            except DataReadException as e:
                logger.error(f"Skipping item {n + 1}: {e.message}")

            row = {
                'number': n + 1,
                'path': item.path,
                'timestamp': getattr(item, 'creation_timestamp', str(datetime.utcnow())),
                'title': getattr(item, 'title', 'N/A'),
                'uri': getattr(item, 'uri', 'N/A')
            }

            # write item details to the relevant summary CSV
            if is_loaded:
                completed.writerow(row)
            else:
                skipped.writerow(row)

            if args.wait:
                logger.info("Pausing {0} seconds".format(args.wait))
                sleep(int(args.wait))
def update_repo(self, args, job, repo, metadata, row, repo_changeset, created_uris, updated_uris):
    """
    Updates the repository with the given RepoChangeSet

    :param args: the arguments from the command-line
    :param job: the ImportJob
    :param repo: the repository configuration
    :param metadata: a plastron.jobs.MetadataRows object representing the CSV file being imported
    :param row: a single plastron.jobs.Row object representing the row being imported
    :param repo_changeset: the RepoChangeSet object describing the changes to make to the repository
    :param created_uris: accumulator storing a list of created URIs; this list is MODIFIED by this method
    :param updated_uris: accumulator storing a list of updated URIs; this list is MODIFIED by this method
    """
    item = repo_changeset.item

    if not item.created:
        # if an item is new, don't construct a SPARQL Update query;
        # instead, just create and update normally

        # create new item in the repo
        logger.debug('Creating a new item')
        # add the access class
        if job.access is not None:
            item.rdf_type.append(URIRef(job.access))
        # add the collection membership
        if job.member_of is not None:
            item.member_of = URIRef(job.member_of)

        if row.has_files:
            create_pages = bool(strtobool(row.get('CREATE_PAGES', 'True')))
            logger.debug('Adding pages and files to new item')
            self.add_files(
                item,
                build_file_groups(row['FILES']),
                base_location=job.binaries_location,
                access=job.access,
                create_pages=create_pages
            )

        if args.extract_text_types is not None:
            annotate_from_files(item, args.extract_text_types.split(','))

        logger.debug(f"Creating resources in container: {job.container}")

        try:
            with Transaction(repo) as txn:
                item.create(repo, container_path=job.container)
                item.update(repo)
                txn.commit()
        except Exception as e:
            raise FailureException(f'Creating item failed: {e}') from e

        job.complete(item, row.line_reference, ImportedItemStatus.CREATED)
        metadata.created += 1
        created_uris.append(item.uri)

    elif repo_changeset:
        # construct the SPARQL Update query if there are any deletions or insertions,
        # then do a PATCH update of an existing item
        logger.info(f'Sending update for {item}')
        sparql_update = repo_changeset.build_sparql_update(repo)
        logger.debug(sparql_update)

        try:
            item.patch(repo, sparql_update)
        except RESTAPIException as e:
            raise FailureException(f'Updating item failed: {e}') from e

        job.complete(item, row.line_reference, ImportedItemStatus.MODIFIED)
        metadata.updated += 1
        updated_uris.append(item.uri)

    else:
        job.complete(item, row.line_reference, ImportedItemStatus.UNCHANGED)
        metadata.unchanged += 1
        logger.info(f'No changes found for "{item}" ({row.uri}); skipping')
        metadata.skipped += 1
def execute(self, fcrepo, args):
    start_time = datetime.now().timestamp()
    count = 0
    errors = 0
    total = len(args.uris)
    try:
        serializer_class = SERIALIZER_CLASSES[args.format]
    except KeyError:
        logger.error(f'Unknown format: {args.format}')
        raise FailureException()

    if args.export_binaries and args.binary_types is not None:
        # filter files by their MIME type
        def mime_type_filter(file):
            return str(file.mimetype) in args.binary_types.split(',')
    else:
        # default filter is None; in this case filter() will return
        # all items that evaluate to true
        mime_type_filter = None

    logger.info(f'Export destination: {args.output_dest}')

    # create a bag in a temporary directory to hold exported items
    temp_dir = TemporaryDirectory()
    bag = make_bag(temp_dir.name)

    export_dir = os.path.join(temp_dir.name, 'data')
    serializer = serializer_class(directory=export_dir, public_uri_template=args.uri_template)
    for uri in args.uris:
        try:
            logger.info(f'Exporting item {count + 1}/{total}: {uri}')

            # derive an item-level directory name from the URI;
            # currently this is hard-coded to look for a UUID
            # TODO: expand to other types of unique ids?
            match = UUID_REGEX.search(uri)
            if match is None:
                raise DataReadException(f'No UUID found in {uri}')
            item_dir = match[0]

            graph = fcrepo.get_graph(uri)
            model_class = detect_resource_class(graph, uri, fallback=Item)
            obj = model_class.from_graph(graph, uri)

            if args.export_binaries:
                logger.info(f'Gathering binaries for {uri}')
                binaries = list(filter(mime_type_filter, obj.gather_files(fcrepo)))
                total_size = sum(int(file.size[0]) for file in binaries)
                size, unit = format_size(total_size)
                logger.info(f'Total size of binaries: {round(size, 2)} {unit}')
            else:
                binaries = None

            serializer.write(obj, files=binaries, binaries_dir=item_dir)

            if binaries is not None:
                binaries_dir = os.path.join(export_dir, item_dir)
                os.makedirs(binaries_dir, exist_ok=True)
                for file in binaries:
                    response = fcrepo.head(file.uri)
                    accessed = parsedate(response.headers['Date'])
                    modified = parsedate(response.headers['Last-Modified'])

                    binary_filename = os.path.join(binaries_dir, str(file.filename))
                    with open(binary_filename, mode='wb') as binary:
                        with file.source as stream:
                            for chunk in stream:
                                binary.write(chunk)

                    # update the atime and mtime of the file to reflect the time of the
                    # HTTP request and the resource's last-modified time in the repo
                    os.utime(binary_filename, times=(mktime(accessed), mktime(modified)))
                    logger.debug(f'Copied {file.uri} to {binary.name}')

            count += 1

        except DataReadException as e:
            # log the failure, but continue to attempt to export the rest of the URIs
            logger.error(f'Export of {uri} failed: {e}')
            errors += 1
        except (RESTAPIException, ConnectionError) as e:
            # log the failure, but continue to attempt to export the rest of the URIs
            logger.error(f'Unable to retrieve {uri}: {e}')
            errors += 1

        # update the status
        now = datetime.now().timestamp()
        yield {
            'time': {
                'started': start_time,
                'now': now,
                'elapsed': now - start_time
            },
            'count': {
                'total': total,
                'exported': count,
                'errors': errors
            }
        }

    try:
        serializer.finish()
    except EmptyItemListError:
        logger.error("No items could be exported; skipping writing file")

    logger.info(f'Exported {count} of {total} items')

    # save the BagIt bag to send to the output destination
    bag.save(manifests=True)

    # parse the output destination to determine where to send the export
    if args.output_dest.startswith('sftp:'):
        # send over SFTP to a remote host
        sftp_uri = urlsplit(args.output_dest)
        ssh_client = get_ssh_client(sftp_uri, key_filename=args.key)
        try:
            sftp_client = SFTPClient.from_transport(ssh_client.get_transport())
            root, ext = splitext(basename(sftp_uri.path))
            destination = sftp_client.open(sftp_uri.path, mode='w')
        except SSHException as e:
            raise FailureException(str(e)) from e
    else:
        # send to a local file
        zip_filename = args.output_dest
        root, ext = splitext(basename(zip_filename))
        destination = zip_filename

    # write out a single ZIP file of the whole bag
    compress_bag(bag, destination, root)

    self.result = {
        'type': 'export_complete' if count == total else 'partial_export',
        'content_type': serializer.content_type,
        'file_extension': serializer.file_extension,
        'count': {
            'total': total,
            'exported': count,
            'errors': errors
        }
    }
def __call__(self, fcrepo, args):
    try:
        fcrepo.test_connection()
    except ConnectionError as e:
        raise FailureException(str(e)) from e
def main():
    """Parse args and handle options."""
    parser = ArgumentParser(
        prog='plastron',
        description='Batch operation tool for Fedora 4.'
    )
    parser.set_defaults(cmd_name=None)

    common_required = parser.add_mutually_exclusive_group(required=True)
    common_required.add_argument(
        '-r', '--repo',
        help='Path to repository configuration file.',
        action='store'
    )
    common_required.add_argument(
        '-c', '--config',
        help='Path to configuration file.',
        action='store',
        dest='config_file',
        type=FileType('r')
    )
    common_required.add_argument(
        '-V', '--version',
        help='Print version and exit.',
        action='version',
        version=version
    )

    parser.add_argument(
        '-v', '--verbose',
        help='increase the verbosity of the status output',
        action='store_true'
    )
    parser.add_argument(
        '-q', '--quiet',
        help='decrease the verbosity of the status output',
        action='store_true'
    )
    parser.add_argument(
        '--on-behalf-of',
        help='delegate repository operations to this username',
        dest='delegated_user',
        action='store'
    )

    subparsers = parser.add_subparsers(title='commands')
    command_modules = load_commands(subparsers)

    # parse command line args
    args = parser.parse_args()

    # if no subcommand was selected, display the help
    if args.cmd_name is None:
        parser.print_help()
        sys.exit(0)

    if args.config_file is not None:
        # new-style, combined config file (a la plastron.daemon)
        config = envsubst(yaml.safe_load(args.config_file))
        repo_config = config['REPOSITORY']
        broker_config = config.get('MESSAGE_BROKER', None)
        command_config = config.get('COMMANDS', {})
    else:
        # old-style, repository-only config file
        with open(args.repo, 'r') as repo_config_file:
            repo_config = yaml.safe_load(repo_config_file)
        broker_config = None
        command_config = {}

    fcrepo = Repository(
        repo_config,
        ua_string=f'plastron/{version}',
        on_behalf_of=args.delegated_user
    )

    if broker_config is not None:
        broker = Broker(broker_config)
    else:
        broker = None

    # get basic logging options
    if 'LOGGING_CONFIG' in repo_config:
        with open(repo_config.get('LOGGING_CONFIG'), 'r') as logging_config_file:
            logging_options = yaml.safe_load(logging_config_file)
    else:
        logging_options = DEFAULT_LOGGING_OPTIONS

    # log file configuration
    log_dirname = repo_config.get('LOG_DIR')
    if not os.path.isdir(log_dirname):
        os.makedirs(log_dirname)
    log_filename = 'plastron.{0}.{1}.log'.format(args.cmd_name, now)
    logfile = os.path.join(log_dirname, log_filename)
    logging_options['handlers']['file']['filename'] = logfile

    # manipulate console verbosity
    if args.verbose:
        logging_options['handlers']['console']['level'] = 'DEBUG'
    elif args.quiet:
        logging_options['handlers']['console']['level'] = 'WARNING'

    # configure logging
    logging.config.dictConfig(logging_options)

    # get the selected subcommand
    command_module = command_modules[args.cmd_name]

    try:
        if hasattr(command_module, 'Command'):
            command = command_module.Command(config=command_config.get(args.cmd_name.upper()))
            command.repo = fcrepo
            command.broker = broker
        else:
            raise FailureException(f'Unable to execute command {args.cmd_name}')

        # dispatch to the selected subcommand
        print_header(args)
        logger.info(f'Loaded repo configuration from {args.repo or args.config_file.name}')
        if args.delegated_user is not None:
            logger.info(f'Running repository operations on behalf of {args.delegated_user}')
        command(fcrepo, args)
        print_footer(args)
    except FailureException as e:
        # something failed, exit with non-zero status
        logger.error(str(e))
        sys.exit(1)
    except KeyboardInterrupt:
        # aborted due to Ctrl+C
        sys.exit(2)
def __init__(self, job: ImportJob, limit: int = None, percentage: int = None):
    self.job = job
    self.limit = limit
    self.metadata_file = None

    try:
        self.metadata_file = open(job.metadata_filename, 'r')
    except FileNotFoundError as e:
        raise MetadataError(job, f'Cannot read source file "{job.metadata_filename}": {e}') from e

    self.csv_file = csv.DictReader(self.metadata_file)

    try:
        self.fields = build_fields(self.fieldnames, self.model_class)
    except DataReadException as e:
        raise FailureException(str(e)) from e

    self.validation_reports: List[Mapping] = []
    self.skipped = 0
    self.subset_to_load = None

    self.total = None
    self.rows = 0
    self.errors = 0
    self.valid = 0
    self.invalid = 0
    self.created = 0
    self.updated = 0
    self.unchanged = 0
    self.files = 0

    if self.metadata_file.seekable():
        # get the row count of the file, then rewind the CSV file
        self.total = sum(1 for _ in self.csv_file)
        self._rewind_csv_file()
    else:
        # file is not seekable, so we can't get a row count in advance
        self.total = None

    if percentage is not None:
        if not self.metadata_file.seekable():
            raise FailureException('Cannot execute a percentage load using a non-seekable file')
        identifier_column = self.model_class.HEADER_MAP['identifier']
        identifiers = [
            row[identifier_column] for row in self.csv_file
            if row[identifier_column] not in job.completed_log
        ]
        self._rewind_csv_file()

        if len(identifiers) == 0:
            logger.info('No items remaining to load')
            self.subset_to_load = []
        else:
            target_count = int(((percentage / 100) * self.total))
            logger.info(f'Attempting to load {target_count} items ({percentage}% of {self.total})')
            if len(identifiers) > target_count:
                # evenly space the items to load among the remaining items
                step_size = int((100 * (1 - (len(job.completed_log) / self.total))) / percentage)
            else:
                # load all remaining items
                step_size = 1
            self.subset_to_load = identifiers[::step_size]
def __enter__(self):
    try:
        self.begin()
    except TransactionError as e:
        raise FailureException(f'Transaction failed: {e}')
    return self
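# Hedged usage sketch tying __enter__ and __exit__ together: the create/update/commit
# pattern used by the stub-import, mkcol, and import commands above. The repo and item
# objects and the '/objects' container path are placeholders, not values from the source.
with Transaction(repo) as txn:
    try:
        item.create(repo, container_path='/objects')
        item.update(repo)
        txn.commit()
    except RESTAPIException as e:
        # roll back on failure; __exit__ also stops the keep-alive thread
        logger.error(f'Item not created: {e}')
        txn.rollback()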