def main(job, file_path, file_uuid, sip_uuid, shared_path, file_type):
    """Run the policy checker for one file and return its result.

    The replacement dictionaries are initialised from ``mcpclient_settings``
    first; every argument is then forwarded, unchanged and in order, to
    ``PolicyChecker``, and whatever its ``check()`` returns is returned.
    """
    setup_dicts(mcpclient_settings)
    checker_args = (job, file_path, file_uuid, sip_uuid, shared_path, file_type)
    return PolicyChecker(*checker_args).check()
def main(task_uuid, file_uuid):
    """Run every transcription rule that applies to a file.

    Only files whose ``filegrpuse`` is ``"original"`` are processed. If no
    rules exist for the original, its derivatives are consulted via
    ``fetch_rules_for_derivatives`` and the first usable one is transcribed
    instead.

    :param task_uuid: not used by this function.
    :param file_uuid: UUID of the ``File`` row to transcribe.
    :returns: 0 when nothing needed transcribing or every command exited
        with status 0; 1 when at least one command exited non-zero.
    """
    setup_dicts(mcpclient_settings)

    succeeded = True

    file_ = File.objects.get(uuid=file_uuid)

    # Normally we don't transcribe derivatives (access copies, preservation
    # copies); however, some useful transcription tools can't handle some
    # formats that are common as the primary copies. For example, tesseract
    # can't handle JPEG2000. If there are no rules for the primary format
    # passed in, try to look at each derivative until a transcribable
    # derivative is found.
    #
    # Skip derivatives to avoid double-scanning them; only look at them as a
    # fallback.
    if file_.filegrpuse != "original":
        print('{} is not an original; not transcribing'.format(file_uuid), file=sys.stderr)
        return 0

    rules = fetch_rules_for(file_)
    if not rules:
        file_, rules = fetch_rules_for_derivatives(file_)

    if not rules:
        print('No rules found for file {} and its derivatives; not transcribing'.format(file_uuid), file=sys.stderr)
        return 0

    if file_.filegrpuse == "original":
        noun = "original"
    else:
        noun = file_.filegrpuse + " derivative"
    print('Transcribing {} {}'.format(noun, file_.uuid), file=sys.stderr)

    rd = ReplacementDict.frommodel(file_=file_, type_='file')

    for rule in rules:
        script = rule.command.command
        if rule.command.script_type in ('bashScript', 'command'):
            # Shell-style commands get their placeholders substituted inline
            # and take no separate argument list.
            script, = rd.replace(script)
            args = []
        else:
            # to_gnu_options is a method: it must be *called* so that the
            # option list, not the bound method object, is passed along.
            args = rd.to_gnu_options()

        exitstatus, stdout, stderr = executeOrRun(rule.command.script_type, script, arguments=args)
        if exitstatus != 0:
            succeeded = False

        output_path = rd.replace(rule.command.output_location)[0]
        # Store the path relative to the SIP by restoring the placeholder.
        relative_path = output_path.replace(rd['%SIPDirectory%'], '%SIPDirectory%')
        # The event is recorded whether or not the command succeeded.
        event = insert_transcription_event(exitstatus, file_uuid, rule, relative_path)

        # Only register the output as a file if the command produced it.
        if os.path.isfile(output_path):
            insert_file_into_database(file_uuid, rd['%SIPUUID%'], event, rule, output_path, relative_path)

    return 0 if succeeded else 1
def main(file_path, file_uuid, sip_uuid, shared_path, file_type):
    """Build a ``PolicyChecker`` for the given file and return its ``check()`` result.

    The replacement dictionaries are set up from ``mcpclient_settings``
    before the checker is constructed.
    """
    setup_dicts(mcpclient_settings)
    return PolicyChecker(
        file_path,
        file_uuid,
        sip_uuid,
        shared_path,
        file_type,
    ).check()
def main(job, file_path, file_uuid, sip_uuid):
    """Run the characterization commands for a file.

    Returns 0 if the file already has stored command output or every
    command succeeded, and 255 if any command exited non-zero or produced
    invalid XML where XML was expected.
    """
    setup_dicts(mcpclient_settings)

    # A file that already has stored output has been characterized before;
    # do not characterize it a second time.
    if FPCommandOutput.objects.filter(file_id=file_uuid).count() > 0:
        return 0

    try:
        format_version = FormatVersion.active.get(
            fileformatversion__file_uuid=file_uuid)
    except FormatVersion.DoesNotExist:
        format_version = None
        rules = None

    if format_version:
        rules = FPRule.active.filter(
            format=format_version.uuid, purpose="characterization")

    # Characterization always occurs: with no format-specific rules, fall
    # back to the default(s) specified in the FPR.
    if not rules:
        rules = FPRule.active.filter(purpose="default_characterization")

    failed = False
    for rule in rules:
        if rule.command.script_type in ("bashScript", "command"):
            args = []
            command_to_execute = replace_string_values(
                rule.command.command,
                file_=file_uuid,
                sip=sip_uuid,
                type_="file",
            )
        else:
            rd = ReplacementDict.frommodel(
                file_=file_uuid, sip=sip_uuid, type_="file")
            args = rd.to_gnu_options()
            command_to_execute = rule.command.command

        exitstatus, stdout, stderr = executeOrRun(
            rule.command.script_type,
            command_to_execute,
            arguments=args,
            capture_output=True,
        )
        job.write_output(stdout)
        job.write_error(stderr)

        if exitstatus != 0:
            failure_message = "Command {} failed with exit status {}; stderr:".format(
                rule.command.description, exitstatus)
            job.write_error(failure_message)
            failed = True
            continue

        # fmt/101 is XML: collect and store any XML output, while letting
        # other commands run without their output being collected (they may
        # be writing it to disk themselves). FPCommandOutput can hold
        # several rows per file, distinguished by the producing rule.
        produces_xml = (rule.command.output_format
                        and rule.command.output_format.pronom_id == "fmt/101")
        if not produces_xml:
            job.write_error(
                'Tool output for command "{}" ({}) is not XML; not saving to database'
                .format(rule.command.description, rule.command.uuid))
            continue

        try:
            etree.fromstring(stdout)
            insertIntoFPCommandOutput(file_uuid, stdout, rule.uuid)
            job.write_output(
                'Saved XML output for command "{}" ({})'.format(
                    rule.command.description, rule.command.uuid))
        except etree.XMLSyntaxError:
            failed = True
            job.write_error(
                'XML output for command "{}" ({}) was not valid XML; not saving to database'
                .format(rule.command.description, rule.command.uuid))

    return 255 if failed else 0
def main(job, file_path, file_uuid, sip_uuid, shared_path, file_type):
    """Validate one file and return the result of ``Validator.validate()``.

    The replacement dictionaries are initialised from ``mcpclient_settings``
    before the validator is built from the arguments, passed through as-is.
    """
    setup_dicts(mcpclient_settings)
    validator_args = (job, file_path, file_uuid, sip_uuid, shared_path, file_type)
    return Validator(*validator_args).validate()
def main(opts):
    """Find and execute normalization commands on input file.

    Reads ``file_uuid``, ``file_path``, ``normalize_file_grp_use``,
    ``purpose`` and ``sip_uuid`` from ``opts``.

    Steps, in order:

    1. Look up the ``File`` row; bail out if it does not exist.
    2. Skip files under ``objects/submissionDocumentation`` (unless that is
       the group being normalized) and files whose group use differs from
       ``opts.normalize_file_grp_use``.
    3. Mark existing derivatives for this purpose as deleted and remove
       their ``Derivation`` rows.
    4. Skip files that were manually normalized.
    5. Pick the format-specific rule, or the default rule for the purpose.
    6. Execute the command; for access/thumbnail purposes, retry with the
       default rule when the command failed and produced no output file.
    7. For thumbnail purposes, copy the result into the shared thumbnail
       directory.

    :returns: ``NO_RULE_FOUND`` when the file is missing from the database
        or no rule/default rule exists; ``SUCCESS`` when the file was
        skipped or normalized; ``RULE_FAILED`` when the last executed
        command exited non-zero.
    """
    # TODO fix for maildir working only on attachments
    setup_dicts(mcpclient_settings)

    # Find the file and its FormatVersion (file identification)
    try:
        file_ = File.objects.get(uuid=opts.file_uuid)
    except File.DoesNotExist:
        print('File with uuid', opts.file_uuid, 'does not exist in database.', file=sys.stderr)
        return NO_RULE_FOUND
    print('File found:', file_.uuid, file_.currentlocation)

    # Unless normalization file group use is submissionDocumentation, skip the
    # submissionDocumentation directory
    if opts.normalize_file_grp_use != "submissionDocumentation" and file_.currentlocation.startswith(
            '%SIPDirectory%objects/submissionDocumentation'):
        print('File', os.path.basename(opts.file_path), 'in objects/submissionDocumentation, skipping')
        return SUCCESS

    # Only normalize files where the file's group use and normalize group use match
    if file_.filegrpuse != opts.normalize_file_grp_use:
        print(os.path.basename(opts.file_path), 'is file group usage', file_.filegrpuse, 'instead of ', opts.normalize_file_grp_use, ' - skipping')
        return SUCCESS

    # For re-ingest: clean up old derivations
    # If the file already has a Derivation with the same purpose, remove it and mark the derived file as deleted
    derivatives = Derivation.objects.filter(
        source_file=file_, derived_file__filegrpuse=opts.purpose)
    for derivative in derivatives:
        print(opts.purpose, 'derivative', derivative.derived_file_id, 'already exists, marking as deleted')
        File.objects.filter(uuid=derivative.derived_file_id).update(
            filegrpuse='deleted')
        # Don't create events for thumbnail files
        if opts.purpose != 'thumbnail':
            databaseFunctions.insertIntoEvents(
                fileUUID=derivative.derived_file_id, eventType='deletion', )
    # Remove the Derivation rows themselves once every derived file has been
    # flagged.
    derivatives.delete()

    # If a file has been manually normalized for this purpose, skip it
    manually_normalized_file = check_manual_normalization(opts)
    if manually_normalized_file:
        print(os.path.basename(opts.file_path), 'was already manually normalized into', manually_normalized_file.currentlocation)
        if 'preservation' in opts.purpose:
            # Add derivation link and associated event
            insert_derivation_event(
                original_uuid=opts.file_uuid,
                output_uuid=manually_normalized_file.uuid,
                derivation_uuid=str(uuid.uuid4()),
                event_detail_output="manual normalization",
                outcome_detail_note=None, )
        return SUCCESS

    # do_fallback records that the file was identified but has no rule for
    # this purpose, so the default rule must be tried instead.
    do_fallback = False
    format_id = get_object_or_None(FileFormatVersion, file_uuid=opts.file_uuid)

    # Look up the normalization command in the FPR
    if format_id:
        print('File format:', format_id.format_version)
        try:
            rule = FPRule.active.get(format=format_id.format_version, purpose=opts.purpose)
        except FPRule.DoesNotExist:
            do_fallback = True

    # Try with default rule if no format_id or rule was found
    if format_id is None or do_fallback:
        try:
            rule = get_default_rule(opts.purpose)
            print(os.path.basename(file_.currentlocation), "not identified or without rule", "- Falling back to default", opts.purpose, "rule")
        except FPRule.DoesNotExist:
            print('Not normalizing', os.path.basename(file_.currentlocation), ' - No rule or default rule found to normalize for', opts.purpose)
            return NO_RULE_FOUND

    print('Format Policy Rule:', rule)
    command = rule.command
    print('Format Policy Command', command.description)

    replacement_dict = get_replacement_dict(opts)
    cl = transcoder.CommandLinker(rule, command, replacement_dict, opts, once_normalized)
    exitstatus = cl.execute()

    # If the access/thumbnail normalization command has errored AND a
    # derivative was NOT created, then we run the default access/thumbnail
    # rule. Note that we DO need to check if the derivative file exists. Even
    # when a verification command exists for the normalization command, the
    # transcoder.py::Command.execute method will only run the verification
    # command if the normalization command returns a 0 exit code.
    # Errored thumbnail normalization also needs to result in default thumbnail
    # normalization; if not, then a transfer with a single file that failed
    # thumbnail normalization will result in a failed SIP at "Prepare DIP: Copy
    # thumbnails to DIP directory"
    if (exitstatus != 0 and opts.purpose in ('access', 'thumbnail') and cl.commandObject.output_location and (not os.path.isfile(cl.commandObject.output_location))):
        # Fall back to default rule
        try:
            fallback_rule = get_default_rule(opts.purpose)
            print(opts.purpose, 'normalization failed, falling back to default', opts.purpose, 'rule')
        except FPRule.DoesNotExist:
            print('Not retrying normalizing for', os.path.basename(file_.currentlocation), ' - No default rule found to normalize for', opts.purpose)
            fallback_rule = None
        # Don't re-run the same command
        if fallback_rule and fallback_rule.command != command:
            print('Fallback Format Policy Rule:', fallback_rule)
            command = fallback_rule.command
            print('Fallback Format Policy Command', command.description)
            # Use existing replacement dict
            # cl and exitstatus are rebound, so everything below reports on
            # the fallback run rather than the first attempt.
            cl = transcoder.CommandLinker(fallback_rule, command, replacement_dict, opts, once_normalized)
            exitstatus = cl.execute()

    # Store thumbnails locally for use during AIP searches
    # TODO is this still needed, with the storage service?
    if 'thumbnail' in opts.purpose:
        thumbnail_filepath = cl.commandObject.output_location
        thumbnail_storage_dir = os.path.join(
            mcpclient_settings.SHARED_DIRECTORY, 'www', 'thumbnails', opts.sip_uuid, )
        # Create the per-SIP thumbnail directory, tolerating the case where it
        # already exists as a directory; any other OSError is re-raised.
        try:
            os.makedirs(thumbnail_storage_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(
                    thumbnail_storage_dir):
                pass
            else:
                raise
        # Only the extension is used: the stored copy is named after the file
        # UUID.
        thumbnail_basename, thumbnail_extension = os.path.splitext(
            thumbnail_filepath)
        thumbnail_storage_file = os.path.join(
            thumbnail_storage_dir, opts.file_uuid + thumbnail_extension, )
        # NOTE(review): this copy runs regardless of exitstatus; copyfile
        # raises if thumbnail_filepath does not exist — confirm that is the
        # intended behaviour when both the rule and the fallback fail.
        shutil.copyfile(thumbnail_filepath, thumbnail_storage_file)

    if not exitstatus == 0:
        print('Command', command.description, 'failed!', file=sys.stderr)
        return RULE_FAILED
    else:
        print('Successfully normalized ', os.path.basename(opts.file_path), 'for', opts.purpose)
        return SUCCESS