Example #1
def addFileToTransfer(
    filePathRelativeToSIP,
    fileUUID,
    transferUUID,
    taskUUID,
    date,
    sourceType="ingestion",
    eventDetail="",
    use="original",
    originalLocation=None,
):
    if not originalLocation:
        originalLocation = filePathRelativeToSIP
    file_obj = insertIntoFiles(
        fileUUID,
        filePathRelativeToSIP,
        date,
        transferUUID=transferUUID,
        use=use,
        originalLocation=originalLocation,
    )
    insertIntoEvents(
        fileUUID=fileUUID,
        eventType=sourceType,
        eventDateTime=date,
        eventDetail=eventDetail,
        eventOutcome="",
        eventOutcomeDetailNote="",
    )
    addAccessionEvent(fileUUID, transferUUID, date)
    return file_obj
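A minimal usage sketch (hypothetical UUIDs, path, and date; the kwargs mirror the signature above, and taskUUID is accepted but unused by the body shown):

import uuid
from datetime import datetime, timezone

file_obj = addFileToTransfer(
    filePathRelativeToSIP="%transferDirectory%objects/report.pdf",
    fileUUID=str(uuid.uuid4()),
    transferUUID="aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",  # hypothetical
    taskUUID=str(uuid.uuid4()),
    date=datetime.now(timezone.utc).isoformat(),
)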
Example #2
def insert_derivation_event(
    original_uuid,
    output_uuid,
    derivation_uuid,
    event_detail_output,
    outcome_detail_note,
    today=None,
):
    """Add the derivation link for preservation files and the event."""
    if today is None:
        today = timezone.now()
    # Add event information to current file
    databaseFunctions.insertIntoEvents(
        fileUUID=original_uuid,
        eventIdentifierUUID=derivation_uuid,
        eventType="normalization",
        eventDateTime=today,
        eventDetail=event_detail_output,
        eventOutcome="",
        eventOutcomeDetailNote=outcome_detail_note or "",
    )

    # Add linking information between files
    databaseFunctions.insertIntoDerivations(
        sourceFileUUID=original_uuid,
        derivedFileUUID=output_uuid,
        relatedEventUUID=derivation_uuid,
    )
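A minimal usage sketch (hypothetical UUIDs and detail strings; the derivation event UUID is generated up front so the Event row and the Derivation row can share it):

import uuid

insert_derivation_event(
    original_uuid="11111111-1111-1111-1111-111111111111",
    output_uuid="22222222-2222-2222-2222-222222222222",
    derivation_uuid=str(uuid.uuid4()),
    event_detail_output='program="ffmpeg"; version="4.2"',  # hypothetical tool string
    outcome_detail_note="objects/example-22222222.mkv",
)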
Example #3
 def test_insert_into_events(self):
     assert (Event.objects.filter(
         event_id="6a671050-81ec-11ea-b337-8f27e380aa54").count() == 0)
     databaseFunctions.insertIntoEvents(
         fileUUID="88c8f115-80bc-4da4-a1e6-0158f5df13b9",
         eventIdentifierUUID="6a671050-81ec-11ea-b337-8f27e380aa54",
     )
     assert (Event.objects.filter(
         event_id="6a671050-81ec-11ea-b337-8f27e380aa54").count() == 1)
Example #4
 def test_insert_into_event_fetches_correct_agent_from_file(self):
     databaseFunctions.insertIntoEvents(
         fileUUID="88c8f115-80bc-4da4-a1e6-0158f5df13b9",
         eventIdentifierUUID="00e46dbc-81ec-11ea-bf23-eb8a0da7ab13",
     )
     agents = Event.objects.get(
         event_id="00e46dbc-81ec-11ea-bf23-eb8a0da7ab13").agents
     assert agents.count() == 2
     assert agents.get(id=1)
     assert agents.get(id=2)
Example #5
def addAccessionEvent(fileUUID, transferUUID, date):
    transfer = Transfer.objects.get(uuid=transferUUID)
    if transfer.accessionid:
        eventOutcomeDetailNote = f"accession#{transfer.accessionid}"
        insertIntoEvents(
            fileUUID=fileUUID,
            eventType="registration",
            eventDateTime=date,
            eventDetail="",
            eventOutcome="",
            eventOutcomeDetailNote=eventOutcomeDetailNote,
        )
Example #6
def call(jobs):
    event_queue = []

    for job in jobs:
        with job.JobContext(logger=logger):
            if not mcpclient_settings.VIRUS_SCANNING_ENABLED:
                job.set_status(0)
                continue
            job.set_status(scan_file(event_queue, *job.args[1:]))

    with transaction.atomic():
        for e in event_queue:
            insertIntoEvents(**e)
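Queuing the kwargs and flushing them inside transaction.atomic() writes every event from the batch in a single transaction. A sketch of one queued entry (the exact fields scan_file appends are an assumption here; the keys mirror the insertIntoEvents kwargs used elsewhere on this page):

event_queue.append({
    "fileUUID": file_uuid,               # hypothetical file being scanned
    "eventType": "virus check",
    "eventDetail": 'program="ClamAV"',   # assumed tool string
    "eventOutcome": "Pass",
})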
Example #7
def write_premis_event(job, sip_uuid, checksum_type, event_outcome,
                       event_outcome_detail_note):
    """Write the AIP-level "fixity check" PREMIS event."""
    try:
        databaseFunctions.insertIntoEvents(
            fileUUID=sip_uuid,
            eventType="fixity check",
            eventDetail='program="python, bag"; module="hashlib.{}()"'.format(
                checksum_type),
            eventOutcome=event_outcome,
            eventOutcomeDetailNote=event_outcome_detail_note,
        )
    except Exception as err:
        job.pyprint(f"Failed to write PREMIS event to database. Error: {err}")
    else:
        return event_outcome_detail_note
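Because the except branch only prints and falls through, a failed insert returns None while success returns the note, so callers can tell the two apart. A minimal usage sketch (hypothetical values):

note = write_premis_event(job, sip_uuid, "sha256", "Pass",
                          "Bag verified successfully")
if note is None:
    job.pyprint("fixity event was not recorded")  # hypothetical handling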
Example #8
def insert_transcription_event(status, file_uuid, rule, relative_location):
    outcome = "transcribed" if status == 0 else "not transcribed"

    tool = rule.command.tool
    event_detail = 'program={}; version={}; command="{}"'.format(
        tool.description, tool.version, rule.command.command.replace('"', r"\"")
    )

    event_uuid = str(uuid4())

    databaseFunctions.insertIntoEvents(
        fileUUID=file_uuid,
        eventIdentifierUUID=event_uuid,
        eventType="transcription",
        eventDetail=event_detail,
        eventOutcome=outcome,
        eventOutcomeDetailNote=relative_location,
    )

    return event_uuid
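A minimal sketch with a stand-in rule object (real callers pass an FPR rule; the attribute paths below are exactly the ones the function reads, and the tool values are hypothetical):

from types import SimpleNamespace

rule = SimpleNamespace(
    command=SimpleNamespace(
        tool=SimpleNamespace(description="Tesseract", version="4.1.1"),
        command='tesseract "%fileFullName%" "%outputDirectory%"',
    )
)
event_uuid = insert_transcription_event(
    status=0,  # 0 means "transcribed"
    file_uuid="55555555-5555-5555-5555-555555555555",
    rule=rule,
    relative_location="objects/example.txt",
)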
Example #9
def addFileToSIP(
    filePathRelativeToSIP,
    fileUUID,
    sipUUID,
    taskUUID,
    date,
    sourceType="ingestion",
    use="original",
):
    insertIntoFiles(fileUUID,
                    filePathRelativeToSIP,
                    date,
                    sipUUID=sipUUID,
                    use=use)
    insertIntoEvents(
        fileUUID=fileUUID,
        eventType=sourceType,
        eventDateTime=date,
        eventDetail="",
        eventOutcome="",
        eventOutcomeDetailNote="",
    )
Example #10
def write_identification_event(file_uuid, puid=None, success=True):
    event_detail_text = 'program="{}"; version="{}"'.format(
        TOOL_DESCRIPTION, TOOL_VERSION
    )
    if success:
        event_outcome_text = "Positive"
    else:
        event_outcome_text = "Not identified"

    if not puid or puid == "UNKNOWN":
        puid = "No Matching Format"

    date = getUTCDate()

    insertIntoEvents(
        fileUUID=file_uuid,
        eventIdentifierUUID=str(uuid.uuid4()),
        eventType="format identification",
        eventDateTime=date,
        eventDetail=event_detail_text,
        eventOutcome=event_outcome_text,
        eventOutcomeDetailNote=puid,
    )
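A minimal usage sketch (hypothetical UUID and PUID; with success=False the outcome is recorded as "Not identified", and a missing or UNKNOWN puid falls back to "No Matching Format"):

write_identification_event("44444444-4444-4444-4444-444444444444",
                           puid="fmt/19")
write_identification_event("44444444-4444-4444-4444-444444444444",
                           success=False)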
Example #11
def updateSizeAndChecksum(
    fileUUID,
    filePath,
    date,
    eventIdentifierUUID,
    fileSize=None,
    checksum=None,
    checksumType=None,
    add_event=True,
):
    """
    Update a File with its size, checksum and checksum type. These are
    parameters that can be either generated or provided via keywords.

    Finally, insert the corresponding Event. This behavior can be cancelled
    using the boolean keyword 'add_event'.
    """
    if not fileSize:
        fileSize = os.path.getsize(filePath)
    if not checksumType:
        checksumType = django_settings.DEFAULT_CHECKSUM_ALGORITHM
    if not checksum:
        checksum = get_file_checksum(filePath, checksumType)

    File.objects.filter(uuid=fileUUID).update(size=fileSize,
                                              checksum=checksum,
                                              checksumtype=checksumType)

    if add_event:
        insertIntoEvents(
            fileUUID=fileUUID,
            eventType="message digest calculation",
            eventDateTime=date,
            eventDetail=f'program="python"; module="hashlib.{checksumType}()"',
            eventOutcomeDetailNote=checksum,
        )
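A minimal usage sketch (hypothetical values; add_event=False updates the File row but skips the companion "message digest calculation" event):

import uuid

updateSizeAndChecksum(
    fileUUID="33333333-3333-3333-3333-333333333333",
    filePath="/tmp/objects/report.pdf",  # hypothetical local path
    date=getUTCDate(),  # assumes the same helper used in Example #10
    eventIdentifierUUID=str(uuid.uuid4()),
    add_event=False,
)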
Example #12
def compress_aip(job, compression, compression_level, sip_directory, sip_name,
                 sip_uuid):
    """Compresses AIP according to compression algorithm and level.
    compression = AIP compression algorithm, format: <program>-<algorithm>, e.g. 7z-lzma, pbzip2-
    compression_level = AIP compression level, integer between 1 and 9 inclusive
    sip_directory = Absolute path to the directory where the SIP is
    sip_name = User-provided name of the SIP
    sip_uuid = SIP UUID

    Example inputs:
    compressAIP.py
        7z-lzma
        5
        %sharedDirectory%/watchedDirectories/workFlowDecisions/compressionAIPDecisions/ep-d87d5845-bd07-4200-b1a4-928e0cb6e1e4/
        ep
        d87d5845-bd07-4200-b1a4-928e0cb6e1e4
    """
    if compression_level == "0":
        compression_level = "1"

    # Default is uncompressed.
    compression = int(compression)
    ProcessingConfig.AIPCompressionAlgorithm.Name(compression)
    if compression == ProcessingConfig.AIP_COMPRESSION_ALGORITHM_UNSPECIFIED:
        compression = ProcessingConfig.AIP_COMPRESSION_ALGORITHM_UNCOMPRESSED

    # Translation to make compress_aip happy.
    mapping = {
        ProcessingConfig.AIP_COMPRESSION_ALGORITHM_UNCOMPRESSED: ("None", ""),
        ProcessingConfig.AIP_COMPRESSION_ALGORITHM_TAR: (
            "gzip",
            "tar.gzip",
        ),  # A3M-TODO: support
        ProcessingConfig.AIP_COMPRESSION_ALGORITHM_TAR_BZIP2:
        ("pbzip2", "pbzip2"),
        ProcessingConfig.AIP_COMPRESSION_ALGORITHM_TAR_GZIP:
        ("gzip", "tar.gzip"),
        ProcessingConfig.AIP_COMPRESSION_ALGORITHM_S7_COPY: ("7z", "copy"),
        ProcessingConfig.AIP_COMPRESSION_ALGORITHM_S7_BZIP2: ("7z", "bzip2"),
        ProcessingConfig.AIP_COMPRESSION_ALGORITHM_S7_LZMA: ("7z", "lzma"),
    }

    try:
        program, compression_algorithm = mapping[compression]
    except KeyError:
        msg = f"Invalid program-compression algorithm: {compression}"
        job.pyprint(msg, file=sys.stderr)
        return 255

    archive_path = f"{sip_name}-{sip_uuid}"
    uncompressed_location = sip_directory + archive_path

    # Even though no actual compression is taking place,
    # the location still needs to be set in the unit to ensure that the
    # %AIPFilename% variable is set appropriately.
    # Setting it to an empty string ensures the common
    # "%SIPDirectory%%AIPFilename%" pattern still points at the right thing.
    if program == "None":
        update_unit(sip_uuid, uncompressed_location)
        return 0

    job.pyprint("Compressing {} with {}, algorithm {}, level {}".format(
        uncompressed_location, program, compression_algorithm,
        compression_level))

    if program == "7z":
        compressed_location = uncompressed_location + ".7z"
        command = '/usr/bin/7z a -bd -t7z -y -m0={algorithm} -mx={level} -mta=on -mtc=on -mtm=on -mmt=on "{compressed_location}" "{uncompressed_location}"'.format(
            algorithm=compression_algorithm,
            level=compression_level,
            uncompressed_location=uncompressed_location,
            compressed_location=compressed_location,
        )
        tool_info_command = (
            r'echo program="7z"\; '
            r'algorithm="{}"\; '
            'version="`7z | grep Version`"'.format(compression_algorithm))
    elif program == "pbzip2":
        compressed_location = uncompressed_location + ".tar.bz2"
        command = '/bin/tar -c --directory "{sip_directory}" "{archive_path}" | /usr/bin/pbzip2 --compress -{level} > "{compressed_location}"'.format(
            sip_directory=sip_directory,
            archive_path=archive_path,
            level=compression_level,
            compressed_location=compressed_location,
        )
        tool_info_command = (
            r'echo program="pbzip2"\; '
            r'algorithm="{}"\; '
            'version="$((pbzip2 -V) 2>&1)"'.format(compression_algorithm))
    elif program == "gzip":
        compressed_location = uncompressed_location + ".tar.gz"
        command = '/bin/tar -c --directory "{sip_directory}" "{archive_path}" | /bin/gzip -{level} > "{compressed_location}"'.format(
            sip_directory=sip_directory,
            archive_path=archive_path,
            level=compression_level,
            compressed_location=compressed_location,
        )
        tool_info_command = (
            r'echo program="gzip"\; '
            r'algorithm="{}"\; '
            'version="$((gzip -V) 2>&1)"'.format(compression_algorithm))
    else:
        msg = f"Program {program} not recognized, exiting script prematurely."
        job.pyprint(msg, file=sys.stderr)
        return 255

    job.pyprint("Executing command:", command)
    exit_code, std_out, std_err = executeOrRun("bashScript",
                                               command,
                                               capture_output=True)
    job.write_output(std_out)
    job.write_error(std_err)

    # Add new AIP File
    file_uuid = sip_uuid
    databaseFunctions.insertIntoFiles(
        fileUUID=file_uuid,
        filePath=compressed_location.replace(sip_directory, "%SIPDirectory%",
                                             1),
        sipUUID=sip_uuid,
        use="aip",
    )

    # Add compression event
    job.pyprint("Tool info command:", tool_info_command)
    _, tool_info, tool_info_err = executeOrRun("bashScript",
                                               tool_info_command,
                                               capture_output=True)
    job.write_output(tool_info)
    job.write_error(tool_info_err)
    tool_output = f'Standard Output="{std_out}"; Standard Error="{std_err}"'
    databaseFunctions.insertIntoEvents(
        eventType="compression",
        eventDetail=tool_info,
        eventOutcomeDetailNote=tool_output,
        fileUUID=file_uuid,
    )

    update_unit(sip_uuid, compressed_location)

    return exit_code
Example #13
 def _execute_rule_command(self, rule):
     """Execute the FPR command of FPR rule ``rule`` against the file passed
     in to this client script. The output of that command determines what we
     print to stdout and stderr, and the nature of the validation event that
     we save to the db. We also copy the MediaConch policy file to the logs/
     directory of the AIP if it has not already been copied there.
     """
     result = "passed"
     command_to_execute, args = self._get_command_to_execute(rule)
     self.job.pyprint("Running", rule.command.description)
     exitstatus, stdout, stderr = executeOrRun(
         rule.command.script_type,
         command_to_execute,
         arguments=args,
         printing=False,
         capture_output=True,
     )
     try:
         output = json.loads(stdout)
     except ValueError:
         logger.exception(
             "Unable to load an object from the malformed JSON: \n%s", stderr
         )
         raise
     if self.file_type in ("preservation", "original"):
         self._save_to_logs_dir(output)
     if exitstatus == 0:
         self.job.pyprint(
             "Command {} completed with output {}".format(
                 rule.command.description, stdout
             )
         )
     else:
         self.job.print_error(
             "Command {} failed with exit status {}; stderr:".format(
                 rule.command.description, exitstatus
             ),
             stderr,
         )
         return "failed"
     event_detail = (
         'program="{tool.description}";'
         ' version="{tool.version}"'.format(tool=rule.command.tool)
     )
     if output.get("eventOutcomeInformation") != "pass":
         self.job.print_error(
             "Command {descr} returned a non-pass outcome "
             "for the policy check;\n\noutcome: "
             "{outcome}\n\ndetails: {details}.".format(
                 descr=rule.command.description,
                 outcome=output.get("eventOutcomeInformation"),
                 details=output.get("eventOutcomeDetailNote"),
             )
         )
         result = "failed"
     self.job.pyprint(
         "Creating policy checking event for {} ({})".format(
             self.file_path, self.file_uuid
         )
     )
     # Manually-normalized access derivatives have no file UUID so we can't
     # create a validation event for them. TODO/QUESTION: should we use the
     # UUID that was assigned to the manually normalized derivative during
     # transfer, i.e., the one that we retrieve in
     # ``_get_manually_normalized_access_derivative_file_uuid`` above?
     if not self.is_manually_normalized_access_derivative:
         databaseFunctions.insertIntoEvents(
             fileUUID=self.file_uuid,
             eventType="validation",  # From PREMIS controlled vocab.
             eventDetail=event_detail,
             eventOutcome=output.get("eventOutcomeInformation"),
             eventOutcomeDetailNote=output.get("eventOutcomeDetailNote"),
         )
     return result
Example #14
def main(job):
    # "%SIPUUID%" "%SIPName%" "%SIPDirectory%" "%fileUUID%" "%filePath%"
    # job.args[2] (SIPName) is unused.
    SIPUUID = job.args[1]
    SIPDirectory = job.args[3]
    fileUUID = job.args[4]
    filePath = job.args[5]
    date = job.args[6]

    # Search for original file associated with preservation file given in filePath
    filePathLike = filePath.replace(
        os.path.join(SIPDirectory, "objects", "manualNormalization",
                     "preservation"),
        "%SIPDirectory%objects",
        1,
    )
    i = filePathLike.rfind(".")
    k = os.path.basename(filePath).rfind(".")
    # Fall back to the full path so both names below are always bound, even
    # when the original or the preservation file has no extension.
    filePathLike1 = filePathLike
    filePathLike2 = filePathLike
    if i != -1 and k != -1:
        # Matches "path/to/file/filename." The trailing dot prevents a false
        # match on foobar.txt when foo.txt was wanted.
        filePathLike1 = filePathLike[:i + 1]
        # Matches the exact filename, for files with no extension.
        filePathLike2 = filePathLike1[:-1]

    try:
        path_condition = Q(currentlocation__startswith=filePathLike1) | Q(
            currentlocation=filePathLike2)
        original_file = File.objects.get(
            path_condition,
            removedtime__isnull=True,
            filegrpuse="original",
            sip_id=SIPUUID,
        )
    except (File.DoesNotExist, File.MultipleObjectsReturned) as e:
        # Original file was not found, or there is more than one original file with
        # the same filename (differing extensions)
        # Look for a CSV that will specify the mapping
        csv_path = os.path.join(SIPDirectory, "objects", "manualNormalization",
                                "normalization.csv")
        if os.path.isfile(csv_path):
            try:
                preservation_file = filePath[
                    filePath.index("manualNormalization/preservation/"):]
            except ValueError:
                job.print_error(
                    f"{filePath} not in manualNormalization directory")
                return 4
            original = fileOperations.findFileInNormalizationCSV(
                csv_path,
                "preservation",
                preservation_file,
                SIPUUID,
                printfn=job.pyprint,
            )
            if original is None:
                if isinstance(e, File.DoesNotExist):
                    job.print_error("No matching file for: {}".format(
                        filePath.replace(SIPDirectory, "%SIPDirectory%")))
                    return 3
                else:
                    job.print_error(
                        "Could not find {preservation_file} in {filename}".
                        format(preservation_file=preservation_file,
                               filename=csv_path))
                    return 2
            # If we found the original file, retrieve it from the DB
            original_file = File.objects.get(
                removedtime__isnull=True,
                filegrpuse="original",
                originallocation__endswith=original,
                sip_id=SIPUUID,
            )
        else:
            if isinstance(e, File.DoesNotExist):
                job.print_error(
                    "No matching file for: ",
                    filePath.replace(SIPDirectory, "%SIPDirectory%", 1),
                )
                return 3
            elif isinstance(e, File.MultipleObjectsReturned):
                job.print_error(
                    "Too many possible files for: ",
                    filePath.replace(SIPDirectory, "%SIPDirectory%", 1),
                )
                return 2

    # We found the original file somewhere above
    job.print_output(
        "Matched original file %s (%s) to  preservation file %s (%s)" %
        (original_file.currentlocation, original_file.uuid, filePath,
         fileUUID))
    # Generate the new preservation path: path/to/original/filename-uuid.ext
    basename = os.path.basename(filePath)
    i = basename.rfind(".")
    dstFile = basename[:i] + "-" + fileUUID + basename[i:]
    dstDir = os.path.dirname(
        original_file.currentlocation.replace("%SIPDirectory%", SIPDirectory,
                                              1))
    dst = os.path.join(dstDir, dstFile)
    dstR = dst.replace(SIPDirectory, "%SIPDirectory%", 1)

    if os.path.exists(dst):
        job.print_error("already exists:", dstR)
        return 2

    # Rename the preservation file
    job.print_output("Renaming preservation file", filePath, "to", dst)
    os.rename(filePath, dst)
    # Update the preservation file's location
    File.objects.filter(uuid=fileUUID).update(currentlocation=dstR)

    try:
        # Normalization event already exists, so just update it
        # fileUUID, eventIdentifierUUID, eventType, eventDateTime, eventDetail
        # probably already correct, and we only set eventOutcomeDetailNote here
        # Not using .filter().update() because that doesn't generate an exception
        event = Event.objects.get(event_type="normalization",
                                  file_uuid=original_file)
        event.event_outcome_detail = dstR
        event.save()
        job.print_output(
            "Updated the eventOutcomeDetailNote of an existing normalization"
            " Event for file {}. Not creating a Derivation object".format(
                fileUUID))
    except Event.DoesNotExist:
        # No normalization event was created in normalize.py - probably manually
        # normalized during Ingest
        derivationEventUUID = str(uuid.uuid4())
        databaseFunctions.insertIntoEvents(
            fileUUID=original_file.uuid,
            eventIdentifierUUID=derivationEventUUID,
            eventType="normalization",
            eventDateTime=date,
            eventDetail="manual normalization",
            eventOutcome="",
            eventOutcomeDetailNote=dstR,
        )
        job.print_output(
            "Created a manual normalization Event for file {}.".format(
                original_file.uuid))

        # Add linking information between files
        # Assuming that if an event already exists, then the derivation does as well
        databaseFunctions.insertIntoDerivations(
            sourceFileUUID=original_file.uuid,
            derivedFileUUID=fileUUID,
            relatedEventUUID=derivationEventUUID,
        )
        job.print_output(
            "Created a Derivation for original file {}, derived file {}, and"
            " event {}".format(original_file.uuid, fileUUID,
                               derivationEventUUID))

    return 0
Example #15
 def _execute_rule_command(self, rule):
     """Run the command against the file and return either 'passed' or
     'failed'. If the command errors or determines that the file is invalid,
     return 'failed'. Non-errors will result in the creation of an Event
     model in the db. Preservation derivative validation will result in the
     stdout from the command being saved to disk within the unit (i.e., SIP).
     """
     result = "passed"
     if rule.command.script_type in ("bashScript", "command"):
         command_to_execute = replace_string_values(
             rule.command.command,
             file_=self.file_uuid,
             sip=self.sip_uuid,
             type_="file",
         )
         args = []
     else:
         command_to_execute = rule.command.command
         args = [self.file_path]
     self.job.print_output("Running", rule.command.description)
     exitstatus, stdout, stderr = executeOrRun(
         type=rule.command.script_type,
         text=command_to_execute,
         printing=False,
         arguments=args,
     )
     if exitstatus != 0:
         self.job.print_error(
             "Command {description} failed with exit status {status};"
             " stderr:".format(description=rule.command.description,
                               status=exitstatus))
         return "failed"
     # Parse output and generate an Event
     # TODO: Evaluating a python string from a user-definable script seems
     # insecure practice; should be JSON.
     output = ast.literal_eval(stdout)
     event_detail = ('program="{tool.description}";'
                     ' version="{tool.version}"'.format(
                         tool=rule.command.tool))
     # If the FPR command has not errored but the actual validation
     # determined that the file is not valid, then we want to both create a
     # validation event in the db and set ``failed`` to ``True`` because we
     # want the micro-service in the dashboard GUI to indicate "Failed".
     # NOTE: this requires that the stdout of all validation FPR commands be
     # a dict (preferably a JSON object) with an ``eventOutcomeInformation``
     # boolean attribute.
     if output.get("eventOutcomeInformation") == "pass":
         self.job.print_output(
             f'Command "{rule.command.description}" was successful')
     elif output.get("eventOutcomeInformation") == "partial pass":
         self.job.print_output(
             f'Command "{rule.command.description}" was partially successful'
         )
     else:
         self.job.pyprint(
             "Command {cmd_description} indicated failure with this"
             " output:\n\n{output}".format(
                 cmd_description=rule.command.description,
                 output=pformat(stdout)),
             file=sys.stderr,
         )
         result = "failed"
     if self.file_type == "preservation":
         self._save_stdout_to_logs_dir(output)
     self.job.print_output(
         "Creating {purpose} event for {file_path} ({file_uuid})".format(
             purpose=self.purpose,
             file_path=self.file_path,
             file_uuid=self.file_uuid))
     databaseFunctions.insertIntoEvents(
         fileUUID=self.file_uuid,
         eventType="validation",  # From PREMIS controlled vocab.
         eventDetail=event_detail,
         eventOutcome=output.get("eventOutcomeInformation"),
         eventOutcomeDetailNote=output.get("eventOutcomeDetailNote"),
     )
     return result