Example #1
0
def insert_derivation_event(original_uuid,
                            output_uuid,
                            derivation_uuid,
                            event_detail_output,
                            outcome_detail_note,
                            today=None):
    """ Add the derivation link for preservation files and the event. """
    # Default the event timestamp to "now" when the caller didn't supply one.
    event_time = timezone.now() if today is None else today

    # Record the normalization event against the original file.
    databaseFunctions.insertIntoEvents(
        fileUUID=original_uuid,
        eventIdentifierUUID=derivation_uuid,
        eventType="normalization",
        eventDateTime=event_time,
        eventDetail=event_detail_output,
        eventOutcome="",
        eventOutcomeDetailNote=outcome_detail_note or "",
    )

    # Link the original file to its derivative through that event.
    databaseFunctions.insertIntoDerivations(
        sourceFileUUID=original_uuid,
        derivedFileUUID=output_uuid,
        relatedEventUUID=derivation_uuid,
    )
Example #2
0
def main(job, shared_path, file_uuid, file_path, date, event_uuid):
    """Update size and checksum for a file, reusing METS data for AIP reingests."""
    try:
        file_ = File.objects.get(uuid=file_uuid)
    except File.DoesNotExist:
        logger.exception("File with UUID %s cannot be found.", file_uuid)
        return 1

    # For an Archivematica AIP transfer (not yet part of a SIP), pull the
    # size, checksum and checksum function out of the original METS
    # document instead of recomputing them, and skip the new event.
    kw = {}
    is_aip_transfer = (
        file_.transfer
        and not file_.sip
        and file_.transfer.type == "Archivematica AIP"
    )
    if is_aip_transfer:
        info = get_file_info_from_mets(job, shared_path, file_)
        kw["fileSize"] = info["file_size"]
        kw["checksum"] = info["checksum"]
        kw["checksumType"] = info["checksum_type"]
        kw["add_event"] = False
        if info.get("derivation"):
            insertIntoDerivations(
                sourceFileUUID=file_uuid,
                derivedFileUUID=info["derivation"],
            )
        if info.get("format_version"):
            FileFormatVersion.objects.create(
                file_uuid_id=file_uuid,
                format_version=info["format_version"],
            )

    updateSizeAndChecksum(file_uuid, file_path, date, event_uuid, **kw)

    return 0
def insert_file_into_database(file_uuid, sip_uuid, event_uuid, rule, output_path, relative_path):
    """Register an OCR transcription file and link it to its source file.

    Adds the transcription file to the SIP, records its size and checksum,
    and inserts a Derivation row connecting the source file to the new
    transcription via the supplied event.

    Note: ``rule`` is accepted for interface compatibility but is unused here.
    """
    transcription_uuid = str(uuid4())
    today = timezone.now()
    fileOperations.addFileToSIP(
        relative_path,
        transcription_uuid,
        sip_uuid,
        str(uuid4()),  # BUGFIX: was the undefined name `task_uuid` (NameError)
        today,
        sourceType="creation",
        use="text/ocr"
    )

    fileOperations.updateSizeAndChecksum(
        transcription_uuid,
        output_path,
        today,
        str(uuid4())
    )

    # Link source file -> transcription through the derivation event.
    databaseFunctions.insertIntoDerivations(
        sourceFileUUID=file_uuid,
        derivedFileUUID=transcription_uuid,
        relatedEventUUID=event_uuid
    )
Example #4
0
def create_db_entries(job, mapping, dataverse_agent_id):
    """
    Create derivation event and derivative entries for the tabular bundle data
    in the transfer.
    """
    for entry, file_entry in mapping.items():
        # Only entries that are derivatives of another mapped entry matter.
        if not (entry.derived_from and entry.use == "derivative"):
            continue
        original_uuid = mapping[entry.derived_from].uuid
        event_uuid = uuid.uuid4()
        try:
            # Record the derivation event on the original file.
            databaseFunctions.insertIntoEvents(
                original_uuid,
                eventIdentifierUUID=event_uuid,
                eventType="derivation",
                eventDateTime=None,
                eventDetail="",
                eventOutcome="",
                eventOutcomeDetailNote=file_entry.currentlocation,
                agents=[dataverse_agent_id],
            )
            # Link original to derivative through the event.
            databaseFunctions.insertIntoDerivations(
                sourceFileUUID=original_uuid,
                derivedFileUUID=file_entry.uuid,
                relatedEventUUID=event_uuid,
            )
            job.pyprint(
                "Added derivation from", original_uuid, "to", file_entry.uuid
            )
        except django.db.IntegrityError:
            err_log = "Database integrity error, entry: {} for file {}".format(
                file_entry.currentlocation, file_entry.originallocation
            )
            raise ParseDataverseError(err_log)
Example #5
0
def xmlCreateFileAssociationBetween(originalFileFullPath, outputFromNormalizationFileFullPath, SIPFullPath, sipUUID, eventDetailText, eventOutcomeDetailNote, outputFileUUID=""):
    #assign file UUID

    date = databaseInterface.getUTCDate()
    if outputFileUUID == "":
        outputFileUUID = uuid.uuid4().__str__()

    originalFilePathRelativeToSIP = originalFileFullPath.replace(SIPFullPath,"%SIPDirectory%", 1)
    sql = "SELECT Files.fileUUID FROM Files WHERE removedTime = 0 AND Files.currentLocation = '" + MySQLdb.escape_string(originalFilePathRelativeToSIP) + "' AND Files.sipUUID = '" + sipUUID + "';"
    print sql
    rows = databaseInterface.queryAllSQL(sql)
    print rows
    fileUUID = rows[0][0]


    filePathRelativeToSIP = outputFromNormalizationFileFullPath.replace(SIPFullPath,"%SIPDirectory%", 1)
    addFileToSIP(filePathRelativeToSIP, outputFileUUID, sipUUID, uuid.uuid4().__str__(), date, sourceType="creation", use="preservation")
    updateSizeAndChecksum(outputFileUUID, outputFromNormalizationFileFullPath, date, uuid.uuid4().__str__())

    taskUUID = uuid.uuid4().__str__()
    insertIntoEvents(fileUUID=fileUUID, \
               eventIdentifierUUID=taskUUID, \
               eventType="normalization", \
               eventDateTime=date, \
               eventDetail=eventDetailText, \
               eventOutcome="", \
               eventOutcomeDetailNote=eventOutcomeDetailNote)

    insertIntoDerivations(sourceFileUUID=fileUUID, derivedFileUUID=outputFileUUID, relatedEventUUID=taskUUID)
def create_db_entries(job, mapping, dataverse_agent_id):
    """
    Create event and derivatives entries for the derived tabular data in the
    database.
    """
    for entry, file_entry in mapping.items():
        is_derivative = entry.derived_from and entry.use == 'derivative'
        if not is_derivative:
            continue
        original_uuid = mapping[entry.derived_from].uuid
        event_uuid = uuid.uuid4()
        # Record the derivation event on the original file.
        databaseFunctions.insertIntoEvents(
            original_uuid,
            eventIdentifierUUID=event_uuid,
            eventType="derivation",
            eventDateTime=None,  # From Dataverse?
            eventDetail="",  # From Dataverse?
            eventOutcome="",  # From Dataverse?
            eventOutcomeDetailNote=file_entry.currentlocation,
            agents=[dataverse_agent_id],
        )
        # Link the original to its derivative through the event.
        databaseFunctions.insertIntoDerivations(
            sourceFileUUID=original_uuid,
            derivedFileUUID=file_entry.uuid,
            relatedEventUUID=event_uuid,
        )
        job.pyprint(
            'Added derivation from', original_uuid, 'to', file_entry.uuid)
def onceNormalized(command, opts, replacementDic):
    """Register preservation derivatives produced by a normalization command.

    Collects the file(s) written to ``command.outputLocation`` (a single
    file or a directory tree) and, for each preservation output: adds it to
    the SIP, records a "normalization" event on the source file, updates
    size and checksum, and links the source file to the derivative.
    Mutates ``replacementDic`` so the next invocation gets a fresh
    %outputFileUUID% and %postfix%.
    """
    transcodedFiles = []
    if not command.outputLocation:
        command.outputLocation = ""
    if os.path.isfile(command.outputLocation):
        transcodedFiles.append(command.outputLocation)
    elif os.path.isdir(command.outputLocation):
        # Walk the output directory and collect every regular file.
        for w in os.walk(command.outputLocation):
            path, directories, files = w
            for p in files:
                p = os.path.join(path, p)
                if os.path.isfile(p):
                    transcodedFiles.append(p)
    elif command.outputLocation:
        print >> sys.stderr, command
        print >> sys.stderr, "Error - output file does not exist [" + command.outputLocation + "]"
        command.exitCode = -2

    derivationEventUUID = uuid.uuid4().__str__()
    eventDetail = ""
    # BUGFIX: was a duplicated assignment (`eventDetail = eventDetail = ...`);
    # also compare against None by identity.
    if command.eventDetailCommand is not None:
        eventDetail = command.eventDetailCommand.stdOut
    for ef in transcodedFiles:
        if opts["commandClassifications"] == "preservation":
            # Add the new file to the sip
            filePathRelativeToSIP = ef.replace(opts["sipPath"], "%SIPDirectory%", 1)
            # addFileToSIP(filePathRelativeToSIP, fileUUID, sipUUID, taskUUID, date, sourceType="ingestion"):
            addFileToSIP(
                filePathRelativeToSIP,
                replacementDic["%outputFileUUID%"],
                opts["sipUUID"],
                uuid.uuid4().__str__(),
                opts["date"],
                sourceType="creation",
                use="preservation",
            )
            # Calculate new file checksum
            # Add event information to current file
            insertIntoEvents(
                fileUUID=opts["fileUUID"],
                eventIdentifierUUID=derivationEventUUID,
                eventType="normalization",
                eventDateTime=opts["date"],
                eventDetail=eventDetail,
                eventOutcome="",
                eventOutcomeDetailNote=filePathRelativeToSIP,
            )

            updateSizeAndChecksum(replacementDic["%outputFileUUID%"], ef, opts["date"], uuid.uuid4().__str__())

            # Add linking information between files
            insertIntoDerivations(
                sourceFileUUID=opts["fileUUID"],
                derivedFileUUID=replacementDic["%outputFileUUID%"],
                relatedEventUUID=derivationEventUUID,
            )

            # Fresh UUID/postfix so a subsequent output file doesn't collide.
            replacementDic["%outputFileUUID%"] = uuid.uuid4().__str__()
            replacementDic["%postfix%"] = "-" + replacementDic["%outputFileUUID%"]
def onceNormalized(command):
    """Register preservation derivatives produced by a normalization command.

    Variant that reads ``outputFileUUID``, ``replacementDic`` and ``opts``
    from module-level globals instead of taking them as parameters.
    Collects the file(s) at ``command.outputLocation``, adds each to the
    SIP, records a "normalization" event, updates size/checksum, and links
    the source file to the derivative.
    """
    transcodedFiles = []
    if not command.outputLocation:
        command.outputLocation = ""
    elif os.path.isfile(command.outputLocation):
        transcodedFiles.append(command.outputLocation)
    elif os.path.isdir(command.outputLocation):
        # Walk the output directory and collect every regular file.
        for w in os.walk(command.outputLocation):
            path, directories, files = w
            for p in files:
                p = os.path.join(path, p)
                if os.path.isfile(p):
                    transcodedFiles.append(p)
    elif command.outputLocation:
        print >>sys.stderr, command
        print >>sys.stderr, "Error - output file does not exist [" + command.outputLocation + "]"
        command.exitCode = -2

    derivationEventUUID = uuid.uuid4().__str__()
    for ef in transcodedFiles:
        global outputFileUUID
        global replacementDic
        global opts
        if opts.commandClassifications == "preservation":
            # (Removed a dead `old = """xmlNormalize(...)"""` string literal
            # left over from a previous implementation; it was never used.)

            #Add the new file to the sip
            filePathRelativeToSIP = ef.replace(opts.sipPath, "%SIPDirectory%", 1)
            # addFileToSIP(filePathRelativeToSIP, fileUUID, sipUUID, taskUUID, date, sourceType="ingestion"):
            addFileToSIP(filePathRelativeToSIP, outputFileUUID, opts.sipUUID, uuid.uuid4().__str__(), opts.date, sourceType="creation", use="preservation")
            #Calculate new file checksum
            print >>sys.stderr, "TODO: calculate new file checksum"
            #Add event information to current file
            insertIntoEvents(fileUUID=opts.fileUUID, \
               eventIdentifierUUID=derivationEventUUID, \
               eventType="normalization", \
               eventDateTime=opts.date, \
               eventDetail=command.eventDetailCommand.stdOut, \
               eventOutcome="", \
               eventOutcomeDetailNote=filePathRelativeToSIP)

            updateSizeAndChecksum(outputFileUUID, ef, opts.date, uuid.uuid4().__str__())

            #Add linking information between files
            insertIntoDerivations(sourceFileUUID=opts.fileUUID, derivedFileUUID=outputFileUUID, relatedEventUUID=derivationEventUUID)

            # Fresh UUID/postfix for the next output file.
            outputFileUUID = uuid.uuid4().__str__()
            replacementDic["%postfix%"] = "-" + outputFileUUID
def onceNormalized(command, opts, replacementDic):
    """Register preservation derivatives produced by a normalization command.

    Collects the file(s) written to ``command.outputLocation`` (a single
    file or a directory tree) and, for each preservation output: adds it
    to the SIP, records a "normalization" event on the source file,
    updates size and checksum, links source and derivative with a
    Derivation row, and records the output format in FilesIDs.  Mutates
    ``replacementDic`` so the next command invocation gets a fresh
    %outputFileUUID% and %postfix%.
    """
    transcodedFiles = []
    if not command.outputLocation:
        command.outputLocation = ""
    if os.path.isfile(command.outputLocation):
        transcodedFiles.append(command.outputLocation)
    elif os.path.isdir(command.outputLocation):
        # Walk the output directory and collect every regular file.
        for w in os.walk(command.outputLocation):
            path, directories, files = w
            for p in files:
                p = os.path.join(path, p)
                if os.path.isfile(p):
                    transcodedFiles.append(p)
    elif command.outputLocation:
        print >>sys.stderr, command
        print >>sys.stderr, "Error - output file does not exist [" + command.outputLocation + "]"
        command.exitCode = -2

    derivationEventUUID = uuid.uuid4().__str__()
    # Event detail records which FPR command produced the derivative, plus
    # any extra detail emitted by the event-detail command.
    eventDetail = "ArchivematicaFPRCommandID=\"%s\"" % (command.pk)
    if command.eventDetailCommand != None:
        eventDetail = '%s; %s' % (eventDetail, command.eventDetailCommand.stdOut)
    for ef in transcodedFiles:
        if opts["commandClassifications"] == "preservation":
            # TODO Add manual normalization for files of same name mapping
            #Add the new file to the sip
            filePathRelativeToSIP = ef.replace(opts["sipPath"], "%SIPDirectory%", 1)
            # addFileToSIP(filePathRelativeToSIP, fileUUID, sipUUID, taskUUID, date, sourceType="ingestion"):
            addFileToSIP(filePathRelativeToSIP, replacementDic["%outputFileUUID%"], opts["sipUUID"], uuid.uuid4().__str__(), opts["date"], sourceType="creation", use="preservation")
            #Calculate new file checksum
            #Add event information to current file
            insertIntoEvents(fileUUID=opts["fileUUID"], \
               eventIdentifierUUID=derivationEventUUID, \
               eventType="normalization", \
               eventDateTime=opts["date"], \
               eventDetail=eventDetail, \
               eventOutcome="", \
               eventOutcomeDetailNote=filePathRelativeToSIP)

            updateSizeAndChecksum(replacementDic["%outputFileUUID%"], ef, opts["date"], uuid.uuid4().__str__())

            #Add linking information between files
            insertIntoDerivations(sourceFileUUID=opts["fileUUID"], derivedFileUUID=replacementDic["%outputFileUUID%"], relatedEventUUID=derivationEventUUID)

            # NOTE(review): values are interpolated into the SQL unescaped;
            # the UUID is generated locally, but command.outputFormat should
            # be verified safe against quoting/injection.
            sql = "INSERT INTO FilesIDs (fileUUID, formatName, formatVersion, formatRegistryName, formatRegistryKey) VALUES ('%s', '%s', NULL, NULL, NULL);" % (replacementDic["%outputFileUUID%"], command.outputFormat)
            databaseInterface.runSQL(sql)

            # Fresh UUID/postfix so a subsequent output file doesn't collide.
            replacementDic["%outputFileUUID%"] = uuid.uuid4().__str__()
            replacementDic["%postfix%"] = "-" + replacementDic["%outputFileUUID%"]
Example #10
0
def xmlCreateFileAssociationBetween(originalFileFullPath,
                                    outputFromNormalizationFileFullPath,
                                    SIPFullPath,
                                    sipUUID,
                                    eventDetailText,
                                    eventOutcomeDetailNote,
                                    outputFileUUID=""):
    """Register a normalization derivative and link it to its original file.

    Looks up the original file's UUID by its SIP-relative current location,
    adds the normalized output to the SIP as a preservation file, records a
    "normalization" event on the original, and inserts the Derivation row
    tying the two files together via that event.
    """
    #assign file UUID

    date = databaseInterface.getUTCDate()
    if outputFileUUID == "":
        outputFileUUID = uuid.uuid4().__str__()

    originalFilePathRelativeToSIP = originalFileFullPath.replace(
        SIPFullPath, "%SIPDirectory%", 1)
    # NOTE(review): the path is escaped but sipUUID is interpolated into the
    # query unescaped -- verify sipUUID is always a trusted UUID string.
    sql = "SELECT Files.fileUUID FROM Files WHERE removedTime = 0 AND Files.currentLocation = '" + MySQLdb.escape_string(
        originalFilePathRelativeToSIP
    ) + "' AND Files.sipUUID = '" + sipUUID + "';"
    print sql
    rows = databaseInterface.queryAllSQL(sql)
    print rows
    fileUUID = rows[0][0]

    filePathRelativeToSIP = outputFromNormalizationFileFullPath.replace(
        SIPFullPath, "%SIPDirectory%", 1)
    addFileToSIP(filePathRelativeToSIP,
                 outputFileUUID,
                 sipUUID,
                 uuid.uuid4().__str__(),
                 date,
                 sourceType="creation",
                 use="preservation")
    updateSizeAndChecksum(outputFileUUID, outputFromNormalizationFileFullPath,
                          date,
                          uuid.uuid4().__str__())

    taskUUID = uuid.uuid4().__str__()
    # The event is recorded against the ORIGINAL file; the Derivation row
    # below references it.
    insertIntoEvents(fileUUID=fileUUID, \
               eventIdentifierUUID=taskUUID, \
               eventType="normalization", \
               eventDateTime=date, \
               eventDetail=eventDetailText, \
               eventOutcome="", \
               eventOutcomeDetailNote=eventOutcomeDetailNote)

    insertIntoDerivations(sourceFileUUID=fileUUID,
                          derivedFileUUID=outputFileUUID,
                          relatedEventUUID=taskUUID)
Example #11
0
def update_files(sip_uuid, files):
    """
    Update file information to DB.

    :param sip_uuid: UUID of the SIP to parse the metadata for.
    :param files: List of dicts containing file info.
    """
    now = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    # First pass: register each file with a reingest event and patch its
    # size, checksum and location directly on the File row.
    for info in files:
        event_id = str(uuid.uuid4())
        fileOperations.addFileToSIP(
            filePathRelativeToSIP=info['original_path'],
            fileUUID=info['uuid'],
            sipUUID=sip_uuid,
            taskUUID=event_id,
            date=now,
            sourceType="reingestion",
            use=info['use'],
        )
        # Not updateSizeAndChecksum: currentlocation must be updated too.
        models.File.objects.filter(uuid=info['uuid']).update(
            checksum=info['checksum'],
            checksumtype=info['checksumtype'],
            size=info['size'],
            currentlocation=info['current_path'],
        )
        if info['format_version']:
            # Record the identified format.
            models.FileFormatVersion.objects.create(
                file_uuid_id=info['uuid'],
                format_version=info['format_version'],
            )

    # Second pass: derivations, once every file row exists (the derived
    # file might otherwise not be in the DB yet).  May not need to be
    # parsed if Derivation info can be roundtripped in METS Reader/Writer.
    for info in files:
        derived = info['derivation']
        if derived is None:
            continue
        databaseFunctions.insertIntoDerivations(
            sourceFileUUID=info['uuid'],
            derivedFileUUID=derived,
        )
Example #12
0
def create_mets_file(aic, aips):
    """ Create AIC METS file with AIP information.

    Builds a METS document whose dmdSec carries Dublin Core (including an
    <extent> with the AIP count), whose fileSec/structMap list each AIP,
    writes it to metadata/METS.<uuid>.xml inside the AIC, registers the
    file in the SIP, and stores the AIP count in UnitVariables.
    """

    # Prepare constants
    nsmap = {
        'mets': ns.metsNS,
        'xlink': ns.xlinkNS,
        'xsi': ns.xsiNS,
    }
    now = timezone.now().strftime("%Y-%m-%dT%H:%M:%S")

    # Set up structure
    E = ElementMaker(namespace=ns.metsNS, nsmap=nsmap)
    mets = (
        E.mets(
            E.metsHdr(CREATEDATE=now),
            E.dmdSec(
                E.mdWrap(
                    E.xmlData(),
                    MDTYPE="DC",  # mdWrap
                ),
                ID='dmdSec_1',  # dmdSec
            ),
            E.fileSec(E.fileGrp(), ),
            E.structMap(
                E.div(
                    TYPE="Archival Information Collection",
                    DMDID="dmdSec_1",
                ),
                TYPE='logical',  # structMap
            ),
        ))
    mets.attrib['{{{ns}}}schemaLocation'.format(
        ns=nsmap['xsi']
    )] = "http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version18/mets.xsd"

    # Add Dublin Core info
    xml_data = mets.find('mets:dmdSec/mets:mdWrap/mets:xmlData',
                         namespaces=ns.NSMAP)
    dublincore = archivematicaCreateMETS2.getDublinCore(
        archivematicaCreateMETS2.SIPMetadataAppliesToType, aic['uuid'])
    # Add <extent> with number of AIPs
    extent = etree.SubElement(dublincore, ns.dctermsBNS + 'extent')
    extent.text = "{} AIPs".format(len(aips))
    xml_data.append(dublincore)

    # Add elements for each AIP: a <file> in the fileGrp and a labelled
    # <div>/<fptr> pair in the structMap, joined by a shared file ID.
    file_grp = mets.find('mets:fileSec/mets:fileGrp', namespaces=ns.NSMAP)
    struct_div = mets.find('mets:structMap/mets:div', namespaces=ns.NSMAP)
    for aip in aips:
        file_id = '{name}-{uuid}'.format(name=aip['name'], uuid=aip['uuid'])
        etree.SubElement(file_grp, ns.metsBNS + 'file', ID=file_id)

        label = aip['label'] or aip['name']
        div = etree.SubElement(struct_div, ns.metsBNS + 'div', LABEL=label)
        etree.SubElement(div, ns.metsBNS + 'fptr', FILEID=file_id)

    print etree.tostring(mets, pretty_print=True)

    # Write out the file
    file_uuid = str(uuid.uuid4())
    basename = os.path.join('metadata', "METS.{}.xml".format(file_uuid))
    filename = os.path.join(aic['dir'], basename)
    with open(filename, 'w') as f:
        f.write(etree.tostring(mets, pretty_print=True))
    fileOperations.addFileToSIP(
        filePathRelativeToSIP='%SIPDirectory%' + basename,
        fileUUID=file_uuid,
        sipUUID=aic['uuid'],
        taskUUID=str(uuid.uuid4()),  # Unsure what should go here
        date=now,
        sourceType="aip creation",
        use='metadata')
    # To make this work with the createMETS2 (for SIPs)
    databaseFunctions.insertIntoDerivations(file_uuid, file_uuid)

    # Insert the count of AIPs in the AIC into UnitVariables, so it can be
    # indexed later
    UnitVariable.objects.create(unittype="SIP",
                                unituuid=aic['uuid'],
                                variable="AIPsinAIC",
                                variablevalue=str(len(aips)))
def onceNormalized(command, opts, replacementDic):
    """Register preservation derivatives produced by a normalization command.

    Collects the file(s) written to ``command.outputLocation`` (a single
    file or a directory tree) and, for each preservation output: adds it
    to the SIP, records a "normalization" event on the source file,
    updates size and checksum, links source and derivative, and records
    the output format in FilesIDs.  Mutates ``replacementDic`` so the next
    invocation gets a fresh %outputFileUUID% and %postfix%.
    """
    transcodedFiles = []
    if not command.outputLocation:
        command.outputLocation = ""
    if os.path.isfile(command.outputLocation):
        transcodedFiles.append(command.outputLocation)
    elif os.path.isdir(command.outputLocation):
        # Walk the output directory and collect every regular file.
        for w in os.walk(command.outputLocation):
            path, directories, files = w
            for p in files:
                p = os.path.join(path, p)
                if os.path.isfile(p):
                    transcodedFiles.append(p)
    elif command.outputLocation:
        print >> sys.stderr, command
        print >> sys.stderr, "Error - output file does not exist [" + command.outputLocation + "]"
        command.exitCode = -2

    derivationEventUUID = uuid.uuid4().__str__()
    # Event detail records which FPR command produced the derivative, plus
    # any extra detail emitted by the event-detail command.
    eventDetail = "ArchivematicaFPRCommandID=\"%s\"" % (command.pk)
    if command.eventDetailCommand != None:
        eventDetail = '%s; %s' % (eventDetail,
                                  command.eventDetailCommand.stdOut)
    for ef in transcodedFiles:
        if opts["commandClassifications"] == "preservation":
            # TODO Add manual normalization for files of same name mapping
            #Add the new file to the sip
            filePathRelativeToSIP = ef.replace(opts["sipPath"],
                                               "%SIPDirectory%", 1)
            # addFileToSIP(filePathRelativeToSIP, fileUUID, sipUUID, taskUUID, date, sourceType="ingestion"):
            addFileToSIP(filePathRelativeToSIP,
                         replacementDic["%outputFileUUID%"],
                         opts["sipUUID"],
                         uuid.uuid4().__str__(),
                         opts["date"],
                         sourceType="creation",
                         use="preservation")
            #Calculate new file checksum
            #Add event information to current file
            insertIntoEvents(fileUUID=opts["fileUUID"], \
               eventIdentifierUUID=derivationEventUUID, \
               eventType="normalization", \
               eventDateTime=opts["date"], \
               eventDetail=eventDetail, \
               eventOutcome="", \
               eventOutcomeDetailNote=filePathRelativeToSIP)

            updateSizeAndChecksum(replacementDic["%outputFileUUID%"], ef,
                                  opts["date"],
                                  uuid.uuid4().__str__())

            #Add linking information between files
            insertIntoDerivations(
                sourceFileUUID=opts["fileUUID"],
                derivedFileUUID=replacementDic["%outputFileUUID%"],
                relatedEventUUID=derivationEventUUID)

            # NOTE(review): values are interpolated into the SQL unescaped;
            # the UUID is generated locally, but command.outputFormat should
            # be verified safe against quoting/injection.
            sql = "INSERT INTO FilesIDs (fileUUID, formatName, formatVersion, formatRegistryName, formatRegistryKey) VALUES ('%s', '%s', NULL, NULL, NULL);" % (
                replacementDic["%outputFileUUID%"], command.outputFormat)
            databaseInterface.runSQL(sql)

            # Fresh UUID/postfix so a subsequent output file doesn't collide.
            replacementDic["%outputFileUUID%"] = uuid.uuid4().__str__()
            replacementDic[
                "%postfix%"] = "-" + replacementDic["%outputFileUUID%"]
def create_mets_file(aic, aips, job):
    """ Create AIC METS file with AIP information.

    Builds a METS document whose dmdSec carries Dublin Core (including an
    <extent> with the AIP count), whose fileSec/structMap list each AIP,
    writes it to metadata/METS.<uuid>.xml inside the AIC, registers the
    file in the SIP, and stores the AIP count in UnitVariables.
    """

    # Prepare constants
    nsmap = {"mets": ns.metsNS, "xlink": ns.xlinkNS, "xsi": ns.xsiNS}
    now = timezone.now().strftime("%Y-%m-%dT%H:%M:%S")

    # Set up structure
    E = ElementMaker(namespace=ns.metsNS, nsmap=nsmap)
    mets = E.mets(
        E.metsHdr(CREATEDATE=now),
        E.dmdSec(E.mdWrap(E.xmlData(), MDTYPE="DC"),
                 ID="dmdSec_1"),  # mdWrap  # dmdSec
        E.fileSec(E.fileGrp()),
        E.structMap(
            E.div(TYPE="Archival Information Collection", DMDID="dmdSec_1"),
            TYPE="logical",  # structMap
        ),
    )
    mets.attrib["{{{ns}}}schemaLocation".format(
        ns=nsmap["xsi"]
    )] = "http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version1121/mets.xsd"

    # Add Dublin Core info
    xml_data = mets.find("mets:dmdSec/mets:mdWrap/mets:xmlData",
                         namespaces=ns.NSMAP)
    dublincore = create_mets_v2.getDublinCore(
        create_mets_v2.SIPMetadataAppliesToType, aic["uuid"])
    # Add <extent> with number of AIPs
    extent = etree.SubElement(dublincore, ns.dctermsBNS + "extent")
    extent.text = "{} AIPs".format(len(aips))
    xml_data.append(dublincore)

    # Add elements for each AIP: a <file> in the fileGrp and a labelled
    # <div>/<fptr> pair in the structMap, joined by a shared file ID.
    file_grp = mets.find("mets:fileSec/mets:fileGrp", namespaces=ns.NSMAP)
    struct_div = mets.find("mets:structMap/mets:div", namespaces=ns.NSMAP)
    for aip in aips:
        file_id = "{name}-{uuid}".format(name=aip["name"], uuid=aip["uuid"])
        etree.SubElement(file_grp, ns.metsBNS + "file", ID=file_id)

        label = aip["label"] or aip["name"]
        div = etree.SubElement(struct_div, ns.metsBNS + "div", LABEL=label)
        etree.SubElement(div, ns.metsBNS + "fptr", FILEID=file_id)

    job.pyprint(etree.tostring(mets, pretty_print=True))

    # Write out the file
    file_uuid = str(uuid.uuid4())
    basename = os.path.join("metadata", "METS.{}.xml".format(file_uuid))
    filename = os.path.join(aic["dir"], basename)
    # BUGFIX: etree.tostring() with an explicit encoding returns bytes, so
    # the file must be opened in binary mode; text mode ("w") raises
    # TypeError on Python 3.
    with open(filename, "wb") as f:
        f.write(
            etree.tostring(mets,
                           pretty_print=True,
                           xml_declaration=True,
                           encoding="utf-8"))
    fileOperations.addFileToSIP(
        filePathRelativeToSIP="%SIPDirectory%" + basename,
        fileUUID=file_uuid,
        sipUUID=aic["uuid"],
        taskUUID=str(uuid.uuid4()),  # Unsure what should go here
        date=now,
        sourceType="aip creation",
        use="metadata",
    )
    # To make this work with the createMETS2 (for SIPs)
    databaseFunctions.insertIntoDerivations(file_uuid, file_uuid)

    # Insert the count of AIPs in the AIC into UnitVariables, so it can be
    # indexed later
    UnitVariable.objects.create(
        unittype="SIP",
        unituuid=aic["uuid"],
        variable="AIPsinAIC",
        variablevalue=str(len(aips)),
    )
    e.event_outcome_detail = dstR
    e.save()
    print('Updated the eventOutcomeDetailNote of an existing normalization'
          ' Event for file {}. Not creating a Derivation object'.format(
              fileUUID))
except Event.DoesNotExist:
    # No normalization event was created in normalize.py - probably manually
    # normalized during Ingest
    derivationEventUUID = str(uuid.uuid4())
    databaseFunctions.insertIntoEvents(fileUUID=original_file.uuid,
                                       eventIdentifierUUID=derivationEventUUID,
                                       eventType="normalization",
                                       eventDateTime=date,
                                       eventDetail="manual normalization",
                                       eventOutcome="",
                                       eventOutcomeDetailNote=dstR)
    print('Created a manual normalization Event for file {}.'.format(
        original_file.uuid))

    # Add linking information between files
    # Assuming that if an event already exists, then the derivation does as well
    databaseFunctions.insertIntoDerivations(
        sourceFileUUID=original_file.uuid,
        derivedFileUUID=fileUUID,
        relatedEventUUID=derivationEventUUID)
    print('Created a Derivation for original file {}, derived file {}, and'
          ' event {}'.format(original_file.uuid, fileUUID,
                             derivationEventUUID))

exit(0)
# Build the destination name for the manually-normalized file:
# "<basename>-<fileUUID><extension>", placed in the directory of the
# original file (with %SIPDirectory% expanded).
basename = os.path.basename(filePath)
i = basename.rfind(".")
dstFile = basename[:i] + "-" + fileUUID + basename[i:]
dstDir = os.path.dirname(originalFilePath.replace("%SIPDirectory%", SIPDirectory, 1))
dst = os.path.join(dstDir, dstFile)
dstR = dst.replace(SIPDirectory, "%SIPDirectory%", 1)

# Refuse to overwrite an existing file or directory at the destination.
if os.path.isfile(dst) or os.path.isdir(dst):
    print >>sys.stderr, "already exists:", dstR
    exit(2)

#Rename the file or directory src to dst. If dst is a directory, OSError will be raised. On Unix, if dst exists and is a file, it will be replaced silently if the user has permission. The operation may fail on some Unix flavors if src and dst are on different filesystems.
#see http://docs.python.org/2/library/os.html
os.rename(filePath, dst)
# NOTE(review): values are interpolated into the SQL unescaped; dstR is
# derived from on-disk paths -- verify they cannot contain quote characters.
sql =  """UPDATE Files SET currentLocation='%s' WHERE fileUUID='%s';""" % (dstR, fileUUID)
databaseInterface.runSQL(sql)

# Record the manual-normalization event on the ORIGINAL file, then link
# the original to its normalized derivative via that event.
derivationEventUUID = uuid.uuid4().__str__()
insertIntoEvents(fileUUID=originalFileUUID, \
               eventIdentifierUUID=derivationEventUUID, \
               eventType="normalization", \
               eventDateTime=date, \
               eventDetail="manual normalization", \
               eventOutcome="", \
               eventOutcomeDetailNote=dstR)

#Add linking information between files
insertIntoDerivations(sourceFileUUID=originalFileUUID, derivedFileUUID=fileUUID, relatedEventUUID=derivationEventUUID)


exit(0)
Example #17
0
def main(job):
    """Match a manually normalized preservation file to its original file.

    Expected job.args: "%SIPUUID%" "%SIPName%" "%SIPDirectory%" "%fileUUID%"
    "%filePath%" "%date%".  job.args[2] (SIPName) is unused.

    Locates the original file (by filename, or via the optional
    objects/manualNormalization/normalization.csv mapping), renames the
    preservation file to <original-basename>-<fileUUID><ext> beside the
    original, updates its location in the database, and records a PREMIS
    normalization Event plus a Derivation linking the two files.

    Returns:
        0 on success; 2, 3 or 4 on the various match/rename failures.
    """
    SIPUUID = job.args[1]
    SIPDirectory = job.args[3]
    fileUUID = job.args[4]
    filePath = job.args[5]
    date = job.args[6]

    # Search for the original file associated with the preservation file:
    # map .../objects/manualNormalization/preservation/... back into the
    # abstracted %SIPDirectory%objects namespace used in the database.
    filePathLike = filePath.replace(
        os.path.join(SIPDirectory, "objects", "manualNormalization",
                     "preservation"), "%SIPDirectory%objects", 1)
    i = filePathLike.rfind(".")
    k = os.path.basename(filePath).rfind(".")
    # Fix: default to exact-path matching.  Previously these two names were
    # only bound inside the if-branch, so an extensionless file raised an
    # uncaught NameError in the query below.
    filePathLike1 = filePathLike2 = filePathLike
    if i != -1 and k != -1:
        filePathLike = filePathLike[:i + 1]
        # Matches "path/to/file/filename." Includes . so it doesn't false match foobar.txt when we wanted foo.txt
        filePathLike1 = filePathLike
        # Matches the exact filename.  For files with no extension.
        filePathLike2 = filePathLike[:-1]

    try:
        path_condition = Q(currentlocation__startswith=filePathLike1) | Q(
            currentlocation=filePathLike2)
        original_file = File.objects.get(path_condition,
                                         removedtime__isnull=True,
                                         filegrpuse="original",
                                         sip_id=SIPUUID)
    except (File.DoesNotExist, File.MultipleObjectsReturned) as e:
        # Original file was not found, or there is more than one original file with
        # the same filename (differing extensions).
        # Look for a CSV that will specify the mapping.
        csv_path = os.path.join(SIPDirectory, "objects", "manualNormalization",
                                "normalization.csv")
        if os.path.isfile(csv_path):
            try:
                preservation_file = filePath[
                    filePath.index('manualNormalization/preservation/'):]
            except ValueError:
                job.print_error(
                    "{0} not in manualNormalization directory".format(
                        filePath))
                return 4
            original = fileOperations.findFileInNormalizationCSV(
                csv_path,
                "preservation",
                preservation_file,
                SIPUUID,
                printfn=job.pyprint)
            if original is None:
                if isinstance(e, File.DoesNotExist):
                    job.print_error("No matching file for: {0}".format(
                        filePath.replace(SIPDirectory, "%SIPDirectory%")))
                    return 3
                else:
                    # Fix: the template previously said "(unknown)" and never
                    # used the filename kwarg; name the CSV actually searched.
                    job.print_error(
                        "Could not find {preservation_file} in {filename}".
                        format(preservation_file=preservation_file,
                               filename=csv_path))
                    return 2
            # If we found the original file, retrieve it from the DB
            original_file = File.objects.get(
                removedtime__isnull=True,
                filegrpuse="original",
                originallocation__endswith=original,
                sip_id=SIPUUID)
        else:
            if isinstance(e, File.DoesNotExist):
                job.print_error(
                    "No matching file for: ",
                    filePath.replace(SIPDirectory, "%SIPDirectory%", 1))
                return 3
            elif isinstance(e, File.MultipleObjectsReturned):
                job.print_error(
                    "Too many possible files for: ",
                    filePath.replace(SIPDirectory, "%SIPDirectory%", 1))
                return 2

    # We found the original file somewhere above
    job.print_output(
        "Matched original file %s (%s) to  preservation file %s (%s)" %
        (original_file.currentlocation, original_file.uuid, filePath,
         fileUUID))
    # Generate the new preservation path: path/to/original/filename-uuid.ext
    basename = os.path.basename(filePath)
    i = basename.rfind(".")
    dstFile = basename[:i] + "-" + fileUUID + basename[i:]
    dstDir = os.path.dirname(
        original_file.currentlocation.replace("%SIPDirectory%", SIPDirectory,
                                              1))
    dst = os.path.join(dstDir, dstFile)
    dstR = dst.replace(SIPDirectory, "%SIPDirectory%", 1)

    if os.path.exists(dst):
        job.print_error("already exists:", dstR)
        return 2

    # Rename the preservation file
    job.print_output('Renaming preservation file', filePath, 'to', dst)
    os.rename(filePath, dst)
    # Update the preservation file's location
    File.objects.filter(uuid=fileUUID).update(currentlocation=dstR)

    try:
        # Normalization event already exists, so just update it.
        # fileUUID, eventIdentifierUUID, eventType, eventDateTime, eventDetail
        # are probably already correct; we only set eventOutcomeDetailNote here.
        # Not using .filter().update() because that doesn't raise DoesNotExist.
        e = Event.objects.get(event_type="normalization",
                              file_uuid=original_file)
        e.event_outcome_detail = dstR
        e.save()
        job.print_output(
            'Updated the eventOutcomeDetailNote of an existing normalization'
            ' Event for file {}. Not creating a Derivation object'.format(
                fileUUID))
    except Event.DoesNotExist:
        # No normalization event was created in normalize.py - probably manually
        # normalized during Ingest
        derivationEventUUID = str(uuid.uuid4())
        databaseFunctions.insertIntoEvents(
            fileUUID=original_file.uuid,
            eventIdentifierUUID=derivationEventUUID,
            eventType="normalization",
            eventDateTime=date,
            eventDetail="manual normalization",
            eventOutcome="",
            eventOutcomeDetailNote=dstR)
        job.print_output(
            'Created a manual normalization Event for file {}.'.format(
                original_file.uuid))

        # Add linking information between files
        # Assuming that if an event already exists, then the derivation does as well
        databaseFunctions.insertIntoDerivations(
            sourceFileUUID=original_file.uuid,
            derivedFileUUID=fileUUID,
            relatedEventUUID=derivationEventUUID)
        job.print_output(
            'Created a Derivation for original file {}, derived file {}, and'
            ' event {}'.format(original_file.uuid, fileUUID,
                               derivationEventUUID))

    return 0
# Script fragment: finalize the move of a manually-normalized preservation
# file and record the PREMIS event/derivation.  Relies on dstDir, dstFile,
# filePath, fileUUID, originalFileUUID, SIPDirectory and date being bound
# earlier in the script.
dst = os.path.join(dstDir, dstFile)
# Abstracted form of the destination path, as stored in the database.
dstR = dst.replace(SIPDirectory, "%SIPDirectory%", 1)

# Refuse to clobber an existing file or directory at the destination.
if os.path.isfile(dst) or os.path.isdir(dst):
    print >>sys.stderr, "already exists:", dstR
    exit(2)

#Rename the file or directory src to dst. If dst is a directory, OSError will be raised. On Unix, if dst exists and is a file, it will be replaced silently if the user has permission. The operation may fail on some Unix flavors if src and dst are on different filesystems.
#see http://docs.python.org/2/library/os.html
os.rename(filePath, dst)
# Record the file's new (abstracted) location.
# NOTE(review): SQL built by string interpolation — injection/corruption
# risk if dstR or fileUUID can contain quotes; prefer a parameterized query
# if databaseInterface supports one.
sql =  """UPDATE Files SET currentLocation='%s' WHERE fileUUID='%s';""" % (dstR, fileUUID)
databaseInterface.runSQL(sql)

# Record a PREMIS "normalization" event on the original file; the outcome
# detail note carries the new preservation-file location.
derivationEventUUID = uuid.uuid4().__str__()
databaseFunctions.insertIntoEvents(
    fileUUID=originalFileUUID,
    eventIdentifierUUID=derivationEventUUID,
    eventType="normalization",
    eventDateTime=date,
    eventDetail="manual normalization",
    eventOutcome="",
    eventOutcomeDetailNote=dstR)

#Add linking information between files (original -> preservation copy).
databaseFunctions.insertIntoDerivations(
    sourceFileUUID=originalFileUUID,
    derivedFileUUID=fileUUID,
    relatedEventUUID=derivationEventUUID)

exit(0)
示例#19
0
# Python 2 script fragment: move a manually-normalized preservation file
# next to its original and record the PREMIS event/derivation.  Relies on
# originalFilePath, dstFile, filePath, fileUUID, originalFileUUID,
# SIPDirectory and date being bound earlier in the script.
dstDir = os.path.dirname(
    originalFilePath.replace("%SIPDirectory%", SIPDirectory, 1))
dst = os.path.join(dstDir, dstFile)
# Abstracted form of the destination path, as stored in the database.
dstR = dst.replace(SIPDirectory, "%SIPDirectory%", 1)

# Refuse to clobber an existing file or directory at the destination.
if os.path.isfile(dst) or os.path.isdir(dst):
    print >> sys.stderr, "already exists:", dstR
    exit(2)

#Rename the file or directory src to dst. If dst is a directory, OSError will be raised. On Unix, if dst exists and is a file, it will be replaced silently if the user has permission. The operation may fail on some Unix flavors if src and dst are on different filesystems.
#see http://docs.python.org/2/library/os.html
os.rename(filePath, dst)
# Record the file's new (abstracted) location.
# NOTE(review): SQL built by string interpolation — injection/corruption
# risk if dstR or fileUUID can contain quotes; prefer a parameterized query
# if databaseInterface supports one.
sql = """UPDATE Files SET currentLocation='%s' WHERE fileUUID='%s';""" % (
    dstR, fileUUID)
databaseInterface.runSQL(sql)

# Record a PREMIS "normalization" event on the original file; the outcome
# detail note carries the new preservation-file location.
derivationEventUUID = uuid.uuid4().__str__()
insertIntoEvents(fileUUID=originalFileUUID, \
               eventIdentifierUUID=derivationEventUUID, \
               eventType="normalization", \
               eventDateTime=date, \
               eventDetail="manual normalization", \
               eventOutcome="", \
               eventOutcomeDetailNote=dstR)

#Add linking information between files (original -> preservation copy).
insertIntoDerivations(sourceFileUUID=originalFileUUID,
                      derivedFileUUID=fileUUID,
                      relatedEventUUID=derivationEventUUID)

exit(0)
def onceNormalized(command):
    transcodedFiles = []
    if not command.outputLocation:
        command.outputLocation = ""
    elif os.path.isfile(command.outputLocation):
        transcodedFiles.append(command.outputLocation)
    elif os.path.isdir(command.outputLocation):
        for w in os.walk(command.outputLocation):
            path, directories, files = w
            for p in files:
                p = os.path.join(path, p)
                if os.path.isfile(p):
                    transcodedFiles.append(p)
    elif command.outputLocation:
        print >> sys.stderr, command
        print >> sys.stderr, "Error - output file does not exist [" + command.outputLocation + "]"
        command.exitCode = -2

    derivationEventUUID = uuid.uuid4().__str__()
    for ef in transcodedFiles:
        global outputFileUUID
        global replacementDic
        global opts
        if opts.commandClassifications == "preservation":
            old = """xmlNormalize(outputFileUUID, \
                     ef, \
                     command.eventDetailCommand.stdOut, \
                     opts.fileUUID, \
                     opts.objectsDirectory, \
                     opts.taskUUID, \
                     opts.date, \
                     opts.logsDirectory, \
                     ) #    {normalized; not normalized}"""

            #Add the new file to the sip
            filePathRelativeToSIP = ef.replace(opts.sipPath, "%SIPDirectory%",
                                               1)
            # addFileToSIP(filePathRelativeToSIP, fileUUID, sipUUID, taskUUID, date, sourceType="ingestion"):
            addFileToSIP(filePathRelativeToSIP,
                         outputFileUUID,
                         opts.sipUUID,
                         uuid.uuid4().__str__(),
                         opts.date,
                         sourceType="creation",
                         use="preservation")
            #Calculate new file checksum
            print >> sys.stderr, "TODO: calculate new file checksum"
            #Add event information to current file
            insertIntoEvents(fileUUID=opts.fileUUID, \
               eventIdentifierUUID=derivationEventUUID, \
               eventType="normalization", \
               eventDateTime=opts.date, \
               eventDetail=command.eventDetailCommand.stdOut, \
               eventOutcome="", \
               eventOutcomeDetailNote=filePathRelativeToSIP)

            updateSizeAndChecksum(outputFileUUID, ef, opts.date,
                                  uuid.uuid4().__str__())

            #Add linking information between files
            insertIntoDerivations(sourceFileUUID=opts.fileUUID,
                                  derivedFileUUID=outputFileUUID,
                                  relatedEventUUID=derivationEventUUID)

            outputFileUUID = uuid.uuid4().__str__()
            replacementDic["%postfix%"] = "-" + outputFileUUID