def insert_derivation_event(original_uuid, output_uuid, derivation_uuid, event_detail_output, outcome_detail_note, today=None):
    """Record a normalization event on the original file and link it to its
    preservation derivative.

    :param original_uuid: UUID of the source (original) file.
    :param output_uuid: UUID of the derived (preservation) file.
    :param derivation_uuid: UUID used both as the event identifier and as the
        derivation's related event.
    :param event_detail_output: text stored as the event detail.
    :param outcome_detail_note: text stored as the event outcome detail note;
        falsy values are stored as the empty string.
    :param today: event timestamp; defaults to ``timezone.now()``.
    """
    event_time = timezone.now() if today is None else today
    note = outcome_detail_note if outcome_detail_note else ""
    # The normalization event is attached to the *original* file.
    databaseFunctions.insertIntoEvents(
        fileUUID=original_uuid,
        eventIdentifierUUID=derivation_uuid,
        eventType="normalization",
        eventDateTime=event_time,
        eventDetail=event_detail_output,
        eventOutcome="",
        eventOutcomeDetailNote=note,
    )
    # Link the original to its derivative via the event recorded above.
    databaseFunctions.insertIntoDerivations(
        sourceFileUUID=original_uuid,
        derivedFileUUID=output_uuid,
        relatedEventUUID=derivation_uuid,
    )
def main(job, shared_path, file_uuid, file_path, date, event_uuid):
    """Update a file's size and checksum, reusing values recorded in the
    original AIP METS when the file belongs to an "Archivematica AIP"
    transfer (reingest).

    Returns 0 on success, 1 when the file row cannot be found.
    """
    try:
        file_ = File.objects.get(uuid=file_uuid)
    except File.DoesNotExist:
        logger.exception("File with UUID %s cannot be found.", file_uuid)
        return 1

    extra = {}
    # Only Transfers (not SIPs) of type "Archivematica AIP" carry a prior
    # METS we can mine for size/checksum/format/derivation information.
    is_aip_transfer = (
        file_.transfer
        and not file_.sip
        and file_.transfer.type == "Archivematica AIP"
    )
    if is_aip_transfer:
        mets_info = get_file_info_from_mets(job, shared_path, file_)
        extra["fileSize"] = mets_info["file_size"]
        extra["checksum"] = mets_info["checksum"]
        extra["checksumType"] = mets_info["checksum_type"]
        # The original METS already documented the event; don't re-add it.
        extra["add_event"] = False
        derived = mets_info.get("derivation")
        if derived:
            insertIntoDerivations(
                sourceFileUUID=file_uuid, derivedFileUUID=derived)
        fmt_version = mets_info.get("format_version")
        if fmt_version:
            FileFormatVersion.objects.create(
                file_uuid_id=file_uuid, format_version=fmt_version)

    updateSizeAndChecksum(file_uuid, file_path, date, event_uuid, **extra)
    return 0
def insert_file_into_database(file_uuid, sip_uuid, event_uuid, rule, output_path, relative_path):
    """Register a new transcription (OCR) file in the database.

    Adds the transcription output to the SIP, records its size and checksum,
    and links the source file to the transcription via the supplied event.

    :param file_uuid: UUID of the source file that was transcribed.
    :param sip_uuid: UUID of the SIP the transcription belongs to.
    :param event_uuid: UUID of the transcription event linking the files.
    :param rule: FPR rule that produced the output (unused here; kept for
        interface compatibility with callers).
    :param output_path: absolute path of the transcription output on disk.
    :param relative_path: SIP-relative path of the transcription output.
    """
    transcription_uuid = str(uuid4())
    today = timezone.now()
    fileOperations.addFileToSIP(
        relative_path,
        transcription_uuid,
        sip_uuid,
        # BUG FIX: this argument (the task UUID) was the undefined name
        # `task_uuid`, which raised NameError; generate a fresh UUID like
        # the updateSizeAndChecksum call below does.
        str(uuid4()),
        today,
        sourceType="creation",
        use="text/ocr"
    )
    fileOperations.updateSizeAndChecksum(
        transcription_uuid,
        output_path,
        today,
        str(uuid4())
    )
    # Link the source file to its transcription derivative.
    databaseFunctions.insertIntoDerivations(
        sourceFileUUID=file_uuid,
        derivedFileUUID=transcription_uuid,
        relatedEventUUID=event_uuid
    )
def create_db_entries(job, mapping, dataverse_agent_id):
    """
    Create derivation event and derivative entries for the tabular bundle
    data in the transfer.
    """
    for entry, file_entry in mapping.items():
        # Only derivative entries with a known source are linked.
        if not entry.derived_from or entry.use != "derivative":
            continue
        original_uuid = mapping[entry.derived_from].uuid
        event_uuid = uuid.uuid4()
        try:
            # Event first, then the derivation that references it.
            databaseFunctions.insertIntoEvents(
                original_uuid,
                eventIdentifierUUID=event_uuid,
                eventType="derivation",
                eventDateTime=None,
                eventDetail="",
                eventOutcome="",
                eventOutcomeDetailNote=file_entry.currentlocation,
                agents=[dataverse_agent_id],
            )
            databaseFunctions.insertIntoDerivations(
                sourceFileUUID=original_uuid,
                derivedFileUUID=file_entry.uuid,
                relatedEventUUID=event_uuid,
            )
        except django.db.IntegrityError:
            err_log = "Database integrity error, entry: {} for file {}".format(
                file_entry.currentlocation, file_entry.originallocation
            )
            raise ParseDataverseError(err_log)
        job.pyprint(
            "Added derivation from", original_uuid, "to", file_entry.uuid
        )
def xmlCreateFileAssociationBetween(originalFileFullPath, outputFromNormalizationFileFullPath, SIPFullPath, sipUUID, eventDetailText, eventOutcomeDetailNote, outputFileUUID=""): #assign file UUID date = databaseInterface.getUTCDate() if outputFileUUID == "": outputFileUUID = uuid.uuid4().__str__() originalFilePathRelativeToSIP = originalFileFullPath.replace(SIPFullPath,"%SIPDirectory%", 1) sql = "SELECT Files.fileUUID FROM Files WHERE removedTime = 0 AND Files.currentLocation = '" + MySQLdb.escape_string(originalFilePathRelativeToSIP) + "' AND Files.sipUUID = '" + sipUUID + "';" print sql rows = databaseInterface.queryAllSQL(sql) print rows fileUUID = rows[0][0] filePathRelativeToSIP = outputFromNormalizationFileFullPath.replace(SIPFullPath,"%SIPDirectory%", 1) addFileToSIP(filePathRelativeToSIP, outputFileUUID, sipUUID, uuid.uuid4().__str__(), date, sourceType="creation", use="preservation") updateSizeAndChecksum(outputFileUUID, outputFromNormalizationFileFullPath, date, uuid.uuid4().__str__()) taskUUID = uuid.uuid4().__str__() insertIntoEvents(fileUUID=fileUUID, \ eventIdentifierUUID=taskUUID, \ eventType="normalization", \ eventDateTime=date, \ eventDetail=eventDetailText, \ eventOutcome="", \ eventOutcomeDetailNote=eventOutcomeDetailNote) insertIntoDerivations(sourceFileUUID=fileUUID, derivedFileUUID=outputFileUUID, relatedEventUUID=taskUUID)
def create_db_entries(job, mapping, dataverse_agent_id):
    """
    Create event and derivatives entries for the derived tabular data in the
    database.

    :param job: job object used for console output via ``job.pyprint``.
    :param mapping: dict of transfer entries to file entries; entries whose
        ``use`` is ``'derivative'`` and that reference a ``derived_from``
        source get a derivation event plus a derivation row.
    :param dataverse_agent_id: agent recorded on each derivation event.
    """
    for entry, file_entry in mapping.items():
        if entry.derived_from and entry.use == 'derivative':
            original_uuid = mapping[entry.derived_from].uuid
            event_uuid = uuid.uuid4()
            # Add event on the original file documenting the derivation.
            databaseFunctions.insertIntoEvents(
                original_uuid,
                eventIdentifierUUID=event_uuid,
                eventType="derivation",
                eventDateTime=None,  # From Dataverse?
                eventDetail="",  # From Dataverse?
                eventOutcome="",  # From Dataverse?
                eventOutcomeDetailNote=file_entry.currentlocation,
                agents=[dataverse_agent_id],
            )
            # Add derivation linking original to the derived file.
            databaseFunctions.insertIntoDerivations(
                sourceFileUUID=original_uuid,
                derivedFileUUID=file_entry.uuid,
                relatedEventUUID=event_uuid,
            )
            job.pyprint(
                'Added derivation from', original_uuid, 'to', file_entry.uuid)
def onceNormalized(command, opts, replacementDic):
    """Register the output(s) of a completed normalization command.

    Collects the file or directory tree written to ``command.outputLocation``
    and, for preservation normalizations, adds each output to the SIP,
    records a "normalization" event on the source file, computes the new
    file's size/checksum, and links source and derivative. Mutates
    ``replacementDic`` so a subsequent output gets a fresh
    ``%outputFileUUID%``/``%postfix%``.

    Python 2 module: uses print statements.
    """
    transcodedFiles = []
    if not command.outputLocation:
        command.outputLocation = ""
    if os.path.isfile(command.outputLocation):
        transcodedFiles.append(command.outputLocation)
    elif os.path.isdir(command.outputLocation):
        # Collect every regular file under the output directory.
        for path, directories, files in os.walk(command.outputLocation):
            for p in files:
                p = os.path.join(path, p)
                if os.path.isfile(p):
                    transcodedFiles.append(p)
    elif command.outputLocation:
        print >> sys.stderr, command
        print >> sys.stderr, "Error - output file does not exist [" + command.outputLocation + "]"
        command.exitCode = -2

    derivationEventUUID = uuid.uuid4().__str__()
    eventDetail = ""
    # BUG FIX: was the duplicated `eventDetail = eventDetail = ...`
    # assignment; also use `is not None` rather than `!= None`.
    if command.eventDetailCommand is not None:
        eventDetail = command.eventDetailCommand.stdOut
    for ef in transcodedFiles:
        if opts["commandClassifications"] == "preservation":
            # Add the new file to the SIP.
            filePathRelativeToSIP = ef.replace(opts["sipPath"], "%SIPDirectory%", 1)
            addFileToSIP(
                filePathRelativeToSIP,
                replacementDic["%outputFileUUID%"],
                opts["sipUUID"],
                uuid.uuid4().__str__(),
                opts["date"],
                sourceType="creation",
                use="preservation",
            )
            # Add the normalization event to the *source* file.
            insertIntoEvents(
                fileUUID=opts["fileUUID"],
                eventIdentifierUUID=derivationEventUUID,
                eventType="normalization",
                eventDateTime=opts["date"],
                eventDetail=eventDetail,
                eventOutcome="",
                eventOutcomeDetailNote=filePathRelativeToSIP,
            )
            # Calculate the new file's size and checksum.
            updateSizeAndChecksum(replacementDic["%outputFileUUID%"], ef, opts["date"], uuid.uuid4().__str__())
            # Add linking information between files.
            insertIntoDerivations(
                sourceFileUUID=opts["fileUUID"],
                derivedFileUUID=replacementDic["%outputFileUUID%"],
                relatedEventUUID=derivationEventUUID,
            )
            # Fresh UUID/postfix for any further outputs of this command.
            replacementDic["%outputFileUUID%"] = uuid.uuid4().__str__()
            replacementDic["%postfix%"] = "-" + replacementDic["%outputFileUUID%"]
def onceNormalized(command):
    """Register the output(s) of a completed normalization command.

    Older (Python 2) variant that communicates through the module globals
    ``outputFileUUID``, ``replacementDic`` and ``opts`` rather than
    parameters. For preservation normalizations it adds each output file to
    the SIP, records a "normalization" event on the source file, updates
    size/checksum, and links source and derivative.
    """
    transcodedFiles = []
    if not command.outputLocation:
        command.outputLocation = ""
    elif os.path.isfile(command.outputLocation):
        transcodedFiles.append(command.outputLocation)
    elif os.path.isdir(command.outputLocation):
        # Collect every regular file under the output directory.
        for w in os.walk(command.outputLocation):
            path, directories, files = w
            for p in files:
                p = os.path.join(path, p)
                if os.path.isfile(p):
                    transcodedFiles.append(p)
    elif command.outputLocation:
        print >>sys.stderr, command
        print >>sys.stderr, "Error - output file does not exist [" + command.outputLocation + "]"
        command.exitCode = -2
    derivationEventUUID = uuid.uuid4().__str__()
    for ef in transcodedFiles:
        # NOTE(review): state shared via globals; each iteration consumes the
        # current outputFileUUID and generates a fresh one at the end.
        global outputFileUUID
        global replacementDic
        global opts
        if opts.commandClassifications == "preservation":
            # Retired call kept here as a string for reference only.
            old = """xmlNormalize(outputFileUUID, \
ef, \
command.eventDetailCommand.stdOut, \
opts.fileUUID, \
opts.objectsDirectory, \
opts.taskUUID, \
opts.date, \
opts.logsDirectory, \
) # {normalized; not normalized}"""
            # Add the new file to the sip
            filePathRelativeToSIP = ef.replace(opts.sipPath, "%SIPDirectory%", 1)
            # addFileToSIP(filePathRelativeToSIP, fileUUID, sipUUID, taskUUID, date, sourceType="ingestion"):
            addFileToSIP(filePathRelativeToSIP, outputFileUUID, opts.sipUUID, uuid.uuid4().__str__(), opts.date, sourceType="creation", use="preservation")
            # Calculate new file checksum
            print >>sys.stderr, "TODO: calculate new file checksum"
            # Add event information to current file
            insertIntoEvents(fileUUID=opts.fileUUID, \
                             eventIdentifierUUID=derivationEventUUID, \
                             eventType="normalization", \
                             eventDateTime=opts.date, \
                             eventDetail=command.eventDetailCommand.stdOut, \
                             eventOutcome="", \
                             eventOutcomeDetailNote=filePathRelativeToSIP)
            updateSizeAndChecksum(outputFileUUID, ef, opts.date, uuid.uuid4().__str__())
            # Add linking information between files
            insertIntoDerivations(sourceFileUUID=opts.fileUUID, derivedFileUUID=outputFileUUID, relatedEventUUID=derivationEventUUID)
            # Fresh UUID/postfix for any subsequent output.
            outputFileUUID = uuid.uuid4().__str__()
            replacementDic["%postfix%"] = "-" + outputFileUUID
def onceNormalized(command, opts, replacementDic):
    """Register the output(s) of a completed normalization command.

    Python 2 variant that also records the FPR command id in the event
    detail and inserts a FilesIDs row for the output format. For
    preservation normalizations it adds each output to the SIP, records a
    "normalization" event on the source file, updates size/checksum, and
    links source and derivative. Mutates ``replacementDic`` for subsequent
    outputs.
    """
    transcodedFiles = []
    if not command.outputLocation:
        command.outputLocation = ""
    if os.path.isfile(command.outputLocation):
        transcodedFiles.append(command.outputLocation)
    elif os.path.isdir(command.outputLocation):
        # Collect every regular file under the output directory.
        for w in os.walk(command.outputLocation):
            path, directories, files = w
            for p in files:
                p = os.path.join(path, p)
                if os.path.isfile(p):
                    transcodedFiles.append(p)
    elif command.outputLocation:
        print >>sys.stderr, command
        print >>sys.stderr, "Error - output file does not exist [" + command.outputLocation + "]"
        command.exitCode = -2
    derivationEventUUID = uuid.uuid4().__str__()
    # Event detail records which FPR command produced the derivative,
    # optionally followed by that command's stdout.
    eventDetail = "ArchivematicaFPRCommandID=\"%s\"" % (command.pk)
    if command.eventDetailCommand != None:
        eventDetail = '%s; %s' % (eventDetail, command.eventDetailCommand.stdOut)
    for ef in transcodedFiles:
        if opts["commandClassifications"] == "preservation":
            # TODO Add manual normalization for files of same name mapping
            # Add the new file to the sip
            filePathRelativeToSIP = ef.replace(opts["sipPath"], "%SIPDirectory%", 1)
            # addFileToSIP(filePathRelativeToSIP, fileUUID, sipUUID, taskUUID, date, sourceType="ingestion"):
            addFileToSIP(filePathRelativeToSIP, replacementDic["%outputFileUUID%"], opts["sipUUID"], uuid.uuid4().__str__(), opts["date"], sourceType="creation", use="preservation")
            # Calculate new file checksum
            # Add event information to current file
            insertIntoEvents(fileUUID=opts["fileUUID"], \
                             eventIdentifierUUID=derivationEventUUID, \
                             eventType="normalization", \
                             eventDateTime=opts["date"], \
                             eventDetail=eventDetail, \
                             eventOutcome="", \
                             eventOutcomeDetailNote=filePathRelativeToSIP)
            updateSizeAndChecksum(replacementDic["%outputFileUUID%"], ef, opts["date"], uuid.uuid4().__str__())
            # Add linking information between files
            insertIntoDerivations(sourceFileUUID=opts["fileUUID"], derivedFileUUID=replacementDic["%outputFileUUID%"], relatedEventUUID=derivationEventUUID)
            # Record the output format for the derivative (string-built SQL).
            sql = "INSERT INTO FilesIDs (fileUUID, formatName, formatVersion, formatRegistryName, formatRegistryKey) VALUES ('%s', '%s', NULL, NULL, NULL);" % (replacementDic["%outputFileUUID%"], command.outputFormat)
            databaseInterface.runSQL(sql)
            # Fresh UUID/postfix for any subsequent output.
            replacementDic["%outputFileUUID%"] = uuid.uuid4().__str__()
            replacementDic["%postfix%"] = "-" + replacementDic["%outputFileUUID%"]
def xmlCreateFileAssociationBetween(originalFileFullPath, outputFromNormalizationFileFullPath, SIPFullPath, sipUUID, eventDetailText, eventOutcomeDetailNote, outputFileUUID=""):
    """Associate a normalization output with its original file.

    Looks up the original file's UUID by SIP-relative path, adds the
    normalized output to the SIP, records its size/checksum, writes a
    "normalization" event on the original, and inserts the derivation link.
    A fresh outputFileUUID is generated when none is supplied.

    NOTE(review): the SELECT escapes the path but not sipUUID — string-built
    SQL; confirm sipUUID is always a trusted UUID.
    """
    # assign file UUID
    date = databaseInterface.getUTCDate()
    if outputFileUUID == "":
        outputFileUUID = uuid.uuid4().__str__()
    originalFilePathRelativeToSIP = originalFileFullPath.replace(
        SIPFullPath, "%SIPDirectory%", 1)
    sql = "SELECT Files.fileUUID FROM Files WHERE removedTime = 0 AND Files.currentLocation = '" + MySQLdb.escape_string(
        originalFilePathRelativeToSIP
    ) + "' AND Files.sipUUID = '" + sipUUID + "';"
    print sql
    rows = databaseInterface.queryAllSQL(sql)
    print rows
    fileUUID = rows[0][0]
    filePathRelativeToSIP = outputFromNormalizationFileFullPath.replace(
        SIPFullPath, "%SIPDirectory%", 1)
    # Register the normalized output as a preservation file in the SIP.
    addFileToSIP(filePathRelativeToSIP,
                 outputFileUUID,
                 sipUUID,
                 uuid.uuid4().__str__(),
                 date,
                 sourceType="creation",
                 use="preservation")
    updateSizeAndChecksum(outputFileUUID, outputFromNormalizationFileFullPath,
                          date,
                          uuid.uuid4().__str__())
    # Record the normalization event on the original file, then link the
    # original to its derivative via that event.
    taskUUID = uuid.uuid4().__str__()
    insertIntoEvents(fileUUID=fileUUID, \
                     eventIdentifierUUID=taskUUID, \
                     eventType="normalization", \
                     eventDateTime=date, \
                     eventDetail=eventDetailText, \
                     eventOutcome="", \
                     eventOutcomeDetailNote=eventOutcomeDetailNote)
    insertIntoDerivations(sourceFileUUID=fileUUID, derivedFileUUID=outputFileUUID, relatedEventUUID=taskUUID)
def update_files(sip_uuid, files):
    """
    Update file information to DB.

    :param sip_uuid: UUID of the SIP to parse the metadata for.
    :param files: List of dicts containing file info.
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    # First pass: register each file and its attributes.
    for info in files:
        # Add file & reingest event.
        reingest_event_id = str(uuid.uuid4())
        fileOperations.addFileToSIP(
            filePathRelativeToSIP=info['original_path'],
            fileUUID=info['uuid'],
            sipUUID=sip_uuid,
            taskUUID=reingest_event_id,
            date=timestamp,
            sourceType="reingestion",
            use=info['use'],
        )
        # Update remaining file info directly; updateSizeAndChecksum is not
        # used because currentlocation must be updated as well.
        models.File.objects.filter(uuid=info['uuid']).update(
            checksum=info['checksum'],
            checksumtype=info['checksumtype'],
            size=info['size'],
            currentlocation=info['current_path']
        )
        if info['format_version']:
            # Add format identification for the file.
            models.FileFormatVersion.objects.create(
                file_uuid_id=info['uuid'],
                format_version=info['format_version']
            )
    # Second pass: derivations, once every file exists in the DB.
    # (May become unnecessary if derivation info can be roundtripped in the
    # METS Reader/Writer.)
    for info in files:
        if info['derivation'] is None:
            continue
        databaseFunctions.insertIntoDerivations(
            sourceFileUUID=info['uuid'],
            derivedFileUUID=info['derivation'],
        )
def create_mets_file(aic, aips): """ Create AIC METS file with AIP information. """ # Prepare constants nsmap = { 'mets': ns.metsNS, 'xlink': ns.xlinkNS, 'xsi': ns.xsiNS, } now = timezone.now().strftime("%Y-%m-%dT%H:%M:%S") # Set up structure E = ElementMaker(namespace=ns.metsNS, nsmap=nsmap) mets = ( E.mets( E.metsHdr(CREATEDATE=now), E.dmdSec( E.mdWrap( E.xmlData(), MDTYPE="DC", # mdWrap ), ID='dmdSec_1', # dmdSec ), E.fileSec(E.fileGrp(), ), E.structMap( E.div( TYPE="Archival Information Collection", DMDID="dmdSec_1", ), TYPE='logical', # structMap ), )) mets.attrib['{{{ns}}}schemaLocation'.format( ns=nsmap['xsi'] )] = "http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version18/mets.xsd" # Add Dublin Core info xml_data = mets.find('mets:dmdSec/mets:mdWrap/mets:xmlData', namespaces=ns.NSMAP) dublincore = archivematicaCreateMETS2.getDublinCore( archivematicaCreateMETS2.SIPMetadataAppliesToType, aic['uuid']) # Add <extent> with number of AIPs extent = etree.SubElement(dublincore, ns.dctermsBNS + 'extent') extent.text = "{} AIPs".format(len(aips)) xml_data.append(dublincore) # Add elements for each AIP file_grp = mets.find('mets:fileSec/mets:fileGrp', namespaces=ns.NSMAP) struct_div = mets.find('mets:structMap/mets:div', namespaces=ns.NSMAP) for aip in aips: file_id = '{name}-{uuid}'.format(name=aip['name'], uuid=aip['uuid']) etree.SubElement(file_grp, ns.metsBNS + 'file', ID=file_id) label = aip['label'] or aip['name'] div = etree.SubElement(struct_div, ns.metsBNS + 'div', LABEL=label) etree.SubElement(div, ns.metsBNS + 'fptr', FILEID=file_id) print etree.tostring(mets, pretty_print=True) # Write out the file file_uuid = str(uuid.uuid4()) basename = os.path.join('metadata', "METS.{}.xml".format(file_uuid)) filename = os.path.join(aic['dir'], basename) with open(filename, 'w') as f: f.write(etree.tostring(mets, pretty_print=True)) fileOperations.addFileToSIP( filePathRelativeToSIP='%SIPDirectory%' + basename, fileUUID=file_uuid, sipUUID=aic['uuid'], 
taskUUID=str(uuid.uuid4()), # Unsure what should go here date=now, sourceType="aip creation", use='metadata') # To make this work with the createMETS2 (for SIPs) databaseFunctions.insertIntoDerivations(file_uuid, file_uuid) # Insert the count of AIPs in the AIC into UnitVariables, so it can be # indexed later UnitVariable.objects.create(unittype="SIP", unituuid=aic['uuid'], variable="AIPsinAIC", variablevalue=str(len(aips)))
def onceNormalized(command, opts, replacementDic):
    """Register the output(s) of a completed normalization command.

    Python 2 variant (duplicate lineage of the version above): records the
    FPR command id in the event detail, adds each preservation output to the
    SIP, records a "normalization" event on the source file, updates
    size/checksum, links source and derivative, and inserts a FilesIDs row
    for the output format. Mutates ``replacementDic`` for further outputs.
    """
    transcodedFiles = []
    if not command.outputLocation:
        command.outputLocation = ""
    if os.path.isfile(command.outputLocation):
        transcodedFiles.append(command.outputLocation)
    elif os.path.isdir(command.outputLocation):
        # Collect every regular file under the output directory.
        for w in os.walk(command.outputLocation):
            path, directories, files = w
            for p in files:
                p = os.path.join(path, p)
                if os.path.isfile(p):
                    transcodedFiles.append(p)
    elif command.outputLocation:
        print >> sys.stderr, command
        print >> sys.stderr, "Error - output file does not exist [" + command.outputLocation + "]"
        command.exitCode = -2
    derivationEventUUID = uuid.uuid4().__str__()
    # Event detail names the FPR command, optionally with its stdout.
    eventDetail = "ArchivematicaFPRCommandID=\"%s\"" % (command.pk)
    if command.eventDetailCommand != None:
        eventDetail = '%s; %s' % (eventDetail, command.eventDetailCommand.stdOut)
    for ef in transcodedFiles:
        if opts["commandClassifications"] == "preservation":
            # TODO Add manual normalization for files of same name mapping
            # Add the new file to the sip
            filePathRelativeToSIP = ef.replace(opts["sipPath"], "%SIPDirectory%", 1)
            # addFileToSIP(filePathRelativeToSIP, fileUUID, sipUUID, taskUUID, date, sourceType="ingestion"):
            addFileToSIP(filePathRelativeToSIP, replacementDic["%outputFileUUID%"], opts["sipUUID"], uuid.uuid4().__str__(), opts["date"], sourceType="creation", use="preservation")
            # Calculate new file checksum
            # Add event information to current file
            insertIntoEvents(fileUUID=opts["fileUUID"], \
                             eventIdentifierUUID=derivationEventUUID, \
                             eventType="normalization", \
                             eventDateTime=opts["date"], \
                             eventDetail=eventDetail, \
                             eventOutcome="", \
                             eventOutcomeDetailNote=filePathRelativeToSIP)
            updateSizeAndChecksum(replacementDic["%outputFileUUID%"], ef, opts["date"], uuid.uuid4().__str__())
            # Add linking information between files
            insertIntoDerivations(
                sourceFileUUID=opts["fileUUID"],
                derivedFileUUID=replacementDic["%outputFileUUID%"],
                relatedEventUUID=derivationEventUUID)
            # Record the output format for the derivative (string-built SQL).
            sql = "INSERT INTO FilesIDs (fileUUID, formatName, formatVersion, formatRegistryName, formatRegistryKey) VALUES ('%s', '%s', NULL, NULL, NULL);" % (
                replacementDic["%outputFileUUID%"], command.outputFormat)
            databaseInterface.runSQL(sql)
            # Fresh UUID/postfix for any subsequent output.
            replacementDic["%outputFileUUID%"] = uuid.uuid4().__str__()
            replacementDic[
                "%postfix%"] = "-" + replacementDic["%outputFileUUID%"]
def create_mets_file(aic, aips, job): """ Create AIC METS file with AIP information. """ # Prepare constants nsmap = {"mets": ns.metsNS, "xlink": ns.xlinkNS, "xsi": ns.xsiNS} now = timezone.now().strftime("%Y-%m-%dT%H:%M:%S") # Set up structure E = ElementMaker(namespace=ns.metsNS, nsmap=nsmap) mets = E.mets( E.metsHdr(CREATEDATE=now), E.dmdSec(E.mdWrap(E.xmlData(), MDTYPE="DC"), ID="dmdSec_1"), # mdWrap # dmdSec E.fileSec(E.fileGrp()), E.structMap( E.div(TYPE="Archival Information Collection", DMDID="dmdSec_1"), TYPE="logical", # structMap ), ) mets.attrib["{{{ns}}}schemaLocation".format( ns=nsmap["xsi"] )] = "http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version1121/mets.xsd" # Add Dublin Core info xml_data = mets.find("mets:dmdSec/mets:mdWrap/mets:xmlData", namespaces=ns.NSMAP) dublincore = create_mets_v2.getDublinCore( create_mets_v2.SIPMetadataAppliesToType, aic["uuid"]) # Add <extent> with number of AIPs extent = etree.SubElement(dublincore, ns.dctermsBNS + "extent") extent.text = "{} AIPs".format(len(aips)) xml_data.append(dublincore) # Add elements for each AIP file_grp = mets.find("mets:fileSec/mets:fileGrp", namespaces=ns.NSMAP) struct_div = mets.find("mets:structMap/mets:div", namespaces=ns.NSMAP) for aip in aips: file_id = "{name}-{uuid}".format(name=aip["name"], uuid=aip["uuid"]) etree.SubElement(file_grp, ns.metsBNS + "file", ID=file_id) label = aip["label"] or aip["name"] div = etree.SubElement(struct_div, ns.metsBNS + "div", LABEL=label) etree.SubElement(div, ns.metsBNS + "fptr", FILEID=file_id) job.pyprint(etree.tostring(mets, pretty_print=True)) # Write out the file file_uuid = str(uuid.uuid4()) basename = os.path.join("metadata", "METS.{}.xml".format(file_uuid)) filename = os.path.join(aic["dir"], basename) with open(filename, "w") as f: f.write( etree.tostring(mets, pretty_print=True, xml_declaration=True, encoding="utf-8")) fileOperations.addFileToSIP( filePathRelativeToSIP="%SIPDirectory%" + basename, fileUUID=file_uuid, 
sipUUID=aic["uuid"], taskUUID=str(uuid.uuid4()), # Unsure what should go here date=now, sourceType="aip creation", use="metadata", ) # To make this work with the createMETS2 (for SIPs) databaseFunctions.insertIntoDerivations(file_uuid, file_uuid) # Insert the count of AIPs in the AIC into UnitVariables, so it can be # indexed later UnitVariable.objects.create( unittype="SIP", unituuid=aic["uuid"], variable="AIPsinAIC", variablevalue=str(len(aips)), )
e.event_outcome_detail = dstR e.save() print('Updated the eventOutcomeDetailNote of an existing normalization' ' Event for file {}. Not creating a Derivation object'.format( fileUUID)) except Event.DoesNotExist: # No normalization event was created in normalize.py - probably manually # normalized during Ingest derivationEventUUID = str(uuid.uuid4()) databaseFunctions.insertIntoEvents(fileUUID=original_file.uuid, eventIdentifierUUID=derivationEventUUID, eventType="normalization", eventDateTime=date, eventDetail="manual normalization", eventOutcome="", eventOutcomeDetailNote=dstR) print('Created a manual normalization Event for file {}.'.format( original_file.uuid)) # Add linking information between files # Assuming that if an event already exists, then the derivation does as well databaseFunctions.insertIntoDerivations( sourceFileUUID=original_file.uuid, derivedFileUUID=fileUUID, relatedEventUUID=derivationEventUUID) print('Created a Derivation for original file {}, derived file {}, and' ' event {}'.format(original_file.uuid, fileUUID, derivationEventUUID)) exit(0)
# Script fragment (Python 2): rename a manually-normalized preservation file
# to "<original-name>-<fileUUID>.<ext>" next to its original, update its DB
# location, and record the normalization event plus derivation link.
# NOTE(review): filePath, fileUUID, originalFilePath, originalFileUUID,
# SIPDirectory and date are defined earlier in the script (not shown here).
basename = os.path.basename(filePath)
i = basename.rfind(".")
dstFile = basename[:i] + "-" + fileUUID + basename[i:]
dstDir = os.path.dirname(originalFilePath.replace("%SIPDirectory%", SIPDirectory, 1))
dst = os.path.join(dstDir, dstFile)
dstR = dst.replace(SIPDirectory, "%SIPDirectory%", 1)
if os.path.isfile(dst) or os.path.isdir(dst):
    print >>sys.stderr, "already exists:", dstR
    exit(2)
# Rename the file or directory src to dst. If dst is a directory, OSError
# will be raised. On Unix, if dst exists and is a file, it will be replaced
# silently if the user has permission. The operation may fail on some Unix
# flavors if src and dst are on different filesystems.
# see http://docs.python.org/2/library/os.html
os.rename(filePath, dst)
# Keep the DB location in sync with the on-disk rename (string-built SQL).
sql = """UPDATE Files SET currentLocation='%s' WHERE fileUUID='%s';""" % (dstR, fileUUID)
databaseInterface.runSQL(sql)
derivationEventUUID = uuid.uuid4().__str__()
insertIntoEvents(fileUUID=originalFileUUID, \
                 eventIdentifierUUID=derivationEventUUID, \
                 eventType="normalization", \
                 eventDateTime=date, \
                 eventDetail="manual normalization", \
                 eventOutcome="", \
                 eventOutcomeDetailNote=dstR)
# Add linking information between files
insertIntoDerivations(sourceFileUUID=originalFileUUID, derivedFileUUID=fileUUID, relatedEventUUID=derivationEventUUID)
exit(0)
def main(job):
    """Match a manually-normalized preservation file to its original,
    rename it to "<original-name>-<fileUUID>.<ext>" beside the original,
    and record the normalization event and derivation link.

    Returns 0 on success; 2/3/4 on the various match/rename failures.
    """
    # "%SIPUUID%" "%SIPName%" "%SIPDirectory%" "%fileUUID%" "%filePath%"
    # job.args[2] (SIPName) is unused.
    SIPUUID = job.args[1]
    SIPDirectory = job.args[3]
    fileUUID = job.args[4]
    filePath = job.args[5]
    date = job.args[6]

    # Search for original file associated with preservation file given in filePath
    filePathLike = filePath.replace(
        os.path.join(SIPDirectory, "objects", "manualNormalization",
                     "preservation"), "%SIPDirectory%objects", 1)
    i = filePathLike.rfind(".")
    k = os.path.basename(filePath).rfind(".")
    if i != -1 and k != -1:
        filePathLike = filePathLike[:i + 1]
        # Matches "path/to/file/filename." Includes . so it doesn't false match foobar.txt when we wanted foo.txt
        filePathLike1 = filePathLike
        # Matches the exact filename. For files with no extension.
        filePathLike2 = filePathLike[:-1]
    try:
        path_condition = Q(currentlocation__startswith=filePathLike1) | Q(
            currentlocation=filePathLike2)
        original_file = File.objects.get(path_condition,
                                         removedtime__isnull=True,
                                         filegrpuse="original",
                                         sip_id=SIPUUID)
    except (File.DoesNotExist, File.MultipleObjectsReturned) as e:
        # Original file was not found, or there is more than one original file with
        # the same filename (differing extensions)
        # Look for a CSV that will specify the mapping
        csv_path = os.path.join(SIPDirectory, "objects", "manualNormalization",
                                "normalization.csv")
        if os.path.isfile(csv_path):
            try:
                preservation_file = filePath[
                    filePath.index('manualNormalization/preservation/'):]
            except ValueError:
                job.print_error(
                    "{0} not in manualNormalization directory".format(
                        filePath))
                return 4
            original = fileOperations.findFileInNormalizationCSV(
                csv_path,
                "preservation",
                preservation_file,
                SIPUUID,
                printfn=job.pyprint)
            if original is None:
                if isinstance(e, File.DoesNotExist):
                    job.print_error("No matching file for: {0}".format(
                        filePath.replace(SIPDirectory, "%SIPDirectory%")))
                    return 3
                else:
                    job.print_error(
                        "Could not find {preservation_file} in (unknown)".
                        format(preservation_file=preservation_file,
                               filename=csv_path))
                    return 2
            # If we found the original file, retrieve it from the DB
            original_file = File.objects.get(
                removedtime__isnull=True,
                filegrpuse="original",
                originallocation__endswith=original,
                sip_id=SIPUUID)
        else:
            if isinstance(e, File.DoesNotExist):
                job.print_error(
                    "No matching file for: ",
                    filePath.replace(SIPDirectory, "%SIPDirectory%", 1))
                return 3
            elif isinstance(e, File.MultipleObjectsReturned):
                job.print_error(
                    "Too many possible files for: ",
                    filePath.replace(SIPDirectory, "%SIPDirectory%", 1))
                return 2
    # We found the original file somewhere above
    job.print_output(
        "Matched original file %s (%s) to preservation file %s (%s)" %
        (original_file.currentlocation, original_file.uuid, filePath,
         fileUUID))
    # Generate the new preservation path: path/to/original/filename-uuid.ext
    basename = os.path.basename(filePath)
    i = basename.rfind(".")
    dstFile = basename[:i] + "-" + fileUUID + basename[i:]
    dstDir = os.path.dirname(
        original_file.currentlocation.replace("%SIPDirectory%", SIPDirectory,
                                              1))
    dst = os.path.join(dstDir, dstFile)
    dstR = dst.replace(SIPDirectory, "%SIPDirectory%", 1)
    if os.path.exists(dst):
        job.print_error("already exists:", dstR)
        return 2
    # Rename the preservation file
    job.print_output('Renaming preservation file', filePath, 'to', dst)
    os.rename(filePath, dst)
    # Update the preservation file's location
    File.objects.filter(uuid=fileUUID).update(currentlocation=dstR)
    try:
        # Normalization event already exists, so just update it
        # fileUUID, eventIdentifierUUID, eventType, eventDateTime, eventDetail
        # probably already correct, and we only set eventOutcomeDetailNote here
        # Not using .filter().update() because that doesn't generate an exception
        e = Event.objects.get(event_type="normalization",
                              file_uuid=original_file)
        e.event_outcome_detail = dstR
        e.save()
        job.print_output(
            'Updated the eventOutcomeDetailNote of an existing normalization'
            ' Event for file {}. Not creating a Derivation object'.format(
                fileUUID))
    except Event.DoesNotExist:
        # No normalization event was created in normalize.py - probably manually
        # normalized during Ingest
        derivationEventUUID = str(uuid.uuid4())
        databaseFunctions.insertIntoEvents(
            fileUUID=original_file.uuid,
            eventIdentifierUUID=derivationEventUUID,
            eventType="normalization",
            eventDateTime=date,
            eventDetail="manual normalization",
            eventOutcome="",
            eventOutcomeDetailNote=dstR)
        job.print_output(
            'Created a manual normalization Event for file {}.'.format(
                original_file.uuid))
        # Add linking information between files
        # Assuming that if an event already exists, then the derivation does as well
        databaseFunctions.insertIntoDerivations(
            sourceFileUUID=original_file.uuid,
            derivedFileUUID=fileUUID,
            relatedEventUUID=derivationEventUUID)
        job.print_output(
            'Created a Derivation for original file {}, derived file {}, and'
            ' event {}'.format(original_file.uuid, fileUUID,
                               derivationEventUUID))
    return 0
# Script fragment (Python 2, databaseFunctions variant): move the manually
# normalized file into place, update its DB location, and record the
# normalization event plus the derivation link.
# NOTE(review): dstDir, dstFile, filePath, fileUUID, originalFileUUID,
# SIPDirectory and date are defined earlier in the script (not shown here).
dst = os.path.join(dstDir, dstFile)
dstR = dst.replace(SIPDirectory, "%SIPDirectory%", 1)
if os.path.isfile(dst) or os.path.isdir(dst):
    print >>sys.stderr, "already exists:", dstR
    exit(2)
# Rename the file or directory src to dst. If dst is a directory, OSError
# will be raised. On Unix, if dst exists and is a file, it will be replaced
# silently if the user has permission. The operation may fail on some Unix
# flavors if src and dst are on different filesystems.
# see http://docs.python.org/2/library/os.html
os.rename(filePath, dst)
# Keep the DB location in sync with the on-disk rename (string-built SQL).
sql = """UPDATE Files SET currentLocation='%s' WHERE fileUUID='%s';""" % (dstR, fileUUID)
databaseInterface.runSQL(sql)
derivationEventUUID = uuid.uuid4().__str__()
databaseFunctions.insertIntoEvents(
    fileUUID=originalFileUUID,
    eventIdentifierUUID=derivationEventUUID,
    eventType="normalization",
    eventDateTime=date,
    eventDetail="manual normalization",
    eventOutcome="",
    eventOutcomeDetailNote=dstR)
# Add linking information between files
databaseFunctions.insertIntoDerivations(
    sourceFileUUID=originalFileUUID,
    derivedFileUUID=fileUUID,
    relatedEventUUID=derivationEventUUID)
exit(0)
# Script fragment (Python 2): compute the destination beside the original,
# move the manually normalized file there, update its DB location, and
# record the normalization event plus the derivation link.
# NOTE(review): originalFilePath, dstFile, filePath, fileUUID,
# originalFileUUID, SIPDirectory and date come from earlier in the script.
dstDir = os.path.dirname(
    originalFilePath.replace("%SIPDirectory%", SIPDirectory, 1))
dst = os.path.join(dstDir, dstFile)
dstR = dst.replace(SIPDirectory, "%SIPDirectory%", 1)
if os.path.isfile(dst) or os.path.isdir(dst):
    print >> sys.stderr, "already exists:", dstR
    exit(2)
# Rename the file or directory src to dst. If dst is a directory, OSError
# will be raised. On Unix, if dst exists and is a file, it will be replaced
# silently if the user has permission. The operation may fail on some Unix
# flavors if src and dst are on different filesystems.
# see http://docs.python.org/2/library/os.html
os.rename(filePath, dst)
# Keep the DB location in sync with the on-disk rename (string-built SQL).
sql = """UPDATE Files SET currentLocation='%s' WHERE fileUUID='%s';""" % (
    dstR, fileUUID)
databaseInterface.runSQL(sql)
derivationEventUUID = uuid.uuid4().__str__()
insertIntoEvents(fileUUID=originalFileUUID, \
                 eventIdentifierUUID=derivationEventUUID, \
                 eventType="normalization", \
                 eventDateTime=date, \
                 eventDetail="manual normalization", \
                 eventOutcome="", \
                 eventOutcomeDetailNote=dstR)
# Add linking information between files
insertIntoDerivations(sourceFileUUID=originalFileUUID, derivedFileUUID=fileUUID, relatedEventUUID=derivationEventUUID)
exit(0)
def onceNormalized(command):
    """Register the output(s) of a completed normalization command.

    Older (Python 2) variant operating on the module globals
    ``outputFileUUID``, ``replacementDic`` and ``opts``. For preservation
    normalizations it adds each output file to the SIP, records a
    "normalization" event on the source file, updates size/checksum, and
    links source and derivative.
    """
    transcodedFiles = []
    if not command.outputLocation:
        command.outputLocation = ""
    elif os.path.isfile(command.outputLocation):
        transcodedFiles.append(command.outputLocation)
    elif os.path.isdir(command.outputLocation):
        # Collect every regular file under the output directory.
        for w in os.walk(command.outputLocation):
            path, directories, files = w
            for p in files:
                p = os.path.join(path, p)
                if os.path.isfile(p):
                    transcodedFiles.append(p)
    elif command.outputLocation:
        print >> sys.stderr, command
        print >> sys.stderr, "Error - output file does not exist [" + command.outputLocation + "]"
        command.exitCode = -2
    derivationEventUUID = uuid.uuid4().__str__()
    for ef in transcodedFiles:
        # NOTE(review): state shared via globals; each iteration consumes the
        # current outputFileUUID and generates a fresh one at the end.
        global outputFileUUID
        global replacementDic
        global opts
        if opts.commandClassifications == "preservation":
            # Retired call kept here as a string for reference only.
            old = """xmlNormalize(outputFileUUID, \
ef, \
command.eventDetailCommand.stdOut, \
opts.fileUUID, \
opts.objectsDirectory, \
opts.taskUUID, \
opts.date, \
opts.logsDirectory, \
) # {normalized; not normalized}"""
            # Add the new file to the sip
            filePathRelativeToSIP = ef.replace(opts.sipPath, "%SIPDirectory%", 1)
            # addFileToSIP(filePathRelativeToSIP, fileUUID, sipUUID, taskUUID, date, sourceType="ingestion"):
            addFileToSIP(filePathRelativeToSIP, outputFileUUID, opts.sipUUID, uuid.uuid4().__str__(), opts.date, sourceType="creation", use="preservation")
            # Calculate new file checksum
            print >> sys.stderr, "TODO: calculate new file checksum"
            # Add event information to current file
            insertIntoEvents(fileUUID=opts.fileUUID, \
                             eventIdentifierUUID=derivationEventUUID, \
                             eventType="normalization", \
                             eventDateTime=opts.date, \
                             eventDetail=command.eventDetailCommand.stdOut, \
                             eventOutcome="", \
                             eventOutcomeDetailNote=filePathRelativeToSIP)
            updateSizeAndChecksum(outputFileUUID, ef, opts.date, uuid.uuid4().__str__())
            # Add linking information between files
            insertIntoDerivations(sourceFileUUID=opts.fileUUID, derivedFileUUID=outputFileUUID, relatedEventUUID=derivationEventUUID)
            # Fresh UUID/postfix for any subsequent output.
            outputFileUUID = uuid.uuid4().__str__()
            replacementDic["%postfix%"] = "-" + outputFileUUID