def verifyChecksum(fileUUID, filePath, date, eventIdentifierUUID):
    """Compare a file's current checksum with the one stored for it, then exit.

    The stored value is read from the ``File`` row whose ``uuid`` equals
    ``fileUUID``; the current value comes from ``sha_for_file(filePath)``.
    A "fixity check" event carrying the outcome is recorded through
    ``databaseFunctions.insertIntoEvents``.

    This function never returns normally; it always ends the process:
      * exit status 1 - no checksum is stored for the file (no event recorded)
      * exit status 2 - the checksums differ (event outcome "Fail")
      * exit status 0 - the checksums match (event outcome "Pass")

    Parameters:
      fileUUID            -- UUID used to look up the File row and tag the event
      filePath            -- path of the file to checksum
      date                -- value stored as the event's date/time
      eventIdentifierUUID -- identifier stored on the recorded event
    """
    f = File.objects.get(uuid=fileUUID)
    # None is checked as well as the empty/"None" strings: without it a missing
    # value would reach the string concatenation below and raise TypeError.
    if f.checksum in (None, "", "None"):
        print >> sys.stderr, "No checksum found in database for file:", fileUUID, filePath
        sys.exit(1)

    checksumFile = sha_for_file(filePath)
    if checksumFile != f.checksum:
        eventOutcomeDetailNote = str(checksumFile) + " != " + f.checksum
        eventOutcome = "Fail"
        exitCode = 2
        print >> sys.stderr, "Checksums do not match:", fileUUID, filePath
        print >> sys.stderr, eventOutcomeDetailNote
    else:
        eventOutcomeDetailNote = "%s %s" % (str(checksumFile), "verified")
        eventOutcome = "Pass"
        exitCode = 0

    databaseFunctions.insertIntoEvents(
        fileUUID=fileUUID,
        eventIdentifierUUID=eventIdentifierUUID,
        eventType="fixity check",
        eventDateTime=date,
        eventOutcome=eventOutcome,
        eventOutcomeDetailNote=eventOutcomeDetailNote,
        eventDetail="program=\"python\"; module=\"hashlib.sha256()\"")

    # sys.exit rather than the site-provided ``exit`` helper, which is not
    # guaranteed to exist in every interpreter configuration.
    sys.exit(exitCode)
def verifyMetsFileSecChecksums(metsFile, date, taskUUID, relativeDirectory="./"): print metsFile exitCode = 0 tree = etree.parse(metsFile) root = tree.getroot() for item in root.findall( "{http://www.loc.gov/METS/}fileSec/{http://www.loc.gov/METS/}fileGrp/{http://www.loc.gov/METS/}file" ): #print etree.tostring(item) #print item checksum = item.get("CHECKSUM") checksumType = item.get("CHECKSUMTYPE") for item2 in item: if item2.tag == "{http://www.loc.gov/METS/}FLocat": #print "floc: ", item2.tag, etree.tostring(item2) #print item2.attrib fileLocation = item2.get("{http://www.w3.org/1999/xlink}href") #print "%s - %s - %s " % (checksumType, checksum, fileLocation) fileFullPath = os.path.join(relativeDirectory, fileLocation) if checksumType == "MD5": checksum2 = md5_for_file(fileFullPath) eventDetail = "program=\"python\"; module=\"hashlib.sha256()\"" elif checksumType == "sha256": checksum2 = sha_for_file(fileFullPath) eventDetail = "program=\"python\"; module=\"hashlib.md5()\"" else: print >> sys.stderr, "Unsupported checksum type: %s" % ( checksumType.__str__()) exit(300) if checksum != checksum2: #eventOutcomeDetailNote = checksumFile.__str__() + " != " + checksumDB.__str__() eventOutcome = "Fail" print "%s - %s - %s" % ((checksum == checksum2).__str__(), checksum.__str__(), checksum2.__str__()) print >> sys.stderr, eventOutcome, fileFullPath exitCode = exitCode + 22 else: #eventOutcomeDetailNote = checksumFile.__str__() + "verified" eventOutcome = "Pass" print eventOutcome, fileLocation return exitCode #insertIntoEvents(fileUUID="", eventIdentifierUUID="", eventType="", eventDateTime=databaseInterface.getUTCDate(), eventDetail="", eventOutcome="", eventOutcomeDetailNote="") databaseFunctions.insertIntoEvents(fileUUID=fileUUID, \ eventIdentifierUUID=eventIdentifierUUID, \ eventType="fixity check", \ eventDateTime=date, \ eventOutcome=eventOutcome, \ eventOutcomeDetailNote=eventOutcomeDetailNote, \ eventDetail=eventDetail)
def verifyMetsFileSecChecksums(metsFile, date, taskUUID, relativeDirectory="./"): print metsFile exitCode = 0 tree = etree.parse(metsFile) root = tree.getroot() for item in root.findall("{http://www.loc.gov/METS/}fileSec/{http://www.loc.gov/METS/}fileGrp/{http://www.loc.gov/METS/}file"): #print etree.tostring(item) #print item checksum = item.get("CHECKSUM") checksumType = item.get("CHECKSUMTYPE") for item2 in item: if item2.tag == "{http://www.loc.gov/METS/}FLocat": #print "floc: ", item2.tag, etree.tostring(item2) #print item2.attrib fileLocation = item2.get("{http://www.w3.org/1999/xlink}href") #print "%s - %s - %s " % (checksumType, checksum, fileLocation) fileFullPath = os.path.join(relativeDirectory, fileLocation) if checksumType == "MD5": checksum2 = md5_for_file(fileFullPath) eventDetail = "program=\"python\"; module=\"hashlib.sha256()\"" elif checksumType == "sha256": checksum2 = sha_for_file(fileFullPath) eventDetail = "program=\"python\"; module=\"hashlib.md5()\"" else: print >>sys.stderr, "Unsupported checksum type: %s" % (checksumType.__str__()) exit(300) if checksum != checksum2: #eventOutcomeDetailNote = checksumFile.__str__() + " != " + checksumDB.__str__() eventOutcome="Fail" print "%s - %s - %s" % ((checksum == checksum2).__str__(), checksum.__str__(), checksum2.__str__()) print >>sys.stderr, eventOutcome, fileFullPath exitCode = exitCode + 22 else: #eventOutcomeDetailNote = checksumFile.__str__() + "verified" eventOutcome="Pass" print eventOutcome, fileLocation return exitCode #insertIntoEvents(fileUUID="", eventIdentifierUUID="", eventType="", eventDateTime=databaseInterface.getUTCDate(), eventDetail="", eventOutcome="", eventOutcomeDetailNote="") databaseFunctions.insertIntoEvents(fileUUID=fileUUID, \ eventIdentifierUUID=eventIdentifierUUID, \ eventType="fixity check", \ eventDateTime=date, \ eventOutcome=eventOutcome, \ eventOutcomeDetailNote=eventOutcomeDetailNote, \ eventDetail=eventDetail)
def updateSizeAndChecksum(fileUUID, filePath, date, eventIdentifierUUID):
    """Store a file's size and checksum and record the digest calculation.

    The size comes from ``os.path.getsize(filePath)`` and the checksum from
    ``sha_for_file(filePath)``. Both are written to the ``File`` rows whose
    ``uuid`` equals ``fileUUID``, and a "message digest calculation" event
    carrying the checksum is recorded through ``insertIntoEvents``.

    Parameters:
      fileUUID            -- UUID selecting the File row(s) and tagging the event
      filePath            -- path of the file to measure and checksum
      date                -- value stored as the event's date/time
      eventIdentifierUUID -- identifier stored on the recorded event
    """
    fileSize = os.path.getsize(filePath)
    checksum = str(sha_for_file(filePath))

    File.objects.filter(uuid=fileUUID).update(size=fileSize, checksum=checksum)

    # Forward the caller-supplied event identifier; it was previously accepted
    # but silently dropped.
    insertIntoEvents(
        fileUUID=fileUUID,
        eventIdentifierUUID=eventIdentifierUUID,
        eventType="message digest calculation",
        eventDateTime=date,
        eventDetail="program=\"python\"; module=\"hashlib.sha256()\"",
        eventOutcomeDetailNote=checksum)
def updateSizeAndChecksum(fileUUID, filePath, date, eventIdentifierUUID):
    """Write a file's size and checksum to the Files table and log the event.

    The size is taken from ``os.path.getsize`` and the checksum from
    ``sha_for_file``; both are stored as text on the row whose ``fileUUID``
    matches, after which a "message digest calculation" event holding the
    checksum is recorded via ``insertIntoEvents``.
    """
    size_text = str(os.path.getsize(filePath))
    digest_text = str(sha_for_file(filePath))

    # NOTE(review): the statement is assembled by string concatenation; if
    # databaseInterface.runSQL supports bound parameters, prefer them - confirm.
    statement = "".join([
        "UPDATE Files SET fileSize='", size_text,
        "', checksum='", digest_text,
        "' WHERE fileUUID='", fileUUID,
        "'",
    ])
    databaseInterface.runSQL(statement)

    event_fields = dict(
        fileUUID=fileUUID,
        eventIdentifierUUID=eventIdentifierUUID,
        eventType="message digest calculation",
        eventDateTime=date,
        eventDetail="program=\"python\"; module=\"hashlib.sha256()\"",
        eventOutcomeDetailNote=digest_text,
    )
    insertIntoEvents(**event_fields)
def checksumFile(filePath, fileUUID):
    """Checksum a transfer file and record a "message digest calculation" event.

    The first "transfer/" occurrence in ``filePath`` is replaced with the
    module-level ``transferDirectory`` to obtain the real path, which is then
    passed to ``sha_for_file``. The resulting digest is stored as the event's
    outcome detail note under a freshly generated event identifier.
    """
    resolved_path = filePath.replace("transfer/", transferDirectory, 1)
    digest = sha_for_file(resolved_path)
    event_time = databaseInterface.getUTCDate()

    # Create Event
    databaseInterface.insertIntoEvents(
        fileUUID=fileUUID,
        eventIdentifierUUID=str(uuid.uuid4()),
        eventType="message digest calculation",
        eventDateTime=event_time,
        eventDetail='program="python"; module="hashlib.sha256()" ; file="/usr/lib/python2.6/hashlib.pyc"',
        eventOutcome="",
        eventOutcomeDetailNote=str(digest),
    )
def verifyChecksum(fileUUID, filePath, date, eventIdentifierUUID):
    """Compare a file's current checksum with the one stored for it, then exit.

    The stored value is selected from the ``Files`` table by ``fileUUID`` (if
    several rows come back, the last one wins); the current value comes from
    ``sha_for_file(filePath)``. A "fixity check" event carrying the outcome is
    recorded through ``databaseFunctions.insertIntoEvents``.

    This function never returns normally; it always ends the process:
      * exit status 1 - no checksum is stored for the file (no event recorded)
      * exit status 2 - the checksums differ (event outcome "Fail")
      * exit status 0 - the checksums match (event outcome "Pass")

    Parameters:
      fileUUID            -- UUID used to look up the stored checksum and tag the event
      filePath            -- path of the file to checksum
      date                -- value stored as the event's date/time
      eventIdentifierUUID -- identifier stored on the recorded event
    """
    # NOTE(review): the query is assembled by string concatenation; if
    # databaseInterface.querySQL supports bound parameters, prefer them - confirm.
    sql = """SELECT checksum FROM Files WHERE fileUUID = '""" + fileUUID + "'"
    c, sqlLock = databaseInterface.querySQL(sql)
    checksumDB = ""
    try:
        row = c.fetchone()
        while row is not None:
            checksumDB = row[0]
            row = c.fetchone()
    finally:
        # Always hand the lock back, even when fetching raises; otherwise every
        # later query would block on it.
        sqlLock.release()

    if checksumDB in (None, "", "None"):
        print >> sys.stderr, "No checksum found in database for file:", fileUUID, filePath
        sys.exit(1)

    checksumFile = sha_for_file(filePath)
    if checksumFile != checksumDB:
        eventOutcomeDetailNote = str(checksumFile) + " != " + str(checksumDB)
        eventOutcome = "Fail"
        exitCode = 2
        print >> sys.stderr, "Checksums do not match:", fileUUID, filePath
        print >> sys.stderr, eventOutcomeDetailNote
    else:
        eventOutcomeDetailNote = "%s %s" % (str(checksumFile), "verified")
        eventOutcome = "Pass"
        exitCode = 0

    databaseFunctions.insertIntoEvents(
        fileUUID=fileUUID,
        eventIdentifierUUID=eventIdentifierUUID,
        eventType="fixity check",
        eventDateTime=date,
        eventOutcome=eventOutcome,
        eventOutcomeDetailNote=eventOutcomeDetailNote,
        eventDetail="program=\"python\"; module=\"hashlib.sha256()\"")

    # sys.exit rather than the site-provided ``exit`` helper, which is not
    # guaranteed to exist in every interpreter configuration.
    sys.exit(exitCode)
def verifyChecksum(fileUUID, filePath, date, eventIdentifierUUID):
    """Compare a file's current checksum with the one stored for it, then exit.

    The stored value is selected from the ``Files`` table by ``fileUUID`` (if
    several rows come back, the last one wins); the current value comes from
    ``sha_for_file(filePath)``. A "fixity check" event carrying the outcome is
    recorded through ``databaseFunctions.insertIntoEvents``.

    This function never returns normally; it always ends the process:
      * exit status 1 - no checksum is stored for the file (no event recorded)
      * exit status 2 - the checksums differ (event outcome "Fail")
      * exit status 0 - the checksums match (event outcome "Pass")

    Parameters:
      fileUUID            -- UUID used to look up the stored checksum and tag the event
      filePath            -- path of the file to checksum
      date                -- value stored as the event's date/time
      eventIdentifierUUID -- identifier stored on the recorded event
    """
    # NOTE(review): the query is assembled by string concatenation; if
    # databaseInterface.querySQL supports bound parameters, prefer them - confirm.
    sql = """SELECT checksum FROM Files WHERE fileUUID = '""" + fileUUID + "'"
    c, sqlLock = databaseInterface.querySQL(sql)
    checksumDB = ""
    try:
        row = c.fetchone()
        while row is not None:
            checksumDB = row[0]
            row = c.fetchone()
    finally:
        # Always hand the lock back, even when fetching raises; otherwise every
        # later query would block on it.
        sqlLock.release()

    if checksumDB in (None, "", "None"):
        print >> sys.stderr, "No checksum found in database for file:", fileUUID, filePath
        sys.exit(1)

    checksumFile = sha_for_file(filePath)
    if checksumFile != checksumDB:
        eventOutcomeDetailNote = str(checksumFile) + " != " + str(checksumDB)
        eventOutcome = "Fail"
        exitCode = 2
        print >> sys.stderr, "Checksums do not match:", fileUUID, filePath
        print >> sys.stderr, eventOutcomeDetailNote
    else:
        eventOutcomeDetailNote = "%s %s" % (str(checksumFile), "verified")
        eventOutcome = "Pass"
        exitCode = 0

    databaseFunctions.insertIntoEvents(
        fileUUID=fileUUID,
        eventIdentifierUUID=eventIdentifierUUID,
        eventType="fixity check",
        eventDateTime=date,
        eventOutcome=eventOutcome,
        eventOutcomeDetailNote=eventOutcomeDetailNote,
        eventDetail="program=\"python\"; module=\"hashlib.sha256()\"")

    # sys.exit rather than the site-provided ``exit`` helper, which is not
    # guaranteed to exist in every interpreter configuration.
    sys.exit(exitCode)
def main(aip_uuid, aip_name, compression, sip_dir, aip_filename): # Prep work mets_schema_location = 'http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version18/mets.xsd' premis_schema_location = 'info:lc/xmlns/premis-v2 http://www.loc.gov/standards/premis/v2/premis-v2-2.xsd' # Datetime format string from http://docs.python.org/2/library/datetime.html # %Y = 4 digit year, %m = 2 digit month, %d = 2 digit day # %H = 24-hour hour, %M = 2-digit minute, %S = 2 digit second now = timezone.now().strftime("%Y-%m-%dT%H:%M:%S") aip_identifier = aip_name + '-' + aip_uuid aip_path = os.path.join(sip_dir, aip_filename) # Get archive tool and version program, algorithm = compression.split('-') # Pointer files are not written for uncompressed AIPs; # the purpose of the pointer file is primarily to provide information # on how to read a compressed AIP file, so there isn't anything for # it to do when pointing at an uncompressed AIP. if program == 'None': return 0 if program == '7z': archive_tool = '7-Zip' archive_tool_version = '9.20' # TODO get this dynamically elif program == 'pbzip2': archive_tool = program archive_tool_version = '1.1.6' # TODO get this dynamically # Format / file extension _, extension = os.path.splitext(aip_filename) # PRONOM ID and PRONOM name for each file extension pronom_conversion = { '.7z': { 'puid': 'fmt/484', 'name': '7Zip format' }, '.bz2': { 'puid': 'x-fmt/268', 'name': 'BZIP2 Compressed Archive' }, } num_files = 1 # Get size try: aip_size = os.path.getsize(aip_path) except os.error: print >> sys.stderr, "File {} does not exist or is inaccessible. 
Aborting.".format( aip_path) return -1 # Calculate checksum checksum_algorithm = 'sha256' checksum = checksummingTools.sha_for_file(aip_path) # Get package type (AIP, AIC) sip_metadata_uuid = '3e48343d-e2d2-4956-aaa3-b54d26eb9761' try: dc = DublinCore.objects.get(metadataappliestotype_id=sip_metadata_uuid, metadataappliestoidentifier=aip_uuid) except DublinCore.DoesNotExist: package_type = "Archival Information Package" else: package_type = dc.type # Namespaces nsmap = { # Default, unprefixed namespace None: namespaces.metsNS, 'xsi': namespaces.xsiNS, 'xlink': namespaces.xlinkNS, } # Set up structure E = ElementMaker(namespace=namespaces.metsNS, nsmap=nsmap) E_P = ElementMaker(namespace=namespaces.premisNS, nsmap={None: namespaces.premisNS}) root = ( E.mets( E.metsHdr(CREATEDATE=now), # amdSec goes here E.fileSec(E.fileGrp(USE='Archival Information Package'), ), E.structMap(TYPE='physical'), )) # Namespaced attributes have to be added separately - don't know how to do # inline with E root.attrib[namespaces.xsiBNS + 'schemaLocation'] = mets_schema_location add_amdsec_after = root.find('mets:metsHdr', namespaces=namespaces.NSMAP) filegrp = root.find('.//mets:fileGrp', namespaces=namespaces.NSMAP) structmap = root.find('.//mets:structMap', namespaces=namespaces.NSMAP) # For each file, add amdSec, file, fptr for admin_id in range(1, num_files + 1): # amdSec amdsec_id = 'amdSec_{}'.format(admin_id) amdsec = E.amdSec( E.techMD( E.mdWrap( E.xmlData(), MDTYPE='PREMIS:OBJECT', # mdWrap ), ID='techMD_1', # techMD ), ID=amdsec_id, # amdSec ) # Add PREMIS:OBJECT obj = E_P.object( E_P.objectIdentifier( E_P.objectIdentifierType('UUID'), E_P.objectIdentifierValue(aip_uuid), ), E_P.objectCharacteristics( E_P.compositionLevel('1'), E_P.fixity( E_P.messageDigestAlgorithm(checksum_algorithm), E_P.messageDigest(checksum), ), E_P.size(str(aip_size)), E_P.format( E_P.formatDesignation( E_P.formatName(pronom_conversion[extension]['name']), E_P.formatVersion(), ), E_P.formatRegistry( 
E_P.formatRegistryName('PRONOM'), E_P.formatRegistryKey( pronom_conversion[extension]['puid'])), ), E_P.creatingApplication( E_P.creatingApplicationName(archive_tool), E_P.creatingApplicationVersion(archive_tool_version), E_P.dateCreatedByApplication(now), ), ), version='2.2', ) obj.attrib[namespaces.xsiBNS + 'type'] = 'file' obj.attrib[namespaces.xsiBNS + 'schemaLocation'] = premis_schema_location # Add as child of xmldata amdsec.find('.//mets:mdWrap[@MDTYPE="PREMIS:OBJECT"]/mets:xmlData', namespaces=namespaces.NSMAP).append(obj) # Add PREMIS:EVENT for compression, use archivematicaCreateMETS2 code elements = archivematicaCreateMETS2.createDigiprovMD(aip_uuid) for element in elements: amdsec.append(element) # Add PREMIS:AGENT for Archivematica elements = archivematicaCreateMETS2.createDigiprovMDAgents() for element in elements: amdsec.append(element) # add amdSec after previous amdSec (or metsHdr if first one) add_amdsec_after.addnext(amdsec) add_amdsec_after = amdsec # fileGrp file_ = E.file(E.FLocat( LOCTYPE="OTHER", OTHERLOCTYPE="SYSTEM", ), ID=aip_identifier) filegrp.append(file_) flocat = file_.find('mets:FLocat', namespaces=namespaces.NSMAP) flocat.attrib['{{{ns}}}href'.format(ns=namespaces.xlinkNS)] = aip_path # compression - 7z or tar.bz2 if extension == '.7z': etree.SubElement(file_, "transformFile", TRANSFORMORDER='1', TRANSFORMTYPE='decompression', TRANSFORMALGORITHM=algorithm) elif extension == '.bz2': etree.SubElement(file_, "transformFile", TRANSFORMORDER='1', TRANSFORMTYPE='decompression', TRANSFORMALGORITHM='bzip2') etree.SubElement(file_, "transformFile", TRANSFORMORDER='2', TRANSFORMTYPE='decompression', TRANSFORMALGORITHM='tar') # structMap div = etree.SubElement(structmap, namespaces.metsBNS + 'div', ADMID=amdsec_id, TYPE=package_type) etree.SubElement(div, namespaces.metsBNS + 'fptr', FILEID=aip_identifier) print etree.tostring(root, pretty_print=True) # Write out pointer.xml xml_filename = 'pointer.xml' filename = 
os.path.join(os.path.dirname(aip_path), xml_filename) with open(filename, 'w') as f: f.write(etree.tostring(root, pretty_print=True)) fileOperations.addFileToSIP( filePathRelativeToSIP='%SIPDirectory%' + xml_filename, fileUUID=str(uuid.uuid4()), sipUUID=aip_uuid, taskUUID=str(uuid.uuid4()), # Unsure what should go here date=now, sourceType="aip creation", ) return 0