def updateSizeAndChecksum(fileUUID, filePath, date, eventIdentifierUUID,
                          fileSize=None, checksum=None, checksumType=None,
                          add_event=True):
    """Update a File with its size, checksum and checksum type.

    These are parameters that can be either generated or provided via
    keywords.  Finally, insert the corresponding Event; this behavior can
    be cancelled using the boolean keyword 'add_event'.

    :param str fileUUID: UUID of the File row to update.
    :param str filePath: Path on disk, used to compute any missing values.
    :param str date: Date/time recorded on the inserted Event.
    :param str eventIdentifierUUID: Identifier recorded on the inserted Event.
    :param int fileSize: Size in bytes; computed from filePath when falsy.
    :param str checksum: Digest value; computed from filePath when falsy.
    :param str checksumType: Digest algorithm; falls back to the
        'checksum_type' setting (default 'sha256') when falsy.
    :param bool add_event: Whether to insert the Event row at all.
    """
    if not fileSize:
        fileSize = os.path.getsize(filePath)
    if not checksumType:
        checksumType = get_setting('checksum_type', 'sha256')
    if not checksum:
        checksum = get_file_checksum(filePath, checksumType)

    File.objects.filter(uuid=fileUUID).update(
        size=fileSize, checksum=checksum, checksumtype=checksumType)

    if add_event:
        insertIntoEvents(
            fileUUID=fileUUID,
            # Fix: the caller-supplied event identifier was previously
            # ignored, so the Event row got a different, generated UUID.
            eventIdentifierUUID=eventIdentifierUUID,
            eventType='message digest calculation',
            eventDateTime=date,
            eventDetail='program="python"; module="hashlib.{}()"'.format(
                checksumType),
            eventOutcomeDetailNote=checksum)
def verifyMetsFileSecChecksums(job, metsFile, date, taskUUID, relativeDirectory="./"):
    """Verify each file listed in a METS fileSec against the file on disk.

    Walks fileSec/fileGrp/file entries, recomputes the checksum of every
    FLocat target using the CHECKSUMTYPE declared in the METS, and reports
    mismatches via the job's error output.

    :param job: Job object used for output and error reporting.
    :param str metsFile: Path to the METS XML document to verify.
    :param str date: Unused here; kept for interface compatibility.
    :param str taskUUID: Unused here; kept for interface compatibility.
    :param str relativeDirectory: Base directory that FLocat hrefs are
        resolved against.
    :returns: 0 when all checksums match; 300 immediately on an
        unsupported checksum type; otherwise 22 added per mismatch.
    """
    job.print_output(metsFile)
    exitCode = 0
    tree = etree.parse(metsFile)
    root = tree.getroot()
    for item in root.findall("{http://www.loc.gov/METS/}fileSec/{http://www.loc.gov/METS/}fileGrp/{http://www.loc.gov/METS/}file"):
        checksum = item.get("CHECKSUM")
        checksumType = item.get('CHECKSUMTYPE', '').lower()
        for item2 in item:
            if item2.tag == "{http://www.loc.gov/METS/}FLocat":
                fileLocation = item2.get("{http://www.w3.org/1999/xlink}href")
                fileFullPath = os.path.join(relativeDirectory, fileLocation)
                # Fix: hashlib.algorithms was removed in Python 3.2;
                # algorithms_available is the supported equivalent.
                if checksumType and checksumType in hashlib.algorithms_available:
                    checksum2 = get_file_checksum(fileFullPath, checksumType)
                else:
                    job.pyprint("Unsupported checksum type: %s" % (str(checksumType)),
                                file=sys.stderr)
                    return 300
                if checksum != checksum2:
                    eventOutcome = "Fail"
                    # %s applies str() implicitly; explicit __str__() calls removed.
                    job.print_output("%s - %s - %s" % (
                        checksum == checksum2, checksum, checksum2))
                    job.print_error(eventOutcome, fileFullPath)
                    exitCode += 22
                else:
                    eventOutcome = "Pass"
                    job.print_output(eventOutcome, fileLocation)
    return exitCode
def verify_checksum(
    job, file_uuid, path, checksum, checksumtype, event_id=None, date=None
):
    """Verify the checksum of a given file, and create a fixity event.

    :param str file_uuid: UUID of the file to verify
    :param str path: Path of the file to verify
    :param str checksum: Checksum to compare against
    :param str checksumtype: Type of the provided checksum (md5, sha256, etc)
    :param str event_id: Event ID
    :param str date: Date of the event
    """
    # Fill in defaults for the event identifier and timestamp.
    if event_id is None:
        event_id = str(uuid.uuid4())
    if date is None:
        date = timezone.now().isoformat(" ")

    algorithm = checksumtype.lower()
    computed = get_file_checksum(path, algorithm)

    # Record whether the recomputed digest matches the provided one.
    if checksum == computed:
        job.pyprint("Checksum passed")
        event_outcome = "Pass"
        detail_note = "Dataverse checksum %s verified" % checksum
    else:
        job.pyprint("Checksum failed")
        event_outcome = "Fail"
        detail_note = "Dataverse checksum %s verification failed" % checksum

    databaseFunctions.insertIntoEvents(
        fileUUID=file_uuid,
        eventIdentifierUUID=event_id,
        eventType="fixity check",
        eventDateTime=date,
        eventDetail='program="python"; module="hashlib.{}()"'.format(algorithm),
        eventOutcome=event_outcome,
        eventOutcomeDetailNote=detail_note,
    )
def verifyChecksum(fileUUID, filePath, date, eventIdentifierUUID):
    """Compare a file's on-disk checksum against the one stored on its File row.

    Records a 'fixity check' Event and terminates the process: exit status 0
    on a match, 1 when the database holds no checksum, 2 on a mismatch.
    """
    file_obj = File.objects.get(uuid=fileUUID)

    # Guard: nothing to verify against.
    if file_obj.checksum in ('', 'None'):
        print('No checksum found in database for file:', fileUUID, filePath,
              file=sys.stderr)
        exit(1)

    computed = get_file_checksum(filePath, file_obj.checksumtype)

    if computed == file_obj.checksum:
        event_outcome = 'Pass'
        detail_note = '%s %s' % (str(computed), 'verified')
        exit_code = 0
    else:
        event_outcome = 'Fail'
        detail_note = str(computed) + ' != ' + file_obj.checksum
        exit_code = 2
        print('Checksums do not match:', fileUUID, filePath, file=sys.stderr)
        print(detail_note, file=sys.stderr)

    databaseFunctions.insertIntoEvents(
        fileUUID=fileUUID,
        eventIdentifierUUID=str(uuid.uuid4()),
        eventType='fixity check',
        eventDateTime=date,
        eventOutcome=event_outcome,
        eventOutcomeDetailNote=detail_note,
        eventDetail='program="python"; module="hashlib.{}()"'.format(
            file_obj.checksumtype))
    exit(exit_code)
def call(jobs):
    """Verify transfer files against their sidecar '<name>_Metadata.xml' MD5 values.

    For every regular file inside each top-level directory of the transfer,
    parse the matching metadata XML, read its Document/MD5 element, and
    compare it against the file's computed MD5.  A 'fixity check' Event is
    inserted for each matching file; each parse failure or mismatch adds one
    to the job's exit status.
    """
    with transaction.atomic():
        for job in jobs:
            with job.JobContext():
                # job.args[2] (transferName) is unused.
                transferUUID = job.args[1]
                transferPath = job.args[3]
                date = job.args[4]
                exitCode = 0
                for transfer_dir in os.listdir(transferPath):
                    dirPath = os.path.join(transferPath, transfer_dir)
                    if not os.path.isdir(dirPath):
                        continue
                    for transfer_file in os.listdir(dirPath):
                        filePath = os.path.join(dirPath, transfer_file)
                        # Skip the metadata files themselves and anything
                        # that is not a regular file.
                        if transfer_file == 'ContainerMetadata.xml' or transfer_file.endswith(
                            'Metadata.xml'
                        ) or not os.path.isfile(filePath):
                            continue
                        # Sidecar name: replace the extension (if any) with
                        # '_Metadata.xml'.
                        i = transfer_file.rfind('.')
                        if i != -1:
                            xmlFile = transfer_file[:i] + '_Metadata.xml'
                        else:
                            xmlFile = transfer_file + '_Metadata.xml'
                        xmlFilePath = os.path.join(dirPath, xmlFile)
                        try:
                            tree = etree.parse(xmlFilePath)
                            root = tree.getroot()
                            xmlMD5 = root.find('Document/MD5').text
                        # Fix: was a bare 'except:', which also swallowed
                        # SystemExit/KeyboardInterrupt.  Best-effort skip of
                        # unparseable/missing metadata is preserved.
                        except Exception:
                            job.pyprint('Error parsing: ', xmlFilePath,
                                        file=sys.stderr)
                            exitCode += 1
                            continue
                        objectMD5 = get_file_checksum(filePath, 'md5')
                        if objectMD5 == xmlMD5:
                            job.pyprint(
                                'File OK: ', xmlMD5,
                                filePath.replace(transferPath,
                                                 '%TransferDirectory%'))
                            fileID = getFileUUIDLike(filePath, transferPath,
                                                     transferUUID, 'transfer',
                                                     '%transferDirectory%')
                            for path, fileUUID in fileID.items():
                                databaseFunctions.insertIntoEvents(
                                    fileUUID=fileUUID,
                                    eventIdentifierUUID=str(uuid.uuid4()),
                                    eventType='fixity check',
                                    eventDateTime=date,
                                    eventOutcome='Pass',
                                    eventOutcomeDetailNote='%s %s' % (
                                        str(xmlFile), 'verified'),
                                    eventDetail='program="python"; module="hashlib.md5()"')
                        else:
                            job.pyprint('Checksum mismatch: ',
                                        filePath.replace(
                                            transferPath,
                                            '%TransferDirectory%'),
                                        file=sys.stderr)
                            exitCode += 1
                job.set_status(exitCode)
def main(aip_uuid, aip_name, compression, sip_dir, aip_filename):
    """Build and write the METS pointer file for a compressed AIP.

    Creates a METS document describing the packaged AIP (PREMIS object with
    fixity/size/format, compression event, transformFile decompression
    instructions), writes it as 'pointer.xml' next to the AIP, and registers
    the new file in the SIP.

    :param str aip_uuid: UUID of the AIP/SIP.
    :param str aip_name: Human-readable AIP name (combined with the UUID to
        form the METS file/fptr identifier).
    :param str compression: '<program>-<algorithm>' string, e.g. '7z-...'
        or 'pbzip2-...'; 'None-...' means uncompressed.
    :param str sip_dir: Directory containing the packaged AIP.
    :param str aip_filename: Filename of the packaged AIP within sip_dir.
    :returns: 0 on success (or when no pointer file is needed), -1 when the
        AIP file cannot be stat'd.
    """
    # Prep work
    mets_schema_location = 'http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version18/mets.xsd'
    premis_schema_location = 'info:lc/xmlns/premis-v2 http://www.loc.gov/standards/premis/v2/premis-v2-2.xsd'
    # Datetime format string from http://docs.python.org/2/library/datetime.html
    # %Y = 4 digit year, %m = 2 digit month, %d = 2 digit day
    # %H = 24-hour hour, %M = 2-digit minute, %S = 2 digit second
    now = timezone.now().strftime("%Y-%m-%dT%H:%M:%S")
    aip_identifier = aip_name+'-'+aip_uuid
    aip_path = os.path.join(sip_dir, aip_filename)
    # Get archive tool and version
    program, algorithm = compression.split('-')
    # Pointer files are not written for uncompressed AIPs;
    # the purpose of the pointer file is primarily to provide information
    # on how to read a compressed AIP file, so there isn't anything for
    # it to do when pointing at an uncompressed AIP.
    if program == 'None':
        return 0
    # NOTE(review): a compression program other than '7z'/'pbzip2' leaves
    # archive_tool/archive_tool_version unbound and would raise NameError
    # below -- confirm callers only pass these two programs.
    if program == '7z':
        archive_tool = '7-Zip'
        archive_tool_version = '9.20'  # TODO get this dynamically
    elif program == 'pbzip2':
        archive_tool = program
        archive_tool_version = '1.1.6'  # TODO get this dynamically
    # Format / file extension
    _, extension = os.path.splitext(aip_filename)
    # PRONOM ID and PRONOM name for each file extension
    # NOTE(review): extensions outside this mapping raise KeyError in the
    # PREMIS format section below -- confirm only .7z/.bz2 AIPs reach here.
    pronom_conversion = {
        '.7z': {'puid': 'fmt/484', 'name': '7Zip format'},
        '.bz2': {'puid': 'x-fmt/268', 'name': 'BZIP2 Compressed Archive'},
    }
    num_files = 1
    # Get size
    try:
        aip_size = os.path.getsize(aip_path)
    except os.error:
        print("File {} does not exist or is inaccessible. Aborting.".format(aip_path), file=sys.stderr)
        return -1
    # Calculate checksum
    checksum_algorithm = get_setting('checksum_type', 'sha256')
    checksum = get_file_checksum(aip_path, checksum_algorithm)
    # Get package type (AIP, AIC)
    sip_metadata_uuid = '3e48343d-e2d2-4956-aaa3-b54d26eb9761'
    try:
        dc = DublinCore.objects.get(metadataappliestotype_id=sip_metadata_uuid,
                                    metadataappliestoidentifier=aip_uuid)
    except DublinCore.DoesNotExist:
        package_type = "Archival Information Package"
    else:
        package_type = dc.type
    # Namespaces
    nsmap = {
        # Default, unprefixed namespace
        'mets': namespaces.metsNS,
        'xsi': namespaces.xsiNS,
        'xlink': namespaces.xlinkNS,
    }
    # Set up structure
    E = ElementMaker(namespace=namespaces.metsNS, nsmap=nsmap)
    E_P = ElementMaker(namespace=namespaces.premisNS,
                       nsmap={'premis': namespaces.premisNS})
    root = (
        E.mets(
            E.metsHdr(CREATEDATE=now),
            # amdSec goes here
            E.fileSec(
                E.fileGrp(USE='Archival Information Package'),
            ),
            E.structMap(
                TYPE='physical'
            ),
        )
    )
    # Namespaced attributes have to be added separately - don't know how to do
    # inline with E
    root.attrib[namespaces.xsiBNS+'schemaLocation'] = mets_schema_location
    add_amdsec_after = root.find('mets:metsHdr', namespaces=namespaces.NSMAP)
    filegrp = root.find('.//mets:fileGrp', namespaces=namespaces.NSMAP)
    structmap = root.find('.//mets:structMap', namespaces=namespaces.NSMAP)
    # For each file, add amdSec, file, fptr
    for admin_id in range(1, num_files+1):
        # amdSec
        amdsec_id = 'amdSec_{}'.format(admin_id)
        amdsec = E.amdSec(
            E.techMD(
                E.mdWrap(
                    E.xmlData(
                    ),
                    MDTYPE='PREMIS:OBJECT',  # mdWrap
                ),
                ID='techMD_1',  # techMD
            ),
            ID=amdsec_id,  # amdSec
        )
        # Add PREMIS:OBJECT
        obj = E_P.object(
            E_P.objectIdentifier(
                E_P.objectIdentifierType('UUID'),
                E_P.objectIdentifierValue(aip_uuid),
            ),
            E_P.objectCharacteristics(
                E_P.compositionLevel('1'),
                E_P.fixity(
                    E_P.messageDigestAlgorithm(checksum_algorithm),
                    E_P.messageDigest(checksum),
                ),
                E_P.size(str(aip_size)),
                E_P.format(
                    E_P.formatDesignation(
                        E_P.formatName(
                            pronom_conversion[extension]['name']),
                        E_P.formatVersion(),
                    ),
                    E_P.formatRegistry(
                        E_P.formatRegistryName('PRONOM'),
                        E_P.formatRegistryKey(
                            pronom_conversion[extension]['puid'])
                    ),
                ),
                E_P.creatingApplication(
                    E_P.creatingApplicationName(archive_tool),
                    E_P.creatingApplicationVersion(archive_tool_version),
                    E_P.dateCreatedByApplication(now),
                ),
            ),
            version='2.2',
        )
        obj.attrib[namespaces.xsiBNS+'type'] = 'premis:file'
        obj.attrib[namespaces.xsiBNS+'schemaLocation'] = premis_schema_location
        # Add as child of xmldata
        amdsec.find('.//mets:mdWrap[@MDTYPE="PREMIS:OBJECT"]/mets:xmlData',
                    namespaces=namespaces.NSMAP).append(obj)
        # Add PREMIS:EVENT for compression & PREMIS:AGENTs
        # use archivematicaCreateMETS2 code
        elements = archivematicaCreateMETS2.createDigiprovMD(aip_uuid)
        for element in elements:
            amdsec.append(element)
        # add amdSec after previous amdSec (or metsHdr if first one)
        add_amdsec_after.addnext(amdsec)
        add_amdsec_after = amdsec
        # fileGrp
        file_ = E.file(
            E.FLocat(
                LOCTYPE="OTHER",
                OTHERLOCTYPE="SYSTEM",
            ),
            ID=aip_identifier
        )
        filegrp.append(file_)
        flocat = file_.find('mets:FLocat', namespaces=namespaces.NSMAP)
        flocat.attrib['{{{ns}}}href'.format(ns=namespaces.xlinkNS)] = aip_path
        # compression - 7z or tar.bz2
        if extension == '.7z':
            etree.SubElement(file_, namespaces.metsBNS + "transformFile",
                             TRANSFORMORDER='1',
                             TRANSFORMTYPE='decompression',
                             TRANSFORMALGORITHM=algorithm)
        elif extension == '.bz2':
            # .bz2 AIPs are tarballs: undo bzip2 first, then untar.
            etree.SubElement(file_, namespaces.metsBNS + "transformFile",
                             TRANSFORMORDER='1',
                             TRANSFORMTYPE='decompression',
                             TRANSFORMALGORITHM='bzip2')
            etree.SubElement(file_, namespaces.metsBNS + "transformFile",
                             TRANSFORMORDER='2',
                             TRANSFORMTYPE='decompression',
                             TRANSFORMALGORITHM='tar')
        # structMap
        div = etree.SubElement(structmap, namespaces.metsBNS+'div',
                               ADMID=amdsec_id, TYPE=package_type)
        etree.SubElement(div, namespaces.metsBNS+'fptr',
                         FILEID=aip_identifier)
    print(etree.tostring(root, pretty_print=True))
    # Write out pointer.xml
    xml_filename = 'pointer.xml'
    filename = os.path.join(os.path.dirname(aip_path), xml_filename)
    # NOTE(review): under Python 3, etree.tostring(..., encoding='utf-8')
    # returns bytes, which cannot be written to a text-mode ('w') file --
    # confirm this module runs under Python 2, or open in 'wb'.
    with open(filename, 'w') as f:
        f.write(etree.tostring(root, pretty_print=True, xml_declaration=True,
                               encoding='utf-8'))
    fileOperations.addFileToSIP(
        filePathRelativeToSIP='%SIPDirectory%'+xml_filename,
        fileUUID=str(uuid.uuid4()),
        sipUUID=aip_uuid,
        taskUUID=str(uuid.uuid4()),  # Unsure what should go here
        date=now,
        sourceType="aip creation",
    )
    return 0
if i != -1: xmlFile = transfer_file[:i] + '_Metadata.xml' else: xmlFile = transfer_file + '_Metadata.xml' xmlFilePath = os.path.join(dirPath, xmlFile) try: tree = etree.parse(xmlFilePath) root = tree.getroot() xmlMD5 = root.find('Document/MD5').text except: print('Error parsing: ', xmlFilePath, file=sys.stderr) exitCode += 1 continue objectMD5 = get_file_checksum(filePath, 'md5') if objectMD5 == xmlMD5: print('File OK: ', xmlMD5, filePath.replace(transferPath, '%TransferDirectory%')) fileID = getFileUUIDLike(filePath, transferPath, transferUUID, 'transfer', '%transferDirectory%') for path, fileUUID in fileID.items(): eventDetail = 'program="python"; module="hashlib.md5()"' eventOutcome = 'Pass' eventOutcomeDetailNote = '%s %s' % (xmlFile.__str__(), 'verified') eventIdentifierUUID = uuid.uuid4().__str__() databaseFunctions.insertIntoEvents( fileUUID=fileUUID, eventIdentifierUUID=eventIdentifierUUID, eventType='fixity check',