def find_files(xmlfile, rootdir='', prefix='', skip_files=None):
    """Collect the file entries referenced by an XML document.

    Every element whose local name is a key of ``FILE_ELEMENTS`` is wrapped
    in an ``XMLFileElement``.  Documents reported by ``find_pointers`` are
    added as well and are parsed in turn.

    Args:
        xmlfile: XML document to parse.
        rootdir: Passed to ``XMLFileElement`` and used to locate the
            documents that pointers refer to.
        prefix: Path joined in front of every file path found directly in
            ``xmlfile``.
        skip_files: Paths that must not be included in the result.

    Returns:
        A set of the collected file elements and pointers.
    """
    excluded = [] if skip_files is None else skip_files
    tree = etree.ElementTree(file=xmlfile)
    found = set()

    for tag, props in six.iteritems(FILE_ELEMENTS):
        matches = tree.xpath('.//*[local-name()="%s"]' % tag)

        # Remove first object in premis file if it is a "fake" entry
        # describing the tar
        if len(matches) > 0:
            head = matches[0]
            if head.get('{%s}type' % XSI_NAMESPACE) == 'premis:file':
                tar_markers = head.xpath(
                    './/*[local-name()="formatName"][. = "TAR"]')
                if len(tar_markers) > 0:
                    del matches[0]

        for node in matches:
            entry = XMLFileElement(node, props, rootdir=rootdir)
            entry.path = win_to_posix(os.path.join(prefix, entry.path))
            if entry.path not in excluded:
                found.add(entry)

    for pointer in find_pointers(xmlfile=xmlfile):
        nested_prefix = os.path.dirname(pointer.path)

        if pointer.path not in excluded:
            found.add(pointer)

        # The referenced document is always parsed, even when the pointer
        # itself is excluded from the result.
        nested_xml = os.path.join(rootdir, pointer.path)
        found.update(find_files(nested_xml, rootdir, nested_prefix))

    return found
def parse_file(filepath, fid, relpath=None, algorithm='SHA-256', rootdir='', provided_data=None):
    """Build the metadata dictionary describing a single file.

    Args:
        filepath: Path of the file to inspect.
        fid: Object providing ``get_mimetype``, ``identify_file_encryption``
            and ``identify_file_format``.
        relpath: Path stored as ``href`` and used for the name and
            extension; falls back to ``filepath`` when empty.
        algorithm: Checksum algorithm, stored as ``FChecksumType``.
        rootdir: Stored as ``FDir``.
        provided_data: Precomputed values.  Keys present here suppress the
            matching expensive computation and always override the
            computed entries.

    Returns:
        A dict of file information keyed by the ``F*`` field names.
    """
    precomputed = {} if provided_data is None else provided_data
    href = win_to_posix(relpath if relpath else filepath)

    info = {
        'FName': os.path.basename(href),
        'FExtension': os.path.splitext(href)[1][1:],
        'FDir': rootdir,
        'FParentDir': os.path.basename(os.path.dirname(filepath)),
        'FID': str(uuid.uuid4()),
        'daotype': "borndigital",
        'href': href,
        'FMimetype': fid.get_mimetype(filepath),
        'FSize': str(os.path.getsize(filepath)),
        'FUse': 'Datafile',
        'FChecksumType': algorithm,
        'FLoctype': 'URL',
        'FLinkType': 'simple',
        'FChecksumLib': 'ESSArch',
        'FIDType': 'UUID',
    }

    # Heavy computations are skipped whenever the caller already supplied
    # the corresponding value.
    if 'FCreated' not in precomputed:
        created = timestamp_to_datetime(creation_date(filepath))
        info['FCreated'] = created.isoformat()

    if 'FChecksum' not in precomputed:
        info['FChecksum'] = checksum.calculate_checksum(filepath, algorithm)

    if 'FEncrypted' not in precomputed:
        info['FEncrypted'] = fid.identify_file_encryption(filepath)

    format_keys = ('FFormatName', 'FFormatVersion', 'FFormatRegistryKey')
    if not all(key in precomputed for key in format_keys):
        # Identification yields all three values at once, so it runs as
        # soon as any one of them is missing.
        name, version, registry_key = fid.identify_file_format(filepath)
        info['FFormatName'] = name
        info['FFormatVersion'] = version
        info['FFormatRegistryKey'] = registry_key

    # Caller-supplied values win over anything computed above.
    info.update(precomputed.items())

    return info
def find_files(xmlfile, rootdir='', prefix='', skip_files=None, recursive=True, current_dir=None):
    """Collect the file entries referenced by an XML document.

    Every element whose local name is a key of ``FILE_ELEMENTS`` is wrapped
    in an ``XMLFileElement``.  When ``recursive`` is true, documents
    reported by ``find_pointers`` are added and parsed as well.

    Args:
        xmlfile: XML document to parse.
        rootdir: Passed to ``XMLFileElement``; prefixes of nested documents
            are computed relative to it.
        prefix: Path joined in front of every path found directly in
            ``xmlfile``.
        skip_files: Paths that must not be included in the result.
        recursive: Whether documents referenced by pointers are parsed too.
        current_dir: Directory that pointer paths in ``xmlfile`` are
            relative to; defaults to ``rootdir``.

    Returns:
        A set of the collected file elements and pointers.
    """
    doc = etree.ElementTree(file=xmlfile)
    files = set()

    if skip_files is None:
        skip_files = []

    if current_dir is None:
        current_dir = rootdir

    for elname, props in FILE_ELEMENTS.items():
        file_elements = doc.xpath('.//*[local-name()="%s"]' % elname)

        # Remove first object in premis file if it is a "fake" entry
        # describing the tar
        if file_elements and file_elements[0].get('{%s}type' % XSI_NAMESPACE) == 'premis:file':
            # In XPath 1 we use translate() to make a case insensitive
            # comparison
            xpath_upper = 'translate(.,"abcdefghijklmnopqrstuvwxyz","ABCDEFGHIJKLMNOPQRSTUVWXYZ")'
            xpath_query = './/*[local-name()="formatName"][{up} = "TAR" or {up} = "ZIP"]'.format(up=xpath_upper)
            if file_elements[0].xpath(xpath_query):
                file_elements.pop(0)

        for el in file_elements:
            file_el = XMLFileElement(el, props, rootdir=rootdir)
            file_el.path = win_to_posix(os.path.join(prefix, file_el.path))

            if file_el.path in skip_files:
                continue

            files.add(file_el)

    if recursive:
        for pointer in find_pointers(xmlfile=xmlfile):
            # Per-pointer locals: every pointer is relative to the directory
            # of *this* document, so ``current_dir`` and ``prefix`` must not
            # be overwritten while iterating over sibling pointers.
            pointer_dir = os.path.join(current_dir, os.path.dirname(pointer.path))
            pointer_path = os.path.join(pointer_dir, os.path.basename(pointer.path))

            if pointer.path not in skip_files:
                pointer.path = os.path.join(prefix, pointer.path)
                files.add(pointer)

            # os.path.relpath() rejects an empty path, which happens when
            # both ``current_dir`` and the pointer's directory are empty.
            pointer_prefix = os.path.relpath(pointer_dir or os.curdir, rootdir)
            if pointer_prefix == '.':
                pointer_prefix = ''

            # The referenced document is parsed even when the pointer itself
            # is excluded from the result.
            files |= find_files(
                pointer_path,
                rootdir,
                pointer_prefix,
                recursive=recursive,
                current_dir=pointer_dir,
            )

    return files
def run(self, dirname=None, files=None, files_reldir=None, xmlfile=None):
    """Check that the files listed in an XML document match those on disk.

    Args:
        dirname: Directory walked to collect the physical files.  The XML
            document itself is left out of that set.
        files: Additional physical file paths.
        files_reldir: When set, each entry of ``files`` is made relative to
            this directory before it is compared.
        xmlfile: XML document listing the logical files.

    Returns:
        The string ``"Success"`` when both sets are equal.

    Raises:
        AssertionError: If the logical and physical sets differ.
    """
    if files is None:
        files = []

    if dirname:
        # Normalised the same way as the walked paths it is compared with.
        xmlrelpath = os.path.relpath(xmlfile, dirname)
        xmlrelpath = win_to_posix(xmlrelpath)
        xmlrelpath = remove_prefix(xmlrelpath, "./")
    else:
        xmlrelpath = xmlfile

    doc = etree.ElementTree(file=xmlfile)

    logical_files = set()
    physical_files = set()

    for elname, props in settings.FILE_ELEMENTS.items():
        for el in doc.xpath('.//*[local-name()="%s"]' % elname):
            filename = get_value_from_path(el, props["path"])

            if filename:
                filename = remove_prefix(filename, props.get("pathprefix", ""))
                logical_files.add(filename)

    if dirname:
        for current, _dirs, filenames in os.walk(dirname):
            reldir = os.path.relpath(current, dirname)

            for name in filenames:
                relfile = os.path.join(reldir, name)
                relfile = win_to_posix(relfile)
                relfile = remove_prefix(relfile, "./")

                # Compare full relative paths: a bare file name only matches
                # the XML document when it sits directly in ``dirname`` and
                # would also hide same-named files in subdirectories.
                if relfile != xmlrelpath:
                    physical_files.add(relfile)

    for f in files:
        if files_reldir:
            f = os.path.relpath(f, files_reldir)
        physical_files.add(f)

    # Raised explicitly so the check is not stripped when running with -O.
    if not (logical_files == physical_files):
        raise AssertionError("the logical representation differs from the physical")

    self.set_progress(100, total=100)
    return "Success"
def run(self, dirname=None, files=None, files_reldir=None, xmlfile=None, rootdir=""):
    """Check that the files listed in an XML document match those on disk.

    Args:
        dirname: Directory walked to collect the physical files.  The XML
            document itself is left out of that set.
        files: Additional physical file paths.
        files_reldir: When set, each entry of ``files`` is made relative to
            this directory; an entry equal to it is added by base name.
        xmlfile: XML document passed to ``find_files``.
        rootdir: Passed to ``find_files``.

    Returns:
        The string ``"Success"`` when both sets are equal.

    Raises:
        AssertionError: If the logical and physical sets differ.
    """
    if files is None:
        files = []

    if dirname:
        # Normalised the same way as the walked paths it is compared with.
        xmlrelpath = os.path.relpath(xmlfile, dirname)
        xmlrelpath = win_to_posix(xmlrelpath)
        xmlrelpath = remove_prefix(xmlrelpath, "./")
    else:
        xmlrelpath = xmlfile

    logical_files = find_files(xmlfile, rootdir)
    physical_files = set()

    if dirname:
        for root, _dirs, filenames in walk(dirname):
            # Invariant for every file in this directory.
            reldir = os.path.relpath(root, dirname)

            for f in filenames:
                relfile = os.path.join(reldir, f)
                relfile = win_to_posix(relfile)
                relfile = remove_prefix(relfile, "./")

                if relfile != xmlrelpath:
                    physical_files.add(relfile)

    for f in files:
        if files_reldir:
            if f == files_reldir:
                physical_files.add(os.path.basename(f))
                continue

            f = os.path.relpath(f, files_reldir)
        physical_files.add(f)

    # Raised explicitly so the check is not stripped when running with -O.
    if not (logical_files == physical_files):
        raise AssertionError("the logical representation differs from the physical")

    return "Success"
def __init__(self, *args, **kwargs):
    """Set up the lookup tables derived from the logical files.

    Reads ``rootdir``, ``recursive`` and ``default_algorithm`` from
    ``self.options``, calls ``self._get_files()`` and then indexes every
    entry of ``self.logical_files`` by checksum and by path.

    Raises:
        ValueError: If no context (the XML document) was given.
    """
    super().__init__(*args, **kwargs)

    if not self.context:
        raise ValueError('A context (xml) is required')

    self.context = normalize_path(self.context)
    self.rootdir = self.options.get('rootdir')
    self.recursive = self.options.get('recursive', True)
    self.default_algorithm = self.options.get('default_algorithm', 'SHA-256')

    self.initial_present = {}  # checksum -> list of file names
    self.initial_deleted = {}  # checksum -> list of file names
    self.sizes = {}  # file name -> size
    self.checksums = {}  # file name -> checksum
    self.checksum_algorithms = {}  # file name -> checksum algorithm
    self._get_files()

    for entry in self.logical_files:
        # NOTE(review): os.path.join() with a single argument does not join
        # anything, so both branches yield the same path for a string --
        # confirm whether rootdir was meant to be part of it.
        if self.rootdir is None:
            raw_path = entry.path
        else:
            raw_path = os.path.join(entry.path)
        path = win_to_posix(raw_path)

        # Several files may share one checksum, hence the lists.
        self.initial_deleted.setdefault(entry.checksum, []).append(path)
        self.initial_present.setdefault(entry.checksum, []).append(path)

        self.checksums[path] = entry.checksum
        self.checksum_algorithms[path] = entry.checksum_type
        self.sizes[path] = entry.size
def run(self, filepath=None, mimetype=None, relpath=None, algorithm='SHA-256', rootdir=''):
    """Build the metadata dictionary describing a single file.

    The checksum and the file format are obtained by creating and running
    two ``ProcessTask`` objects tied to this task's step, responsible and
    information package.

    Args:
        filepath: Path of the file to inspect.
        mimetype: Stored as ``FMimetype``.
        relpath: Path stored as ``href``; falls back to ``filepath``.
        algorithm: Checksum algorithm handed to the checksum task.
        rootdir: Stored as ``FDir``.

    Returns:
        A dict of file information keyed by the ``F*`` field names.
    """
    href = win_to_posix(relpath if relpath else filepath)
    created = timestamp_to_datetime(creation_date(filepath))

    # Both sub-tasks belong to the same step, user and package.
    ownership = {
        'processstep_id': self.step,
        'responsible_id': self.responsible,
        'information_package_id': self.ip,
    }

    checksum_task = ProcessTask(
        name="ESSArch_Core.tasks.CalculateChecksum",
        params={"filename": filepath, "algorithm": algorithm},
        **ownership
    )
    fileformat_task = ProcessTask(
        name="ESSArch_Core.tasks.IdentifyFileFormat",
        params={"filename": filepath},
        **ownership
    )
    ProcessTask.objects.bulk_create([checksum_task, fileformat_task])

    checksum = checksum_task.run().get()
    self.set_progress(50, total=100)

    format_info = fileformat_task.run().get()
    (format_name, format_version, format_registry_key) = format_info

    return {
        'FName': os.path.basename(href),
        'FDir': rootdir,
        'FChecksum': checksum,
        'FID': str(uuid.uuid4()),
        'daotype': "borndigital",
        'href': href,
        'FMimetype': mimetype,
        'FCreated': created.isoformat(),
        'FFormatName': format_name,
        'FFormatVersion': format_version,
        'FFormatRegistryKey': format_registry_key,
        'FSize': str(os.path.getsize(filepath)),
        'FUse': 'Datafile',
        'FChecksumType': algorithm,
        'FLoctype': 'URL',
        'FLinkType': 'simple',
        'FChecksumLib': 'hashlib',
        'FLocationType': 'URI',
        'FIDType': 'UUID',
    }
def normalize_paths(self, expected_file_names):
    """Return a list with ``win_to_posix`` applied to every given name."""
    normalized = []
    for name in expected_file_names:
        normalized.append(win_to_posix(name))
    return normalized
def run(self, filepath=None, mimetype=None, relpath=None, algorithm='SHA-256'):
    """Build the metadata dictionary describing a single file.

    The checksum and the file format are obtained by creating and eagerly
    running two ``ProcessTask`` objects.  When this task has a task object,
    its log, information package, responsible and (if set) process step are
    copied onto both sub-tasks.

    Args:
        filepath: Path of the file to inspect.
        mimetype: Stored as ``FMimetype``.
        relpath: Path stored as ``href``; falls back to ``filepath``.
        algorithm: Checksum algorithm handed to the checksum task.

    Returns:
        A dict of file information keyed by the ``F*`` field names.
    """
    if not relpath:
        relpath = filepath

    relpath = win_to_posix(relpath)

    timestamp = creation_date(filepath)
    createdate = timestamp_to_datetime(timestamp)

    checksum_task = ProcessTask.objects.create(
        name="preingest.tasks.CalculateChecksum",
        params={
            "filename": filepath,
            "algorithm": algorithm
        }
    )

    fileformat_task = ProcessTask.objects.create(
        name="preingest.tasks.IdentifyFileFormat",
        params={
            "filename": filepath,
        }
    )

    # Guard every access: the task object used to be dereferenced before it
    # was checked for None, which made the check pointless.
    taskobj = self.taskobj
    if taskobj is not None:
        for subtask in (checksum_task, fileformat_task):
            subtask.log = taskobj.log
            subtask.information_package = taskobj.information_package
            subtask.responsible = taskobj.responsible

            if taskobj.processstep is not None:
                subtask.processstep = taskobj.processstep

    checksum_task.save()
    fileformat_task.save()

    checksum = checksum_task.run_eagerly()
    self.set_progress(50, total=100)
    fileformat = fileformat_task.run_eagerly()

    fileinfo = {
        'FName': os.path.basename(relpath),
        'FChecksum': checksum,
        'FID': str(uuid.uuid4()),
        'daotype': "borndigital",
        'href': relpath,
        'FMimetype': mimetype,
        'FCreated': createdate.isoformat(),
        'FFormatName': fileformat,
        'FSize': str(os.path.getsize(filepath)),
        'FUse': 'Datafile',
        'FChecksumType': algorithm,
        'FLoctype': 'URL',
        'FLinkType': 'simple',
        'FChecksumLib': 'hashlib',
        'FLocationType': 'URI',
        'FIDType': 'UUID',
    }

    self.set_progress(100, total=100)

    return fileinfo