示例#1
0
def find_files(xmlfile, rootdir='', prefix='', skip_files=None):
    """Collect the file entries described by an XML file and its pointers.

    Every element whose local name is a key of ``FILE_ELEMENTS`` is wrapped
    in an ``XMLFileElement``. Its path is joined onto ``prefix`` and passed
    through ``win_to_posix`` before being compared against ``skip_files``.
    Each pointer returned by ``find_pointers`` is added as well, and the XML
    file it refers to is scanned recursively.

    :param xmlfile: XML file to parse
    :param rootdir: handed to ``XMLFileElement`` and used as the base
        directory when resolving pointer paths
    :param prefix: path prepended to every file path found directly in
        ``xmlfile``
    :param skip_files: paths to leave out of the result; not forwarded to
        the recursive calls made for pointers
    :return: set containing the ``XMLFileElement`` objects and pointers found
    """
    skip_files = [] if skip_files is None else skip_files
    tree = etree.ElementTree(file=xmlfile)
    found = set()

    for elname, props in six.iteritems(FILE_ELEMENTS):
        matches = tree.xpath('.//*[local-name()="%s"]' % elname)

        # Remove first object in premis file if it is a "fake" entry
        # describing the tar
        if matches:
            first = matches[0]
            if first.get('{%s}type' % XSI_NAMESPACE) == 'premis:file':
                tar_format = first.xpath(
                    './/*[local-name()="formatName"][. = "TAR"]')
                if tar_format:
                    del matches[0]

        for element in matches:
            entry = XMLFileElement(element, props, rootdir=rootdir)
            entry.path = win_to_posix(os.path.join(prefix, entry.path))

            if entry.path not in skip_files:
                found.add(entry)

    for pointer in find_pointers(xmlfile=xmlfile):
        # Files listed in the pointed-to XML are prefixed with the
        # directory part of the pointer path.
        nested_prefix = os.path.dirname(pointer.path)
        if pointer.path not in skip_files:
            found.add(pointer)
        nested_xml = os.path.join(rootdir, pointer.path)
        found.update(find_files(nested_xml, rootdir, nested_prefix))

    return found
示例#2
0
def parse_file(filepath,
               fid,
               relpath=None,
               algorithm='SHA-256',
               rootdir='',
               provided_data=None):
    """Build the metadata dictionary describing a single file.

    Expensive values (creation date, checksum, encryption and format
    identification) are only computed when the corresponding key is missing
    from ``provided_data``. Every entry of ``provided_data`` is copied into
    the result last, so it overrides any computed or default value.

    :param filepath: path of the file to inspect
    :param fid: object providing ``get_mimetype``,
        ``identify_file_encryption`` and ``identify_file_format``
    :param relpath: path stored as ``href`` and used for the name and
        extension; falls back to ``filepath`` when empty
    :param algorithm: checksum algorithm name stored as ``FChecksumType``
        and passed to ``checksum.calculate_checksum``
    :param rootdir: value stored as ``FDir``
    :param provided_data: precomputed values that take precedence over the
        ones computed here
    :return: dict of file information
    """
    relpath = relpath or filepath
    supplied = {} if provided_data is None else provided_data
    relpath = win_to_posix(relpath)

    parent_dir = os.path.dirname(filepath)
    extension = os.path.splitext(relpath)[1]

    fileinfo = {
        'FName': os.path.basename(relpath),
        # Drop the leading dot of the extension
        'FExtension': extension[1:],
        'FDir': rootdir,
        'FParentDir': os.path.basename(parent_dir),
        'FID': str(uuid.uuid4()),
        'daotype': "borndigital",
        'href': relpath,
        'FMimetype': fid.get_mimetype(filepath),
        'FSize': str(os.path.getsize(filepath)),
        'FUse': 'Datafile',
        'FChecksumType': algorithm,
        'FLoctype': 'URL',
        'FLinkType': 'simple',
        'FChecksumLib': 'ESSArch',
        'FIDType': 'UUID',
    }

    # Heavy computations are skipped when the caller already supplied the
    # value in provided_data.
    if 'FCreated' not in supplied:
        created = timestamp_to_datetime(creation_date(filepath))
        fileinfo['FCreated'] = created.isoformat()

    if 'FChecksum' not in supplied:
        fileinfo['FChecksum'] = checksum.calculate_checksum(
            filepath, algorithm)

    if 'FEncrypted' not in supplied:
        fileinfo['FEncrypted'] = fid.identify_file_encryption(filepath)

    format_keys = ('FFormatName', 'FFormatVersion', 'FFormatRegistryKey')
    if not all(key in supplied for key in format_keys):
        # One identification call yields all three values, in key order.
        identified = fid.identify_file_format(filepath)
        (fileinfo['FFormatName'],
         fileinfo['FFormatVersion'],
         fileinfo['FFormatRegistryKey']) = identified

    # Caller-supplied values win over everything set above.
    fileinfo.update(supplied.items())

    return fileinfo
示例#3
0
def find_files(xmlfile, rootdir='', prefix='', skip_files=None, recursive=True, current_dir=None):
    """Collect the file entries described by an XML file and its pointers.

    Every element whose local name is a key of ``FILE_ELEMENTS`` is wrapped
    in an ``XMLFileElement``. Its path is joined onto ``prefix`` and passed
    through ``win_to_posix`` before being compared against ``skip_files``.
    When ``recursive`` is true, each pointer returned by ``find_pointers``
    is added too and the XML file it refers to is scanned in turn.

    :param xmlfile: XML file to parse
    :param rootdir: handed to ``XMLFileElement``; prefixes of nested files
        are computed relative to it
    :param prefix: path prepended to every file path (and pointer path)
        found directly in ``xmlfile``
    :param skip_files: paths to leave out of the result; not forwarded to
        the recursive calls made for pointers
    :param recursive: whether to follow pointers to other XML files
    :param current_dir: directory against which pointer paths in
        ``xmlfile`` are resolved; defaults to ``rootdir``
    :return: set containing the ``XMLFileElement`` objects and pointers found
    """
    doc = etree.ElementTree(file=xmlfile)
    files = set()

    if skip_files is None:
        skip_files = []

    if current_dir is None:
        current_dir = rootdir

    for elname, props in FILE_ELEMENTS.items():
        file_elements = doc.xpath('.//*[local-name()="%s"]' % elname)

        # Remove first object in premis file if it is a "fake" entry describing the tar
        if len(file_elements) and file_elements[0].get('{%s}type' % XSI_NAMESPACE) == 'premis:file':
            # In XPath 1 we use translate() to make a case insensitive comparison
            xpath_upper = 'translate(.,"abcdefghijklmnopqrstuvwxyz","ABCDEFGHIJKLMNOPQRSTUVWXYZ")'
            xpath_query = './/*[local-name()="formatName"][{up} = "TAR" or {up} = "ZIP"]'.format(up=xpath_upper)
            if len(file_elements[0].xpath(xpath_query)):
                file_elements.pop(0)

        for el in file_elements:
            file_el = XMLFileElement(el, props, rootdir=rootdir)
            file_el.path = win_to_posix(os.path.join(prefix, file_el.path))

            if file_el.path in skip_files:
                continue

            files.add(file_el)

    if recursive:
        for pointer in find_pointers(xmlfile=xmlfile):
            # Use per-pointer locals: every pointer must be resolved against
            # the directory and prefix of *this* XML file. Rebinding
            # current_dir/prefix here would make the second and later
            # pointers resolve relative to the previous pointer's directory.
            pointer_dir = os.path.join(current_dir, os.path.dirname(pointer.path))
            pointer_path = os.path.join(pointer_dir, os.path.basename(pointer.path))

            # The skip check uses the path as written in the XML, before
            # the prefix is applied.
            if pointer.path not in skip_files:
                pointer.path = os.path.join(prefix, pointer.path)
                files.add(pointer)

            # Files listed in the pointed-to XML are reported relative to
            # rootdir; relpath yields '.' when both directories coincide.
            pointer_prefix = os.path.relpath(pointer_dir, rootdir)
            if pointer_prefix == '.':
                pointer_prefix = ''

            files |= find_files(
                pointer_path,
                rootdir,
                pointer_prefix,
                recursive=recursive,
                current_dir=pointer_dir,
            )

    return files
示例#4
0
    def run(self, dirname=None, files=None, files_reldir=None, xmlfile=None):
        """Check that the files listed in an XML file match the files on disk.

        The logical set is built from every element in ``xmlfile`` whose
        local name is a key of ``settings.FILE_ELEMENTS``. The physical set
        is built from the files found below ``dirname`` (the XML file itself
        excluded) plus the entries of ``files``.

        :param dirname: directory to walk for physical files (optional)
        :param files: additional physical file paths; ``None`` means none
        :param files_reldir: if set, entries of ``files`` are made relative
            to this directory
        :param xmlfile: XML file describing the logical files
        :return: the string ``"Success"``
        :raises AssertionError: if the logical and physical sets differ
        """
        if files is None:
            files = []

        if dirname:
            xmlrelpath = os.path.relpath(xmlfile, dirname)
            xmlrelpath = remove_prefix(xmlrelpath, "./")
        else:
            xmlrelpath = xmlfile

        doc = etree.ElementTree(file=xmlfile)

        logical_files = set()
        physical_files = set()

        for elname, props in settings.FILE_ELEMENTS.items():
            for el in doc.xpath('.//*[local-name()="%s"]' % elname):
                filename = get_value_from_path(el, props["path"])

                if filename:
                    filename = remove_prefix(filename, props.get("pathprefix", ""))
                    logical_files.add(filename)

        if dirname:
            for dirpath, _dirs, filenames in os.walk(dirname):
                for name in filenames:
                    reldir = os.path.relpath(dirpath, dirname)
                    relfile = os.path.join(reldir, name)
                    relfile = win_to_posix(relfile)
                    relfile = remove_prefix(relfile, "./")

                    # Compare the path relative to dirname, not the bare
                    # file name: this excludes exactly the XML file, even
                    # when it lives in a subdirectory, and keeps same-named
                    # files located elsewhere in the tree.
                    # NOTE(review): xmlrelpath is not passed through
                    # win_to_posix - confirm that is fine on Windows.
                    if relfile != xmlrelpath:
                        physical_files.add(relfile)

        for f in files:
            if files_reldir:
                f = os.path.relpath(f, files_reldir)
            physical_files.add(f)

        # Raised explicitly so the check is not stripped when Python runs
        # with -O; exception type and message are unchanged.
        if logical_files != physical_files:
            raise AssertionError("the logical representation differs from the physical")
        self.set_progress(100, total=100)
        return "Success"
示例#5
0
    def run(self,
            dirname=None,
            files=[],
            files_reldir=None,
            xmlfile=None,
            rootdir=""):
        """Check that the files found via ``find_files`` match the files on disk.

        The logical set is the result of ``find_files(xmlfile, rootdir)``.
        The physical set holds the paths, relative to ``dirname``, of every
        file found by ``walk`` (the XML file itself excluded) plus the
        entries of ``files``.

        :param dirname: directory to walk for physical files (optional)
        :param files: additional physical file paths
        :param files_reldir: if set, entries of ``files`` are made relative
            to it; an entry equal to it is reduced to its base name
        :param xmlfile: XML file describing the logical files
        :param rootdir: passed on to ``find_files``
        :return: the string ``"Success"``
        :raises AssertionError: if the logical and physical sets differ
        """
        xmlrelpath = xmlfile
        if dirname:
            xmlrelpath = remove_prefix(os.path.relpath(xmlfile, dirname), "./")

        logical_files = find_files(xmlfile, rootdir)
        physical_files = set()

        if dirname:
            for current, _subdirs, names in walk(dirname):
                for name in names:
                    joined = os.path.join(os.path.relpath(current, dirname), name)
                    candidate = remove_prefix(win_to_posix(joined), "./")

                    # The XML file describing the package is not itself
                    # part of the physical set.
                    if candidate == xmlrelpath:
                        continue
                    physical_files.add(candidate)

        for entry in files:
            if not files_reldir:
                physical_files.add(entry)
            elif entry == files_reldir:
                # An entry equal to files_reldir is recorded by base name
                # instead of being made relative to itself.
                physical_files.add(os.path.basename(entry))
            else:
                physical_files.add(os.path.relpath(entry, files_reldir))

        assert logical_files == physical_files, "the logical representation differs from the physical"
        return "Success"
示例#6
0
    def __init__(self, *args, **kwargs):
        """Initialise the instance and index the logical files by checksum.

        Reads ``rootdir``, ``recursive`` and ``default_algorithm`` from
        ``self.options``, normalises ``self.context``, calls
        ``self._get_files()`` and then records path, checksum, checksum
        type and size for every entry of ``self.logical_files``.

        :raises ValueError: if no context is set
        """
        super().__init__(*args, **kwargs)

        if not self.context:
            raise ValueError('A context (xml) is required')

        self.context = normalize_path(self.context)

        options = self.options
        self.rootdir = options.get('rootdir')
        self.recursive = options.get('recursive', True)
        self.default_algorithm = options.get('default_algorithm', 'SHA-256')

        self.initial_present = {}  # Map checksum -> list of fnames
        self.initial_deleted = {}  # Map checksum -> list of fnames
        self.sizes = {}  # Map fname -> size
        self.checksums = {}  # Map fname -> checksum
        self.checksum_algorithms = {}  # Map fname -> checksum algorithm

        self._get_files()
        for entry in self.logical_files:
            # NOTE(review): os.path.join with a single argument does not
            # prepend rootdir - confirm whether joining with self.rootdir
            # was intended here.
            raw_path = os.path.join(entry.path) if self.rootdir is not None else entry.path
            posix_path = win_to_posix(raw_path)
            digest = entry.checksum

            # Several files may share a checksum, so each map keeps a list
            # of paths per checksum.
            self.initial_deleted.setdefault(digest, []).append(posix_path)
            self.initial_present.setdefault(digest, []).append(posix_path)

            self.checksums[posix_path] = digest
            self.checksum_algorithms[posix_path] = entry.checksum_type
            self.sizes[posix_path] = entry.size
示例#7
0
    def run(self,
            filepath=None,
            mimetype=None,
            relpath=None,
            algorithm='SHA-256',
            rootdir=''):
        """Build the metadata dictionary describing a single file.

        The checksum and the file format are obtained by creating and
        running two ``ProcessTask`` objects
        (``ESSArch_Core.tasks.CalculateChecksum`` and
        ``ESSArch_Core.tasks.IdentifyFileFormat``) tied to this task's
        step, responsible and information package.

        :param filepath: path of the file to inspect
        :param mimetype: value stored as ``FMimetype``
        :param relpath: path stored as ``href`` and used for ``FName``;
            falls back to ``filepath`` when empty
        :param algorithm: checksum algorithm passed to the checksum task and
            stored as ``FChecksumType``
        :param rootdir: value stored as ``FDir``
        :return: dict of file information
        """
        relpath = win_to_posix(relpath or filepath)

        created = timestamp_to_datetime(creation_date(filepath))

        # Both sub-tasks belong to the same step, user and package.
        shared = {
            'processstep_id': self.step,
            'responsible_id': self.responsible,
            'information_package_id': self.ip,
        }

        checksum_task = ProcessTask(
            name="ESSArch_Core.tasks.CalculateChecksum",
            params={"filename": filepath, "algorithm": algorithm},
            **shared)

        fileformat_task = ProcessTask(
            name="ESSArch_Core.tasks.IdentifyFileFormat",
            params={"filename": filepath},
            **shared)

        ProcessTask.objects.bulk_create([checksum_task, fileformat_task])

        digest = checksum_task.run().get()
        self.set_progress(50, total=100)
        # The format task result unpacks into name, version, registry key.
        fmt_name, fmt_version, fmt_registry_key = fileformat_task.run().get()

        return {
            'FName': os.path.basename(relpath),
            'FDir': rootdir,
            'FChecksum': digest,
            'FID': str(uuid.uuid4()),
            'daotype': "borndigital",
            'href': relpath,
            'FMimetype': mimetype,
            'FCreated': created.isoformat(),
            'FFormatName': fmt_name,
            'FFormatVersion': fmt_version,
            'FFormatRegistryKey': fmt_registry_key,
            'FSize': str(os.path.getsize(filepath)),
            'FUse': 'Datafile',
            'FChecksumType': algorithm,
            'FLoctype': 'URL',
            'FLinkType': 'simple',
            'FChecksumLib': 'hashlib',
            'FLocationType': 'URI',
            'FIDType': 'UUID',
        }
示例#8
0
 def normalize_paths(self, expected_file_names):
     """Return a list with ``win_to_posix`` applied to every given file name."""
     return list(map(win_to_posix, expected_file_names))
示例#9
0
    def run(self, filepath=None, mimetype=None, relpath=None, algorithm='SHA-256'):
        """Build the metadata dictionary describing a single file.

        The checksum and the file format are obtained by creating and
        eagerly running two ``ProcessTask`` objects
        (``preingest.tasks.CalculateChecksum`` and
        ``preingest.tasks.IdentifyFileFormat``). When this task has a task
        object, its log, information package, responsible and (if set)
        process step are copied onto both sub-tasks.

        :param filepath: path of the file to inspect
        :param mimetype: value stored as ``FMimetype``
        :param relpath: path stored as ``href`` and used for ``FName``;
            falls back to ``filepath`` when empty
        :param algorithm: checksum algorithm passed to the checksum task and
            stored as ``FChecksumType``
        :return: dict of file information
        """
        if not relpath:
            relpath = filepath

        relpath = win_to_posix(relpath)

        timestamp = creation_date(filepath)
        createdate = timestamp_to_datetime(timestamp)

        checksum_task = ProcessTask.objects.create(
            name="preingest.tasks.CalculateChecksum",
            params={
                "filename": filepath,
                "algorithm": algorithm
            }
        )

        fileformat_task = ProcessTask.objects.create(
            name="preingest.tasks.IdentifyFileFormat",
            params={
                "filename": filepath,
            }
        )

        # Guard every access to the task object: it was previously
        # dereferenced before the None check, so a missing task object
        # raised AttributeError after both sub-tasks were already created.
        taskobj = self.taskobj
        if taskobj is not None:
            for subtask in (checksum_task, fileformat_task):
                subtask.log = taskobj.log
                subtask.information_package = taskobj.information_package
                subtask.responsible = taskobj.responsible

                if taskobj.processstep is not None:
                    subtask.processstep = taskobj.processstep

        checksum_task.save()
        fileformat_task.save()

        checksum = checksum_task.run_eagerly()
        self.set_progress(50, total=100)
        fileformat = fileformat_task.run_eagerly()

        fileinfo = {
            'FName': os.path.basename(relpath),
            'FChecksum': checksum,
            'FID': str(uuid.uuid4()),
            'daotype': "borndigital",
            'href': relpath,
            'FMimetype': mimetype,
            'FCreated': createdate.isoformat(),
            'FFormatName': fileformat,
            'FSize': str(os.path.getsize(filepath)),
            'FUse': 'Datafile',
            'FChecksumType': algorithm,
            'FLoctype': 'URL',
            'FLinkType': 'simple',
            'FChecksumLib': 'hashlib',
            'FLocationType': 'URI',
            'FIDType': 'UUID',
        }

        self.set_progress(100, total=100)

        return fileinfo