Exemplo n.º 1
0
 def test_custom_list(self, mock_mimetypes_init, isfile):
     """The stored definition-file path is passed as ``files`` to the patched init."""
     # Store a custom definition-file location through the Path model.
     custom_definition = Path.objects.create(
         entity="path_mimetypes_definitionfile",
         value='path/to/mime.types',
     )

     identifier = FormatIdentifier(allow_unknown_file_types=True)
     identifier._init_mimetypes()

     expected_files = [custom_definition.value]
     mock_mimetypes_init.assert_called_once_with(files=expected_files)
Exemplo n.º 2
0
    def test_handle_matches_when_no_matches(self, mock_mimetypes_init):
        """With unknown types allowed, an empty match list gives the unknown-format name."""
        identifier = FormatIdentifier(allow_unknown_file_types=True)
        identifier.handle_matches('fullname', [], mock.ANY)

        # Checked in the same order as the attributes are listed here.
        expectations = (
            ('format_name', 'Unknown File Format'),
            ('format_version', None),
            ('format_registry_key', None),
        )
        for attribute, wanted in expectations:
            self.assertEqual(getattr(identifier, attribute), wanted)
Exemplo n.º 3
0
    def test_handle_matches_when_no_match_on_name_or_version_or_reg_key(
            self, mock_mimetypes_init):
        """Matches that carry no usable fields leave all three attributes ``None``."""
        identifier = FormatIdentifier(allow_unknown_file_types=True)
        unusable_matches = [('a', 'b'), ('c', 'd')]

        identifier.handle_matches('fullname', unusable_matches, mock.ANY)

        # Checked in the same order as the attributes are listed here.
        for attribute in ('format_name', 'format_version', 'format_registry_key'):
            self.assertEqual(getattr(identifier, attribute), None)
Exemplo n.º 4
0
    def setUp(self):
        """Write a small ``.txt`` fixture to disk and record its identified format."""
        self.content = b'test file'

        # delete=False keeps the file on disk after it is closed; removal is
        # left to the registered cleanup.
        with tempfile.NamedTemporaryFile(suffix='.txt', delete=False) as handle:
            self.test_file = handle
            self.addCleanup(os.remove, handle.name)

            handle.write(self.content)
            handle.seek(0)

        identifier = FormatIdentifier()
        self.expected = identifier.identify_file_format(self.test_file.name)
Exemplo n.º 5
0
class FormatValidator(BaseValidator):
    """
    Validates the format of a file against the given ``context``.

    The ``allow_unknown_file_types`` option (default ``False``) is forwarded
    to the ``FormatIdentifier`` instance used for identification.
    """
    def __init__(self, *args, **kwargs):
        super(FormatValidator, self).__init__(*args, **kwargs)

        allow_unknown = self.options.get('allow_unknown_file_types', False)
        self.fid = FormatIdentifier(allow_unknown_file_types=allow_unknown)

    def validate(self, filepath, expected=None):
        """
        Compare the identified format of ``filepath`` with ``expected``.

        Args:
            filepath: Path of the file to identify.
            expected: A ``(name, version, registry_key)`` triple. Entries that
                are falsy are not compared; at least one must be non-``None``.

        Raises:
            ValueError: If ``expected`` is ``None`` or all of its entries are
                ``None``.
            ValidationError: If an expected entry differs from the identified
                one. The traceback is stored on the ``Validation`` record
                before the exception is re-raised.

        A ``Validation`` record is created before identification and is always
        saved afterwards with ``time_done``, ``passed`` and ``message``.
        """
        logger.debug('Validating format of %s', filepath)

        # ``expected`` defaults to None; treat that like an all-None triple so
        # the caller gets the descriptive ValueError below instead of an
        # unpacking TypeError.
        if expected is None:
            expected = (None, None, None)

        name, version, reg_key = expected
        if not any(f is not None for f in (name, version, reg_key)):
            raise ValueError(
                'At least one of name, version and registry key is required')

        val_obj = Validation.objects.create(filename=filepath,
                                            time_started=timezone.now(),
                                            validator=self.__class__.__name__,
                                            required=self.required,
                                            task=self.task,
                                            information_package=self.ip,
                                            responsible=self.responsible,
                                            specification={
                                                'context': self.context,
                                                'options': self.options,
                                            })

        passed = False
        try:
            actual_name, actual_version, actual_reg_key = self.fid.identify_file_format(
                filepath)
            if name and name != actual_name:
                raise ValidationError(
                    "format name for {} is not valid, ({} != {})".format(
                        filepath, name, actual_name))
            if version and version != actual_version:
                raise ValidationError(
                    "format version for {} is not valid, ({} != {})".format(
                        filepath, version, actual_version))
            if reg_key and reg_key != actual_reg_key:
                raise ValidationError(
                    "format registry key for {} is not valid, ({} != {})".
                    format(filepath, reg_key, actual_reg_key))

            passed = True
        except ValidationError:
            # Keep the failure details on the record, then let callers see it.
            val_obj.message = traceback.format_exc()
            raise
        else:
            message = 'Successfully validated format of %s' % filepath
            val_obj.message = message
            logger.info(message)
        finally:
            # Runs on success and on any exception so the record is closed.
            val_obj.time_done = timezone.now()
            val_obj.passed = passed
            val_obj.save(update_fields=['time_done', 'passed', 'message'])
Exemplo n.º 6
0
    def __init__(self, filepath=None):
        """Set up the XML parser and format identifier; parse ``filepath`` if given."""
        self.parser = etree.XMLParser(remove_blank_text=True)
        self.fid = FormatIdentifier(allow_unknown_file_types=False)

        # Without a path there is nothing to parse yet, so the tree stays unset.
        self.tree = (
            None if filepath is None
            else etree.parse(filepath, parser=self.parser)
        )
Exemplo n.º 7
0
def index_document(tag_version, filepath):
    """
    Fill ``tag_version.custom_fields`` from ``filepath`` and save a ``File`` doc.

    The file content is base64-encoded and saved through the
    ``ingest_attachment`` pipeline unless the identified registry key is listed
    in ``settings.EXCLUDE_FILE_FORMAT_FROM_INDEXING_CONTENT``; in that case the
    document is saved without content.

    Returns the ``(doc, tag_version)`` pair. ``ElasticsearchException`` raised
    while saving is logged and re-raised.
    """
    excluded_formats = settings.EXCLUDE_FILE_FORMAT_FROM_INDEXING_CONTENT

    identifier = FormatIdentifier()
    format_name, format_version, format_registry_key = identifier.identify_file_format(filepath)
    should_index_content = format_registry_key not in excluded_formats

    package = tag_version.tag.information_package
    extension = os.path.splitext(tag_version.name)[1][1:]
    dirname = os.path.dirname(filepath)

    # Directory of the file relative to the package root; '' for the root itself.
    href = normalize_path(os.path.relpath(dirname, package.object_path))
    if href == '.':
        href = ''

    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

    tag_version.custom_fields = {
        'extension': extension,
        'dirname': dirname,
        'href': href,
        'filename': tag_version.name,
        'size': size,
        'modified': modified,
        'formatname': format_name,
        'formatversion': format_version,
        'formatkey': format_registry_key,
    }

    doc = File.from_obj(tag_version)

    try:
        if not should_index_content:
            logger.debug('Skip to index file content for {}'.format(filepath))
            doc.save()
        else:
            with open(filepath, 'rb') as stream:
                raw_bytes = stream.read()
            doc.data = base64.b64encode(raw_bytes).decode("ascii")
            doc.save(pipeline='ingest_attachment')
    except ElasticsearchException:
        logger.exception('Failed to index {}'.format(filepath))
        raise

    return doc, tag_version
0
def validate_file_format(filename, format_name, format_registry_key,
                         format_version):
    """
    Validates the format of the given file

    Each expected value that is truthy is compared with the value identified
    for ``filename``; falsy expectations are skipped.

    Args:
        filename: Path of the file to identify.
        format_name: Expected format name, or a falsy value to skip the check.
        format_registry_key: Expected registry key, or a falsy value to skip.
        format_version: Expected format version, or a falsy value to skip.

    Returns:
        The string ``"Success"`` when every requested check passes.

    Raises:
        AssertionError: If an identified value differs from its expectation.
            The checks raise explicitly rather than using ``assert`` so they
            are still performed when Python runs with ``-O``.
    """

    fid = FormatIdentifier()
    actual_format_name, actual_format_version, actual_format_registry_key = fid.identify_file_format(
        filename)

    if format_name and actual_format_name != format_name:
        # All three values must be passed to ``%`` as one tuple; formatting
        # with ``filename`` alone fails with "not enough arguments".
        raise AssertionError(
            "format name for %s is not valid, (%s != %s)" % (
                filename, format_name, actual_format_name))

    if format_version and actual_format_version != format_version:
        raise AssertionError(
            "format version for %s is not valid" % filename)

    if format_registry_key and actual_format_registry_key != format_registry_key:
        raise AssertionError(
            "format registry key for %s is not valid" % filename)

    return "Success"
Exemplo n.º 9
0
    def get_path_response(self,
                          path,
                          request,
                          force_download=False,
                          paginator=None):
        """
        Return a file response for ``path``, or a listing when it is not a file.

        ``path`` is first checked with ``self.validate_path``. If the file can
        be opened through ``self.open_file`` it is returned with the mimetype
        determined by ``FormatIdentifier``. When opening fails because the
        target is a directory (or ``path`` is empty), the entries from
        ``self.list_files(path)`` are returned instead, paginated when a
        ``paginator`` is supplied.

        Raises ``exceptions.NotFound`` when opening fails with ``ENOENT``;
        other unexpected ``IOError``/``OSError`` errors are re-raised.
        """
        self.validate_path(path)
        try:
            if not path:
                # An empty path is handled like a directory: the EISDIR error
                # is swallowed below and the listing code at the end runs.
                raise OSError(errno.EISDIR, os.strerror(errno.EISDIR), path)

            if os.path.isfile(self.object_path):
                # The object is a single file. If the first segment of `path`
                # names that file, drop it so the rest is used as the path.
                container_path = os.path.join(
                    os.path.dirname(self.object_path),
                    path.split('/', 1)[0])
                container_path = normalize_path(container_path)
                if container_path == self.object_path:
                    # Raises IndexError when `path` has no '/' at all, i.e. it
                    # names the object file itself (handled further down).
                    path = path.split('/', 1)[1]

            fid = FormatIdentifier(allow_unknown_file_types=True)
            content_type = fid.get_mimetype(path)
            return generate_file_response(self.open_file(path, 'rb'),
                                          content_type,
                                          force_download=force_download,
                                          name=path)
        except (IOError, OSError) as e:
            if e.errno == errno.ENOENT:
                raise exceptions.NotFound

            # Windows raises PermissionDenied (errno.EACCES) when trying to use
            # open() on a directory
            if os.name == 'nt':
                if e.errno not in (errno.EACCES, errno.EISDIR):
                    raise
            elif e.errno != errno.EISDIR:
                raise
            # Reaching this point means "is a directory": fall through to the
            # listing below.
        except IndexError:
            # `path` referred to the object file itself. Only a forced download
            # returns that file; otherwise fall through to the listing.
            if force_download:
                fid = FormatIdentifier(allow_unknown_file_types=True)
                content_type = fid.get_mimetype(path)
                return generate_file_response(self.open_file(
                    self.object_path, 'rb'),
                                              content_type,
                                              force_download=force_download,
                                              name=path)

        # Not served as a file: return the listing, paginated if requested.
        entries = self.list_files(path)
        if paginator is not None:
            paginated = paginator.paginate_queryset(entries, request)
            return paginator.get_paginated_response(paginated)
        return Response(entries)
Exemplo n.º 10
0
 def test_default_list(self, mock_mimetypes_init):
     """The patched init is called exactly once and without arguments."""
     identifier = FormatIdentifier(allow_unknown_file_types=True)

     identifier._init_mimetypes()

     mock_mimetypes_init.assert_called_once_with()
Exemplo n.º 11
0
 def test_gzipped_file(self, mock_mimetypes_init):
     """A ``.tar.gz`` name is reported as ``application/gzip``."""
     identifier = FormatIdentifier(allow_unknown_file_types=True)
     detected = identifier.get_mimetype('foo.tar.gz')
     self.assertEqual(detected, 'application/gzip')
Exemplo n.º 12
0
    def __init__(self, *args, **kwargs):
        """Initialise the base class, then build the format identifier from options."""
        super().__init__(*args, **kwargs)

        # Unknown file types are rejected unless the option enables them.
        self.fid = FormatIdentifier(
            allow_unknown_file_types=self.options.get(
                'allow_unknown_file_types', False),
        )
Exemplo n.º 13
0
def list_files(path, force_download=False, request=None, paginator=None):
    """
    Build a response for ``path``: a listing, or the content of a single file.

    Behaviour by kind of ``path``:

    * a ``list`` - returned as the listing itself;
    * a tar file, or a zip file with a ``.zip`` extension - a listing of the
      regular file members of the archive;
    * any other existing file - a file response with its detected mimetype;
    * a directory - a listing of its direct entries sorted by name;
    * ``<archive>.tar/<member>`` or ``<archive>.zip/<member>`` - a file
      response with the content of that archive member.

    Listings are paginated when ``paginator`` is given (the directory listing
    additionally requires ``request``); otherwise they are returned in a plain
    ``Response``.

    Raises:
        NotFound: If ``path`` matches none of the cases above, or the requested
            archive member does not exist or is not a regular file.
    """
    if isinstance(path, list):
        if paginator is not None:
            paginated = paginator.paginate_queryset(path, request)
            return paginator.get_paginated_response(paginated)
        return Response(path)

    fid = FormatIdentifier(allow_unknown_file_types=True)
    path = path.rstrip('/ ')

    if os.path.isfile(path):
        if tarfile.is_tarfile(path):
            with tarfile.open(path) as tar:
                entries = []
                for member in tar.getmembers():
                    if not member.isfile():
                        continue

                    entries.append({
                        "name": member.name,
                        "type": 'file',
                        "size": member.size,
                        "modified": timestamp_to_datetime(member.mtime),
                    })
                if paginator is not None:
                    paginated = paginator.paginate_queryset(entries, request)
                    return paginator.get_paginated_response(paginated)
                return Response(entries)

        elif zipfile.is_zipfile(path) and os.path.splitext(path)[1] == '.zip':
            with zipfile.ZipFile(path) as zipf:
                entries = []
                for member in zipf.filelist:
                    # Names ending in '/' are directory entries in the archive.
                    if member.filename.endswith('/'):
                        continue

                    entries.append({
                        "name": member.filename,
                        "type": 'file',
                        "size": member.file_size,
                        "modified": datetime(*member.date_time),
                    })
                if paginator is not None:
                    paginated = paginator.paginate_queryset(entries, request)
                    return paginator.get_paginated_response(paginated)
                return Response(entries)

        # The open file object is handed over to the response helper.
        content_type = fid.get_mimetype(path)
        return generate_file_response(open(path, 'rb'), content_type, force_download)

    if os.path.isdir(path):
        entries = []
        for entry in sorted(get_files_and_dirs(path), key=lambda x: x.name):
            entry_type = "dir" if entry.is_dir() else "file"
            size, _ = get_tree_size_and_count(entry.path)

            entries.append(
                {
                    "name": os.path.basename(entry.path),
                    "type": entry_type,
                    "size": size,
                    "modified": timestamp_to_datetime(entry.stat().st_mtime),
                }
            )

        if paginator is not None and request is not None:
            paginated = paginator.paginate_queryset(entries, request)
            return paginator.get_paginated_response(paginated)
        # Without pagination the listing must still be returned; falling
        # through would end in NotFound for an existing directory.
        return Response(entries)

    if len(path.split('.tar/')) == 2:
        tar_path, tar_subpath = path.split('.tar/')
        tar_path += '.tar'

        with tarfile.open(tar_path) as tar:
            try:
                extracted = tar.extractfile(tar_subpath)
            except KeyError:
                raise NotFound
            # extractfile() returns None for members that are neither regular
            # files nor links (e.g. directories); there is nothing to serve.
            if extracted is None:
                raise NotFound

            f = io.BytesIO(extracted.read())
            content_type = fid.get_mimetype(tar_subpath)
            return generate_file_response(f, content_type, force_download, name=tar_subpath)

    if len(path.split('.zip/')) == 2:
        zip_path, zip_subpath = path.split('.zip/')
        zip_path += '.zip'

        with zipfile.ZipFile(zip_path) as zipf:
            try:
                f = io.BytesIO(zipf.read(zip_subpath))
                content_type = fid.get_mimetype(zip_subpath)
                return generate_file_response(f, content_type, force_download, name=zip_subpath)
            except KeyError:
                raise NotFound

    raise NotFound
Exemplo n.º 14
0
 def cli(path):
     # Identify the format of ``path`` and echo the result unchanged.
     identification = FormatIdentifier().identify_file_format(path)
     click.echo(identification)
Exemplo n.º 15
0
    def test_handle_matches_when_no_matches_and_unknown_types_not_allowed(
            self, mock_mimetypes_init):
        """With unknown types disallowed, an empty match list raises ``ValueError``."""
        strict_identifier = FormatIdentifier(allow_unknown_file_types=False)

        self.assertRaises(
            ValueError,
            strict_identifier.handle_matches, 'fullname', [], mock.ANY,
        )
Exemplo n.º 16
0
 def test_unknown_content_type_when_not_allowed_should_raise_exception(
         self, mock_mimetypes_init):
     """With unknown types disallowed, an unrecognised name raises ``FileFormatNotAllowed``."""
     strict_identifier = FormatIdentifier(allow_unknown_file_types=False)

     self.assertRaises(
         FileFormatNotAllowed,
         strict_identifier.get_mimetype, 'some_random_file',
     )
Exemplo n.º 17
0
 def test_unknown_content_type(self, mock_mimetypes_init):
     """With unknown types allowed, an unrecognised name maps to ``DEFAULT_MIMETYPE``."""
     identifier = FormatIdentifier(allow_unknown_file_types=True)
     detected = identifier.get_mimetype('some_random_file')
     self.assertEqual(detected, DEFAULT_MIMETYPE)
Exemplo n.º 18
0
    def fid(self):
        if self._fid is not None:
            return self._fid

        self._fid = FormatIdentifier()