Exemplo n.º 1
0
def detect_local_source(path, content, mime_type=None, encoding=None):
    """Build a ``Source`` describing the local file at ``path``.

    When the ``magic`` module is available, the encoding, MIME type and
    description it detects from ``content`` take precedence over the values
    passed in. Otherwise ``chardet`` guesses the encoding and ``mimetypes``
    guesses the MIME type from the file name (only if none was passed in).

    The plugin name is resolved by ``plugin_name_by_mime_type`` from the MIME
    type, the magic description and the file extension. An encoding reported
    as ``'binary'`` is returned as ``None``.
    """
    # TODO: may add sample_size

    filename = os.path.basename(path)
    # Text after the last dot, or None when the name contains no dot at all.
    _, dot, suffix = filename.rpartition('.')
    extension = suffix if dot else None

    if magic is None:
        guessed_encoding = chardet.detect(content)['encoding']
        encoding = guessed_encoding or encoding
        mime_name = None
        mime_type = mime_type or mimetypes.guess_type(filename)[0]
    else:
        detected = magic.detect_from_content(content)
        encoding = detected.encoding or encoding
        mime_name = detected.name
        mime_type = detected.mime_type or mime_type

    plugin_name = plugin_name_by_mime_type(mime_type, mime_name, extension)

    return Source(uri=path,
                  plugin_name=plugin_name,
                  encoding=None if encoding == 'binary' else encoding)
Exemplo n.º 2
0
def download_file(uri, verify_ssl):
    """Download ``uri`` into a temporary file and report its encoding.

    The file suffix is derived from the ``content-type`` response header
    (sub-type, without parameters) or, when that header is missing, from the
    text after the last dot of the last URI segment.

    The encoding comes from ``magic`` when that module is available and from
    the HTTP response otherwise.

    Returns:
        dict: ``{'filename': <path of the written file>,
        'encoding': <detected encoding>}``. The file is not removed
        automatically; the caller is responsible for deleting it.
    """
    response = requests.get(uri, verify=verify_ssl)
    content = response.content
    if magic is not None:
        encoding = magic.detect_from_content(content).encoding
    else:
        encoding = response.encoding

    # TODO: try to guess with uri.split('/')[-1].split('.')[-1].lower()
    # TODO: try to guess with file-magic lib
    try:
        content_type = response.headers['content-type']
    except KeyError:
        # str.split() always yields at least one item, so this cannot fail.
        plugin_name = uri.split('/')[-1].split('.')[-1].lower()
    else:
        plugin_name = content_type.split('/')[-1].split(';')[0].lower()

    # Create the final file atomically. Closing a NamedTemporaryFile and then
    # re-opening a name derived from it leaves a window in which another
    # process can claim that path.
    with tempfile.NamedTemporaryFile(suffix='.{}'.format(plugin_name),
                                     delete=False) as fobj:
        fobj.write(content)
        filename = fobj.name

    return {
        'filename': filename,
        'encoding': encoding,
    }
Exemplo n.º 3
0
def get_magics(path):
    """Get file format and encoding.

    The magic library is not really good at detecting text file-based formats
    like CSV, JSON, YAML or XML, so we only use it to detect binary formats
    and the encoding.

    Supports both file-magic and python-magic, as both are shipped under the
    same module name in various distributions.

    Returns:
        tuple: ``(format, encoding)``. ``format`` is one of ``'gpg'``,
        ``'sqlite3'``, ``'kdbx'``, ``'openssl'`` or ``None``; ``encoding`` is
        ``'utf-8-sig'`` when a UTF-8 BOM is reported, else ``None``. Both are
        ``None`` when ``MAGIC`` is falsy.
    """
    if not MAGIC:
        return None, None

    # Only the first 2048 bytes are handed to libmagic.
    with open(path, 'rb') as file:
        header = file.read(2048)

    if hasattr(magic, 'detect_from_content'):  # file-magic
        res = magic.detect_from_content(header)
        mime_type = res.mime_type
        magic_name = res.name
    else:  # python-magic
        mime_type = magic.from_buffer(header, mime=True)
        magic_name = magic.from_buffer(header)

    mime_to_format = {
        'application/pgp': 'gpg',
        'application/x-sqlite3': 'sqlite3'
    }
    name_to_format = {'KDBX': 'kdbx', 'openssl': 'openssl', 'PGP': 'gpg'}

    frmt = mime_to_format.get(mime_type, None)
    # A match on the textual description overrides the MIME-based guess;
    # when several names match, the last one in the mapping wins.
    for name, plain_format in name_to_format.items():
        if name in magic_name:
            frmt = plain_format

    # The encoding reported by libmagic is deliberately not used.
    encoding = None
    if 'UTF-8 Unicode (with BOM)' in magic_name:
        encoding = 'utf-8-sig'

    return frmt, encoding
Exemplo n.º 4
0
 def readdir(self, path, fh):
     """Yield the directory entries for ``path``.

     For the root (``'/'`` or an empty path) the entries are ``'.'``,
     ``'..'`` and every key of ``self.track_dir``.

     For any other path the track named by ``path[1:]`` is downloaded, the
     response is stored in the track's slot 1, and a single entry
     ``<tid>_<date>_<time><extension>`` is produced. Date and time come from
     the first ``<time>`` element found in the (decompressed) payload; the
     placeholders ``nodate`` / ``notime`` are used when the payload type is
     not supported or no timestamp is found.
     """
     dirents = ['.', '..']
     if path in ('', '/'):
         dirents.extend(self.track_dir)
     else:
         track = self.track_dir[path[1:]]
         tid = track[0]
         data = requests.get(_urlget % tid, auth=(self.user, self.password))
         disposition = data.headers['Content-Disposition']
         logging.debug(disposition)
         filename = re.sub(r'.*filename="([^"]+?)".*', r'\1', disposition)
         logging.debug('filename: {}'.format(filename))
         # Everything from the first dot on; empty when there is no dot.
         ext_match = re.match(r'([^.]+)(\..*)$', filename)
         extension = ext_match.group(2) if ext_match else ''
         track[1] = data
         detected_type = magic.detect_from_content(data.content).mime_type
         type2fun = {'application/x-bzip2': bz2.decompress,
                     'application/x-gzip': gzip.decompress,
                     'text/xml': bytes,
                     'text/plain': bytes}
         # Defaults keep the entry name well-formed when the payload cannot
         # be decoded or carries no timestamp.
         date_data = b'nodate'
         time_data = b'notime'
         conv_fun = type2fun.get(detected_type, None)
         if conv_fun:
             timedate_data = re.search(b'<time>([^T]+)T([^:]+:[^:]+):',
                                       conv_fun(data.content))
             if timedate_data:
                 date_data = timedate_data.group(1)
                 time_data = timedate_data.group(2)
         dirents.append('{0}_{1}_{2}{3}'.format(tid,
                                                date_data.decode('ascii'),
                                                time_data.decode('ascii'),
                                                extension))
     for r in dirents:
         yield r
Exemplo n.º 5
0
def get_content_type(response: 'Response') -> str:
    """Return the normalised content type of ``response``.

    Args:
        response (:class:`requests.Response`): Response object.

    Returns:
        The media type in case-folded form, with any parameters after the
        first ``;`` and surrounding whitespace removed.

    Note:
        If the ``Content-Type`` header is not defined in ``response``,
        |magic|_ is used to detect the type from the body; if that detection
        raises, the literal ``'(null)'`` is returned.

    .. |Response| replace:: ``requests.Response``.
    .. _Response: https://requests.readthedocs.io/en/latest/api/index.html#requests.Response

    .. |magic| replace:: ``magic``
    .. _magic: https://pypi.org/project/python-magic/

    """
    header_value = response.headers.get('Content-Type')
    if header_value is not None:
        raw_type = header_value
    else:
        try:
            raw_type = magic.detect_from_content(response.content).mime_type
        except Exception:
            raw_type = '(null)'
    # Keep only the media type; drop parameters such as "; charset=...".
    media_type, _sep, _params = raw_type.casefold().partition(';')
    return media_type.strip()
Exemplo n.º 6
0
def get_mimetype_and_encoding_for_content(content):
    """Return the MIME type and the encoding associated with a content
    buffer, using the magic module under the hood.

    Args:
        content (bytes): a content buffer

    Returns:
        A tuple (mimetype, encoding), for instance ('text/plain', 'us-ascii'),
        associated to the provided content.

    Raises:
        Exception: whatever ``magic.detect_from_content`` raised on the last
            attempt, if every attempt failed.

    """
    # Bounded retry: the previous unbounded loop never terminated when the
    # failure was persistent (e.g. an argument libmagic cannot handle).
    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        try:
            magic_result = magic.detect_from_content(content)
            mime_type = magic_result.mime_type
            encoding = magic_result.encoding
            break
        except Exception:
            if attempt == max_attempts:
                raise
            # Work around an issue with the magic module, which can fail
            # if detect_from_content is called multiple times in
            # a short amount of time.
            reload(magic)

    return mime_type, encoding
Exemplo n.º 7
0
def detect_local_source(path, content, mime_type=None, encoding=None):
    """Describe a local file as a ``Source``.

    Detection prefers ``magic`` (encoding, MIME type and description read from
    ``content``); without it, ``chardet`` supplies the encoding and
    ``mimetypes`` supplies a MIME type guessed from the file name when the
    caller gave none. The ``mime_type`` and ``encoding`` arguments act as
    fallbacks for whatever detection leaves empty.

    The returned ``Source`` carries ``path`` as its URI, the plugin name
    chosen by ``plugin_name_by_mime_type`` and the encoding, with
    ``'binary'`` mapped to ``None``.
    """
    # TODO: may add sample_size

    filename = os.path.basename(path)
    # Extension is whatever follows the last dot; None if there is no dot.
    _, dot, suffix = filename.rpartition('.')
    extension = suffix if dot else None

    if magic is None:
        chardet_encoding = chardet.detect(content)['encoding']
        encoding = chardet_encoding or encoding
        mime_name = None
        mime_type = mime_type or mimetypes.guess_type(filename)[0]
    else:
        detected = magic.detect_from_content(content)
        encoding = detected.encoding or encoding
        mime_name = detected.name
        mime_type = detected.mime_type or mime_type

    plugin_name = plugin_name_by_mime_type(mime_type, mime_name, extension)
    final_encoding = None if encoding == 'binary' else encoding

    return Source(uri=path,
                  plugin_name=plugin_name,
                  encoding=final_encoding)
Exemplo n.º 8
0
 def TypeOfBuffer(buf: bytearray, default: str = None) -> str:
     """Return the MIME type detected for ``buf``, or ``default`` on failure.

     Any ordinary exception raised during detection is treated as "type
     unknown". ``KeyboardInterrupt`` and ``SystemExit`` are no longer
     swallowed.
     """
     try:
         return magic.detect_from_content(buf).mime_type
     except Exception:
         return default
Exemplo n.º 9
0
 def _getInstalledKernel(self):
     """Locate the configured kernel file and record its version.

     Every ``fileChecks/file`` element of ``self.cfg`` whose ``isKernel``
     attribute is ``true`` or ``1`` (case-insensitive) sets
     ``self.kernelFile`` to its text; the last such element wins. If a kernel
     file is set, ``/boot/<kernelFile>`` is read and the comma-separated magic
     description is searched for a ``version <value>`` field, whose first word
     is stored in ``self.installedKernVer``.

     Raises:
         RuntimeError: if ``self.cfg`` does not support ``len()``.

     Returns:
         An empty tuple.
     """
     # Could we maybe remove the dependency for the "magic" module with a struct?
     # http://lxr.linux.no/#linux+v2.6.39/Documentation/x86/boot.txt
     # https://stackoverflow.com/a/11179559/733214
     try:
         len(self.cfg)
     except TypeError:
         raise RuntimeError('Tried to find the isKernel with no config set up and parsed')
     file_xpath = '{0}fileChecks/{0}file'.format(self.ns)
     for node in self.cfg.findall(file_xpath):
         flag = node.attrib.get('isKernel', 'false').lower()
         if flag in ('true', '1'):
             self.kernelFile = node.text
     if self.kernelFile:
         kernel_path = os.path.join('/boot', self.kernelFile)
         with open(kernel_path, 'rb') as fh:
             magicname = magic.detect_from_content(fh.read())
         for chunk in magicname.name.split(','):
             chunk = chunk.strip()
             if chunk == '':
                 continue
             fields = chunk.split(None, 1)
             if len(fields) != 2:
                 continue
             key, value = fields
             # Note: this only grabs the version number.
             # If we want to get e.g. the build user/machine, date, etc.,
             # then we need to do a join. Shouldn't be necessary, though.
             if key.lower() == 'version':
                 self.installedKernVer = value.split(None, 1)[0]
     return ()
Exemplo n.º 10
0
def download_file(uri, verify_ssl):
    """Fetch ``uri`` and store the body in a temporary file.

    The suffix of the file is the sub-type of the ``content-type`` header
    (parameters stripped), or the text after the last dot of the last URI
    segment when the header is absent. The encoding is detected with
    ``magic`` when available, otherwise taken from the HTTP response.

    Returns:
        dict: ``{'filename': ..., 'encoding': ...}``. The file persists after
        the call; deleting it is up to the caller.
    """
    response = requests.get(uri, verify=verify_ssl)
    content = response.content
    if magic is not None:
        encoding = magic.detect_from_content(content).encoding
    else:
        encoding = response.encoding

    # TODO: try to guess with uri.split('/')[-1].split('.')[-1].lower()
    # TODO: try to guess with file-magic lib
    try:
        content_type = response.headers['content-type']
    except KeyError:
        # str.split() never returns an empty list, so no IndexError here.
        plugin_name = uri.split('/')[-1].split('.')[-1].lower()
    else:
        plugin_name = content_type.split('/')[-1].split(';')[0].lower()

    # Let tempfile create the suffixed file in one step instead of closing a
    # temporary file and re-opening a guessable name derived from it.
    with tempfile.NamedTemporaryFile(suffix='.{}'.format(plugin_name),
                                     delete=False) as fobj:
        fobj.write(content)
        filename = fobj.name

    return {'filename': filename, 'encoding': encoding, }
Exemplo n.º 11
0
 def test_detect_from_content(self):
     """Check detection from the first 4096 bytes of the sample file."""
     # Differs from upstream by opening the file in binary mode; this
     # avoids hitting a bug in the python3+libfile bindings. See
     # https://github.com/ahupp/python-magic/issues/152 for a similar issue.
     with open(self.filename, 'rb') as sample:
         head = sample.read(4096)
         detected = magic.detect_from_content(head)
     self.assert_result(detected)
Exemplo n.º 12
0
def check_mime_type(data: bytes, valid_types: List[str]) -> bool:
    """Return whether the MIME type detected for ``data`` is in ``valid_types``."""
    mime_type = cast(str, magic.detect_from_content(data).mime_type)
    return mime_type in valid_types
Exemplo n.º 13
0
def get_buffer_mime_type(buffer):
    """Return the MIME type of the first 128 bytes of ``buffer``.

    Works with either ``magic`` binding; any exception raised along the way
    results in an empty string.
    """
    try:
        if not hasattr(magic, 'detect_from_content'):
            # Using python-magic module: https://github.com/ahupp/python-magic
            return magic.from_buffer(buffer[:128], mime=True)
        # Using file-magic module: https://github.com/file/file
        detected = magic.detect_from_content(buffer[:128])
        return detected.mime_type
    except Exception:
        return ''
Exemplo n.º 14
0
 def magic_type(self, data, isdata=False):
     """Return the magic description of ``data``.

     ``data`` is a buffer when ``isdata`` is true (only its first 512 items
     are inspected) and a file name otherwise. Failures are reported as an
     error string instead of being raised.
     """
     try:
         if isdata:
             detected = magic.detect_from_content(data[0:512])
         else:
             detected = magic.detect_from_filename(data)
         return detected.name
     except NameError:
         return 'Error - file-magic library required.'
     except Exception as e:
         return 'Error getting magic type - %s' % e
Exemplo n.º 15
0
def guessWithMagic(content):
    """Return the magic detection result for ``content`` as a mapping.

    ``_asdict()`` is preferred because named-tuple results do not expose
    ``__dict__`` on current Python 3 versions; ``__dict__`` is kept as a
    fallback for result objects that are not named tuples.
    """
    result = magic.detect_from_content(content)
    as_dict = getattr(result, '_asdict', None)
    if as_dict is not None:
        return as_dict()
    return result.__dict__


# #worth to look into
# #https://bitbucket.org/Telofy/utilofies/csvprofiler/0d8cdc3ae5a0a08e7fb5906d96f0d8e2284751d1/utilofies/bslib.py?at=master#cl-15
# def intelligent_decode(fname):
#     """ One problem remains in the latest version of UnicodeDammit, namely
#         that pages that have beautifully declared encodings but contain one
#         small erroneous byte sequence somewhere will fail to be decoded with
#         the mostly correct encodings, while Windows-1252 somehow succeeds, but
#         completely mucks up all umlauts and ligatures. Hence I want to remove
#         Windows-1252 from the potential encodings.
#
#         I don't fall back on cchardet just yet.
#     """
#     detector = bs4.dammit.EncodingDetector(fname)
#     # Fall back on forcing it to UTF-8 only if no other encodings
#     # could be found. (I use override_encodings for the HTTP encoding,
#     # which seems at least less reliable to me than the declared encoding.)
#     potential_encodings = \
#         filter(bool, [detector.sniffed_encoding, detector.declared_encoding]
#                + list(detector.override_encodings)) \
#         or ['utf-8']
#     contains_replacement_characters = False
#     tried_encodings = []
#     unicode_markup = None
#     original_encoding = None
#     for encoding in potential_encodings:
#         tried_encodings.append(encoding)
#         try:
#             unicode_markup = detector.markup.decode(encoding)
#         except Exception as excp:
#             #logger.info('Unsuccessfully tried encoding %s: %r', encoding, excp)
#             print 'Unsuccessfully tried encoding %s: %r', encoding, excp
#         if unicode_markup is not None:
#             original_encoding = encoding
#             break
#     if unicode_markup is None:
#         # Whatever!
#         unicode_markup = detector.markup.decode(
#             potential_encodings[0], 'replace')
#         original_encoding = potential_encodings[0]
#         contains_replacement_characters = True
#     return type(b'MockDammit', (object,), {
#         'contains_replacement_characters': contains_replacement_characters,
#         'original_encoding': original_encoding,
#         'detector': detector,
#         'is_html': detector.is_html,
#         'markup': detector.markup,
#         'tried_encodings': tried_encodings,
#         'unicode_markup': unicode_markup})
Exemplo n.º 16
0
    def _index_content(self, fileid, path_to_file, mimetype, writer):
        """Index one file.

        The file is read and passed, together with its magic detection
        result, to the extractor registered for ``mimetype``. When no
        extractor is registered, a placeholder message is indexed instead.
        """
        if mimetype in EXTRACTORS:
            with open(path_to_file, 'rb') as f:
                document_bytes = f.read()
            file_magic = detect_from_content(document_bytes)
            extract = EXTRACTORS[mimetype]
            content = extract(path_to_file, document_bytes, file_magic)
        else:
            content = "Missing extractor for {}".format(mimetype)
        writer.add_document(fileid=fileid, content=content)
Exemplo n.º 17
0
def guessWithMagic(content):
    """Detect ``content`` with magic and return the result as a mapping.

    Named-tuple results are converted with ``_asdict()``, since they have no
    ``__dict__`` on current Python 3 versions; other result objects fall back
    to ``__dict__``.
    """
    result = magic.detect_from_content(content)
    as_dict = getattr(result, '_asdict', None)
    if as_dict is not None:
        return as_dict()
    return result.__dict__


# #worth to look into
# #https://bitbucket.org/Telofy/utilofies/csvprofiler/0d8cdc3ae5a0a08e7fb5906d96f0d8e2284751d1/utilofies/bslib.py?at=master#cl-15
# def intelligent_decode(fname):
#     """ One problem remains in the latest version of UnicodeDammit, namely
#         that pages that have beautifully declared encodings but contain one
#         small erroneous byte sequence somewhere will fail to be decoded with
#         the mostly correct encodings, while Windows-1252 somehow succeeds, but
#         completely mucks up all umlauts and ligatures. Hence I want to remove
#         Windows-1252 from the potential encodings.
#
#         I don't fall back on cchardet just yet.
#     """
#     detector = bs4.dammit.EncodingDetector(fname)
#     # Fall back on forcing it to UTF-8 only if no other encodings
#     # could be found. (I use override_encodings for the HTTP encoding,
#     # which seems at least less reliable to me than the declared encoding.)
#     potential_encodings = \
#         filter(bool, [detector.sniffed_encoding, detector.declared_encoding]
#                + list(detector.override_encodings)) \
#         or ['utf-8']
#     contains_replacement_characters = False
#     tried_encodings = []
#     unicode_markup = None
#     original_encoding = None
#     for encoding in potential_encodings:
#         tried_encodings.append(encoding)
#         try:
#             unicode_markup = detector.markup.decode(encoding)
#         except Exception as excp:
#             #logger.info('Unsuccessfully tried encoding %s: %r', encoding, excp)
#             print 'Unsuccessfully tried encoding %s: %r', encoding, excp
#         if unicode_markup is not None:
#             original_encoding = encoding
#             break
#     if unicode_markup is None:
#         # Whatever!
#         unicode_markup = detector.markup.decode(
#             potential_encodings[0], 'replace')
#         original_encoding = potential_encodings[0]
#         contains_replacement_characters = True
#     return type(b'MockDammit', (object,), {
#         'contains_replacement_characters': contains_replacement_characters,
#         'original_encoding': original_encoding,
#         'detector': detector,
#         'is_html': detector.is_html,
#         'markup': detector.markup,
#         'tried_encodings': tried_encodings,
#         'unicode_markup': unicode_markup})
Exemplo n.º 18
0
 def get_file_mime(fobj) -> str:
     """
     Detect the MIME type from the first 1024 bytes of ``fobj``.

     The stream position is restored afterwards, even when detection fails.

     :param fobj:
        - new/replace - django.core.files.uploadedfile.InMemoryUploadedFile (for newly created)
        - edit (w/o changing file itself) - django.core.files.base.File
     :return: file mimetype
     :note: https://stackoverflow.com/questions/4853581/django-get-uploaded-file-type-mimetype
     """
     pos = fobj.tell()
     try:
         fobj.seek(0)
         return magic.detect_from_content(fobj.read(1024)).mime_type
     finally:
         # Never leave the caller's stream at a different offset.
         fobj.seek(pos)
Exemplo n.º 19
0
Arquivo: spec.py Projeto: cerha/pytis
    def constraint(value):
        """Return ``None`` if ``value`` has an allowed MIME type, else a message."""
        # Hack for temporary compatibility with both python 'magic' modules...
        if not hasattr(magic, 'detect_from_content'):
            mime_type = magic.from_buffer(str(value), mime=True)
        else:
            mime_type = magic.detect_from_content(str(value)).mime_type

        if mime_type not in allowed_mime_types:
            return _("Detected data type %(detected)s. Expected %(expected)s.",
                     detected=mime_type,
                     expected=', '.join(allowed_mime_types))
        return None
Exemplo n.º 20
0
    def get_mime_type(data: bytes) -> typing.Union[typing.Tuple[str, str], typing.Tuple[None, None]]:
        """Identify binary data with **magic**.

        Args:
            data: Binary data.

        Returns:
            typing.Tuple[str, str]: The textual description and the mime-type, in that order.
                                    E.g. *"ELF 64-bit LSB shared object, x86-64, version 1 (SYSV)", "application/x-sharedlib"*.
                                    *None, None* when **magic** is not available.
        """
        if magic is not None:
            detected = magic.detect_from_content(data)
            return detected.name, detected.mime_type

        return None, None
Exemplo n.º 21
0
def get_image_metadata_from_file(file_like):
    """
    Inspect the first 1024 bytes of an image file and return a 2-tuple:
        [0]: Image format looked up in MIME_TYPE_TO_PIL_IDENTIFIER
        [1]: The detected MIME type (i.e. 'image/jpeg')
    The file is rewound to position 0 before the lookup.
    """
    if not hasattr(magic, 'from_buffer'):
        detected = magic.detect_from_content(file_like.read(1024))
        mime_type = detected.mime_type
    else:
        mime_type = magic.from_buffer(file_like.read(1024), mime=True)

    file_like.seek(0)
    return MIME_TYPE_TO_PIL_IDENTIFIER[mime_type], mime_type
Exemplo n.º 22
0
    def _analyze_file(self, path_to_file):
        """Return the MD5 hash, MIME type and base name of a file.

        Files that magic reports as generic ``application/octet-stream`` with
        the description ``Microsoft OOXML`` are refined to the specific Office
        MIME type based on a ``.pptx``, ``.docx`` or ``.xlsx`` extension.

        Returns:
            dict with keys ``md5hash``, ``mimetype`` and ``text`` (the file's
            base name).
        """
        # Context manager so the handle is closed deterministically.
        with open(path_to_file, 'rb') as fobj:
            content = fobj.read()
        md5hash = hashlib.md5(content).hexdigest()
        file_magic = detect_from_content(content)
        filetype = file_magic.mime_type
        orig_name = os.path.basename(path_to_file)
        _, orig_ext = os.path.splitext(orig_name)
        if filetype == 'application/octet-stream' and file_magic.name == 'Microsoft OOXML':
            ooxml_types = {
                '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
                '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            }
            # Unknown extensions keep the generic type.
            filetype = ooxml_types.get(orig_ext, filetype)

        return {'md5hash': md5hash, 'mimetype': filetype, 'text': orig_name}
Exemplo n.º 23
0
    def to_python(self, data):
        """Validate the size and detected content type of an uploaded file.

        Returns ``None`` when the parent conversion yields ``None``; otherwise
        returns the file, rewound to position 0, after both checks pass.
        Raises ``forms.ValidationError`` with code ``size`` or
        ``content_type`` when a check fails.
        """
        f = super().to_python(data)
        if f is None:
            return None

        too_big = f.size > self.max_upload_size
        if too_big:
            raise forms.ValidationError(_('File is too big.'), code='size')

        # Only the first 1024 bytes are used for type detection.
        detected = magic.detect_from_content(f.read(1024))
        if detected.mime_type not in self.content_types:
            raise forms.ValidationError(
                _('Filetype not supported.'), code='content_type'
            )

        f.seek(0)
        return f
Exemplo n.º 24
0
def get_magic_content_type(input):  # pylint: disable=redefined-builtin
    """Get the content type of *input* using the magic library.

    *input* may be a buffer or a file-like object; file-like objects are
    rewound (when seekable) and read completely. As libmagic bindings are
    provided via several 'magic' packages, they are tried in order. Returns
    ``None`` when no binding is available or nothing is detected.
    """
    if magic is None:
        return None

    if hasattr(input, 'seek'):
        input.seek(0)
    if hasattr(input, 'read'):
        input = input.read()

    if hasattr(magic, 'detect_from_content'):
        result = magic.detect_from_content(input)  # pylint: disable=no-member
        return result.mime_type if result else None
    if hasattr(magic, 'from_buffer'):
        return magic.from_buffer(input, mime=True)
    return None
Exemplo n.º 25
0
def get_file_kernel_ver(kpath):
    """Return the version of the kernel image at ``kpath``.

    The magic description of the file is split on commas; the first word of
    each field becomes a lower-cased key and the second word (if any) its
    value. The value stored under ``version`` is returned.

    Raises:
        RuntimeError: if the description contains no ``version`` field.
    """
    kpath = os.path.abspath(os.path.expanduser(kpath))
    _kinfo = {}
    with open(kpath, 'rb') as f:
        _m = magic.detect_from_content(f.read())
    for i in _m.name.split(','):
        l = i.strip().split()
        if not l:
            # Empty field (e.g. ", ," or a trailing comma): nothing to record.
            continue
        # Note: this only grabs the version number.
        # If we want to get e.g. the build user/machine, date, etc.,
        # then we need to join l[1:].
        # We technically don't even need a dict, either. We can just iterate.
        # TODO.
        _kinfo[l[0].lower()] = (l[1] if len(l) > 1 else None)
    if 'version' not in _kinfo:
        raise RuntimeError(
            'Cannot deterimine the version of {0}'.format(kpath))
    return _kinfo['version']
Exemplo n.º 26
0
def _file_magic(file: File) -> FileMagic:
    """Return the file magic named tuple for bytes, a path or a file object.

    Raises ``FileNotFoundError`` for a path that is not an existing file and
    ``TypeError`` for unsupported argument types.
    """

    if isinstance(file, bytes):
        return detect_from_content(file[:1024])  # Fix issue #350.

    # Strings are treated as file system paths.
    if isinstance(file, str):
        file = Path(file)

    if isinstance(file, Path):
        if not file.is_file():
            raise FileNotFoundError(str(file))

        return detect_from_filename(str(file))

    if isinstance(file, FILE_LIKE_OBJECTS):
        return detect_from_fobj(file)

    raise TypeError(f'Cannot read MIME type from {type(file)}.')
Exemplo n.º 27
0
def extract_images(child, namespaces, start=0):
    """Extract draw:image with binary-data and replace by href.

    Each ``draw:image`` found from ``child`` has its embedded
    ``office:binary-data`` decoded from base64, removed from the element and
    replaced by an ``xlink:href`` pointing at ``Pictures/image<N><ext>``,
    where ``N`` counts up from ``start``.

    Returns:
        list of ``(name, data, mime_type)`` tuples, one per image.
    """
    import magic
    images = []
    for i, image in enumerate(
            child.xpath('//draw:image', namespaces=namespaces), start):
        binary_data, = image.xpath('./office:binary-data',
                                   namespaces=namespaces)
        data = base64.b64decode(binary_data.text)
        if hasattr(magic, 'from_buffer'):
            mime_type = magic.from_buffer(data, mime=True)
        else:
            # Not python-magic but file-magic
            mime_type = magic.detect_from_content(data).mime_type
        # guess_extension() returns None for unknown types; fall back to no
        # extension rather than embedding the text "None" in the name.
        extension = mimetypes.guess_extension(mime_type) or ''
        name = 'Pictures/image%s%s' % (i, extension)
        image.remove(binary_data)
        xlink_ns = namespaces['xlink']
        image.attrib['{%s}href' % xlink_ns] = name
        images.append((name, data, mime_type))
    return images
Exemplo n.º 28
0
def get_magics(path):
    """Return the ``(format, encoding)`` detected for the file at ``path``.

    The magic library is not really good at detecting text-based formats
    like CSV, JSON, YAML or XML, so it is only used to detect binary formats
    and a UTF-8 byte order mark.

    Both file-magic and python-magic are supported, as both are shipped under
    the same module name in various distributions. ``(None, None)`` is
    returned when ``MAGIC`` is falsy.

    """
    if not MAGIC:
        return None, None

    with open(path, 'rb') as file:
        header = file.read(2048)

    if hasattr(magic, 'detect_from_content'):  # file-magic
        res = magic.detect_from_content(header)
        mime_type, magic_name = res.mime_type, res.name
    else:  # python-magic
        mime_type = magic.from_buffer(header, mime=True)
        magic_name = magic.from_buffer(header)

    mime_to_format = {
        'application/pgp': 'gpg',
        'application/x-sqlite3': 'sqlite3'
    }
    frmt = mime_to_format.get(mime_type)

    # A marker found in the description overrides the MIME-based format;
    # later markers win over earlier ones.
    markers = (('KDBX', 'kdbx'), ('openssl', 'openssl'), ('PGP', 'gpg'))
    for marker, plain_format in markers:
        if marker in magic_name:
            frmt = plain_format

    has_bom = 'UTF-8 Unicode (with BOM)' in magic_name
    encoding = 'utf-8-sig' if has_bom else None

    return frmt, encoding
Exemplo n.º 29
0
def extract_dir(data, start, outdir, file_number=65536, dir_number=65536):
    """Unpack directory entries from ``data`` into ``outdir``.

    Entries are read from offset ``start`` until both counters reach zero,
    the data is exhausted, or an entry name containing a NUL byte is found.
    Each entry is a header (``header_fmt``), an ASCII name whose length is
    header field 0, and a payload described by header field 3. A payload that
    magic identifies as ``application/zlib`` is a file of that many bytes,
    written decompressed. Anything else is a sub-directory whose file and
    directory counts are the high and low 16 bits of field 3.

    Returns:
        Number of bytes consumed, counted from ``start``.
    """
    offset = start
    while (file_number > 0 or dir_number > 0) and offset < len(data):
        header = struct.unpack(header_fmt, data[offset:offset + header_len])
        offset += header_len

        name_len = header[0]
        raw_name = data[offset:offset + name_len]
        if b'\x00' in raw_name:
            # this means end too
            break
        filename = raw_name.decode("ascii")
        offset += name_len

        info = header[3]
        payload = data[offset:offset + info]
        is_file = magic.detect_from_content(
            payload).mime_type == "application/zlib"
        if is_file:
            if file_number == 0:
                raise Exception("invalid file number for directory \"%s\"" %
                                (outdir))
            offset += info
            with open(os.path.join(outdir, filename), "wb") as f:
                f.write(zlib.decompress(payload))
            file_number -= 1
        else:
            if dir_number == 0:
                raise Exception(
                    "invalid directory number for directory \"%s\"" % (outdir))
            sub_files = info // 0x10000
            sub_dirs = info & 0xffff
            n_path = os.path.join(outdir, filename)
            os.mkdir(n_path)
            offset += extract_dir(data, offset, n_path, sub_files, sub_dirs)
            dir_number -= 1

    return offset - start
Exemplo n.º 30
0
def get_file_type(filename_or_file):
    """
    Get mime_type, encoding and description of `filename_or_file`.

    Handles both magic libraries. An open file is read from its current
    position and then returned to that position; a file name is opened in
    binary mode and read completely.

    :param filename_or_file: filename or open file
    :type filename_or_file: str or file
    :return: mime_type, encoding and description of `filename_or_file`
    :rtype: FileType
    :raises ValueError: if the argument is neither seekable nor a string
    :raises RuntimeError: if the imported "magic" module is not recognised
    """
    is_stream = hasattr(filename_or_file, 'seek')
    if not is_stream and not isinstance(filename_or_file, string_types):
        raise ValueError(
            'Argument "filename_or_file" has unknown type {!r}.'.format(
                type(filename_or_file)))

    if is_stream:
        old_pos = filename_or_file.tell()
        txt = filename_or_file.read()
        filename_or_file.seek(old_pos)
    else:
        with open(filename_or_file, 'rb') as fp:
            txt = fp.read()

    if hasattr(magic, 'from_file'):
        # python-magic reports "<mime type>; charset=<encoding>".
        combined = magic.Magic(mime=True, mime_encoding=True).from_buffer(txt)
        mime_type, charset = combined.split(';')
        encoding = charset.rpartition('=')[2]
        text = magic.Magic().from_buffer(txt)
    elif hasattr(magic, 'detect_from_filename'):
        fm = magic.detect_from_content(txt)
        mime_type, encoding, text = fm.mime_type, fm.encoding, fm.name
    else:
        raise RuntimeError('Unknown version or type of "magic" library.')

    # auto detect utf-8 with BOM
    if encoding == 'utf-8' and txt.startswith(codecs.BOM_UTF8):
        encoding = 'utf-8-sig'
    return FileType(mime_type, encoding, text)
Exemplo n.º 31
0
 def decompressor(self):
     """Decompress and/or decode ``self.data`` in place.

     With magic available, the detected MIME type selects a module name from
     ``cmprsn_map``; if one is set, that module's ``decompress`` is applied
     and the result stored in ``self.raw``. Afterwards ``self.data`` is
     decoded as UTF-8 into ``self.raw`` when possible, and ``self.data`` is
     replaced by ``self.raw``.

     Returns:
         An empty tuple.
     """
     # TODO: use mime module as fallback?
     # https://docs.python.org/3/library/mimetypes.html
     # VERY less-than-ideal since it won't work without self.args['logfile']
     # (and has iffy detection at best, since it relies on file extensions).
     # Determine what decompressor to use, if we need to.
     if has_magic:
         detected = magic.detect_from_content(self.data)
         self.decompress = cmprsn_map[detected.mime_type]
         if self.decompress:
             from importlib import import_module
             self.raw = import_module(self.decompress).decompress(self.data)
     # Without magic, assume that it's text and that it isn't compressed.
     # NOTE(review): the decode below also runs after a successful
     # decompression and overwrites self.raw whenever the original bytes are
     # valid UTF-8 -- confirm this is intended.
     try:
         self.raw = self.data.decode('utf-8')
     except UnicodeDecodeError:
         pass
     # NOTE(review): presumably self.raw is initialised elsewhere for the
     # case where neither branch above assigns it -- verify.
     self.data = self.raw
     return ()
Exemplo n.º 32
0
 def detect_ext(self, data):
     """Guess a file extension for ``data`` from its magic type and content.

     Text with an ``unknown-8bit`` encoding is ``.bin``. Other text is decoded
     with the detected encoding and classified as ``.ks``, ``.tjs`` or
     ``.txt`` by marker substrings. Non-text types use
     ``mimetypes.guess_extension`` with a few extensions renamed.
     """
     import magic
     fmagic = magic.detect_from_content(data)
     if not fmagic.mime_type.startswith('text/'):
         ext = mimetypes.guess_extension(fmagic.mime_type)
         renamed = {'.jpeg': '.jpg', '.oga': '.ogg', '.asf': '.wmv'}
         return renamed.get(ext, ext)

     if fmagic.encoding == 'unknown-8bit':
         return '.bin'

     text = data.decode(fmagic.encoding)
     if any(marker in text for marker in ('@return', '*start', '.ks', '[w]')):
         return '.ks'
     if any(marker in text for marker in ('.tjs', '%[', '];')):
         return '.tjs'
     return '.txt'
Exemplo n.º 33
0
def extract_dir(data, start, outdir, file_number=65536, dir_number=65536):
    """Extract the entries stored in ``data`` from ``start`` into ``outdir``.

    The loop stops when both counters are exhausted, when ``data`` ends, or
    when an entry name contains a NUL byte. For every entry, header field 0
    is the length of the ASCII name and header field 3 describes the payload.
    If magic reports the payload as ``application/zlib`` it is a compressed
    file of that length; otherwise the entry is a directory and field 3 packs
    its file count (upper bits) and directory count (lower 16 bits).

    Returns:
        How many bytes were consumed starting at ``start``.
    """
    cursor = start
    while (file_number > 0 or dir_number > 0) and cursor < len(data):
        header = struct.unpack(header_fmt, data[cursor:cursor+header_len])
        cursor += header_len

        name_size = header[0]
        name_bytes = data[cursor:cursor+name_size]
        if b'\x00' in name_bytes:
            # this means end too
            break
        filename = name_bytes.decode("ascii")
        cursor += name_size

        field = header[3]
        blob = data[cursor:cursor+field]
        if magic.detect_from_content(blob).mime_type == "application/zlib":
            # file
            if file_number == 0:
                raise Exception("invalid file number for directory \"%s\"" % (outdir))
            cursor += field
            with open(os.path.join(outdir, filename), "wb") as f:
                f.write(zlib.decompress(blob))
            file_number -= 1
        else:
            # dir
            if dir_number == 0:
                raise Exception("invalid directory number for directory \"%s\"" % (outdir))
            child_files = field // 0x10000
            child_dirs = field & 0xffff
            n_path = os.path.join(outdir, filename)
            os.mkdir(n_path)
            cursor += extract_dir(data, cursor, n_path, child_files, child_dirs)
            dir_number -= 1

    return cursor - start
Exemplo n.º 34
0
Arquivo: tests.py Projeto: 0mp/freebsd
 def test_detect_from_content(self):
     """Check detection from the first 4096 bytes of the sample file."""
     # Read in binary mode: detect_from_content operates on raw bytes, and a
     # text-mode read would decode (or fail to decode) the sample first.
     with open(self.filename, 'rb') as fobj:
         result = magic.detect_from_content(fobj.read(4096))
     self.assert_result(result)
Exemplo n.º 35
0
def get_content_mimetype(content: bytes) -> str:
    """Return the MIME type detected for ``content``.

    The detected type is replaced by its entry in ``MIME_OVERRIDES`` when one
    exists.
    """
    detected = magic.detect_from_content(content)
    mime = detected.mime_type
    return MIME_OVERRIDES.get(mime, mime)