Пример #1
0
def create_dataset_series(path: str, preprocess: Callable[[str],
                                                          str]) -> Iterable:
    """Create a dataset series by lazily reading a data file.

    The MIME type reported by libmagic decides how the file is read:

    * ``text/*`` -- read with ``PlainTextFileReader``; every item is passed
      through ``preprocess``.
    * ``application/gzip`` / ``application/x-gzip`` -- read with
      ``GZipReader``; every item is passed through ``preprocess``.
    * ``application/octet-stream`` -- loaded with ``np.load``; the items
      obtained by iterating over the loaded object are yielded unchanged
      (``preprocess`` is not applied).

    This function is a generator: nothing, including the log message and
    the unsupported-type check, runs before the first item is requested.

    Arguments:
        path: The path of the file with the data
        preprocess: Preprocessor function

    Returns:
        The dataset series.

    Raises:
        Exception: If the detected MIME type is not supported.
    """
    # Build a MIME-mode detector whose cookie is re-opened with the
    # MAGIC_SYMLINK flag added to the detector's own flags.
    filetyper = magic.Magic(mime=True)
    filetyper.flags |= magic.MAGIC_SYMLINK
    filetyper.cookie = magic.magic_open(filetyper.flags)
    magic.magic_load(filetyper.cookie, None)

    log("Loading {}".format(path))
    file_type = filetyper.from_file(path)

    if file_type.startswith('text/'):
        reader = PlainTextFileReader(path)
        for line in reader.read():
            yield preprocess(line)
    elif file_type in ('application/gzip', 'application/x-gzip'):
        gzreader = GZipReader(path)
        for line in gzreader.read():
            yield preprocess(line)
    elif file_type == 'application/octet-stream':
        # A ``return value`` inside a generator is never seen by code that
        # iterates over it, so the loaded data must be yielded instead.
        yield from np.load(path)
    else:
        raise Exception("Unsupported data type: {}, file {}".format(
            file_type, path))
Пример #2
0
    def _load_magic_file(self):
        """Select the magic database path and (re)open both libmagic handles.

        ``self.magic_file`` is a colon-separated pair: the bundled rule file
        (``constants.MAGIC_RULE_PATH``) followed by the system database.
        When ``self.use_cache`` is set, a custom rule file is fetched from
        the 'system' cache store and, if the download succeeds, takes the
        place of the bundled rule file.

        Under ``self.lock`` two handles are then opened and loaded with that
        path: ``self.file_type`` and ``self.mime_type`` (the latter with
        MAGIC_MIME added to the flags).
        """
        system_db = '/usr/share/file/magic.mgc'
        self.magic_file = ':'.join((constants.MAGIC_RULE_PATH, system_db))

        if self.use_cache:
            self.log.info("Checking for custom magic file...")
            store = get_cachestore('system',
                                   config=self.config,
                                   datastore=self.datastore)
            with store as cache:
                custom_magic = "/tmp/custom.magic"
                try:
                    cache.download('custom_magic', custom_magic)
                    self.magic_file = ':'.join((custom_magic, system_db))
                    self.log.info("Custom magic file loaded!")
                except FileStoreException:
                    # No custom file in the cache: keep the bundled rules.
                    self.log.info("No custom magic file found.")

        with self.lock:
            description_flags = magic.MAGIC_CONTINUE + magic.MAGIC_RAW
            self.file_type = magic.magic_open(description_flags)
            magic.magic_load(self.file_type, self.magic_file)

            mime_flags = description_flags + magic.MAGIC_MIME
            self.mime_type = magic.magic_open(mime_flags)
            magic.magic_load(self.mime_type, self.magic_file)
Пример #3
0
def get_file_mime_type(name):
    """Return what libmagic reports for the file at path *name*.

    The detector's cookie is opened with the MAGIC_MIME and
    MAGIC_MIME_ENCODING flags and loaded with ``None`` as the database
    argument.
    """
    detector = magic.Magic()
    mime_flags = (magic.MAGIC_NONE
                  | magic.MAGIC_MIME
                  | magic.MAGIC_MIME_ENCODING)
    # Swap in a cookie opened with the MIME flags above.
    detector.cookie = magic.magic_open(mime_flags)
    magic.magic_load(detector.cookie, None)
    return detector.from_file(name)
Пример #4
0
def get_mime_type(content):
    """Return libmagic's MIME description of *content*, split on ``'; '``.

    Two flavours of the ``magic`` module are supported: one exposing
    ``magic.open`` and one exposing ``magic.Magic`` / ``magic.magic_open``.
    The description string is split at most once on ``'; '``, so the result
    is a list of one or two strings.
    """
    if not hasattr(magic, "open"):
        # No ``magic.open``: drive a ``Magic`` object with a cookie opened
        # with the MIME and MIME-encoding flags.
        detector = magic.Magic()
        mime_flags = (magic.MAGIC_NONE
                      | magic.MAGIC_MIME
                      | magic.MAGIC_MIME_ENCODING)
        detector.cookie = magic.magic_open(mime_flags)
        magic.magic_load(detector.cookie, None)
        description = detector.from_buffer(content)
    else:
        detector = magic.open(magic.MAGIC_MIME)
        detector.load()
        description = detector.buffer(content)

    return description.split('; ', 1)
Пример #5
0
def get_cleaned_text_file_content(uploaded_file):
    """Read an uploaded text file and return it normalised to UTF-8.

    The upload is read in full, its MIME type and charset are detected with
    libmagic, the bytes are decoded using the detected charset, line endings
    are converted to Unix style and the result is encoded as UTF-8.

    Arguments:
        uploaded_file: The uploaded file object; it must provide ``size``
            and ``chunks()``. A falsy value means "nothing uploaded".

    Returns:
        The cleaned content encoded as UTF-8, or ``u""`` when
        ``uploaded_file`` is falsy.

    Raises:
        django.core.exceptions.ValidationError: If the file is larger than
            10 * 1000 * 1000 bytes, is not detected as text, has no
            detectable charset, or cannot be decoded with that charset.
    """

    if not uploaded_file:
        return u""

    if uploaded_file.size and uploaded_file.size > 10 * 1000 * 1000:
        raise ValidationError("Text file too large (size %s)." %
                              uploaded_file.size)

    # Upload chunks are byte strings, so they are joined with a bytes
    # separator; a text separator fails where bytes and text are distinct.
    content = b"".join(uploaded_file.chunks())

    # Detect MIME type and charset; two flavours of the magic module exist.
    import magic
    if hasattr(magic, "open"):
        m = magic.open(magic.MAGIC_MIME)
        m.load()
        filetype = m.buffer(content)
    else:
        m = magic.Magic()
        m.cookie = magic.magic_open(magic.MAGIC_NONE | magic.MAGIC_MIME
                                    | magic.MAGIC_MIME_ENCODING)
        magic.magic_load(m.cookie, None)
        filetype = m.from_buffer(content)

    if not filetype.startswith("text"):
        raise ValidationError(
            "Uploaded file does not appear to be a text file.")

    # Raw string: ``\w`` must reach the regex engine as written.
    match = re.search(r"charset=([\w-]+)", filetype)
    if not match:
        raise ValidationError("File has unknown encoding.")

    encoding = match.group(1)
    # Always decode (ASCII included) so the line-ending replacement below
    # operates on text. LookupError covers a charset name Python does not
    # know; UnicodeError covers bytes that are invalid in that charset.
    try:
        content = content.decode(encoding)
    except (LookupError, UnicodeError) as e:
        raise ValidationError(
            "Error decoding file (%s). Try submitting with UTF-8 encoding or remove non-ASCII characters."
            % str(e))

    # turn line-endings into Unix style
    content = content.replace("\r\n", "\n").replace("\r", "\n")

    return content.encode("utf-8")
Пример #6
0
 def parse_file_charset(self):
     """Record an error unless the start of the upload is detected as ASCII.

     The first 4096 bytes of ``self.fd.file`` are handed to libmagic; if
     the resulting description does not contain ``'ascii'`` an error is
     added to ``self.parsed_info``.
     """
     import magic
     self.fd.file.seek(0)
     head = self.fd.file.read(4096)
     if not hasattr(magic, "open"):
         # No ``magic.open``: use a ``Magic`` object with a cookie opened
         # with the MIME and MIME-encoding flags.
         detector = magic.Magic()
         mime_flags = (magic.MAGIC_NONE
                       | magic.MAGIC_MIME
                       | magic.MAGIC_MIME_ENCODING)
         detector.cookie = magic.magic_open(mime_flags)
         magic.magic_load(detector.cookie, None)
         description = detector.from_buffer(head)
     else:
         detector = magic.open(magic.MAGIC_MIME)
         detector.load()
         description = detector.buffer(head)
     if 'ascii' not in description:
         self.parsed_info.add_error('A plain text ASCII document must be submitted.')
Пример #7
0
def put_identify_custom_magic_file(**_):
    """
    Save a new version of identify's custom LibMagic file

    Variables:
    None

    Arguments:
    None

    Data Block:
    <current custom.magic file>

    Result example:
    {"success": True}
    """
    data = request.json.encode('utf-8')

    magic_file = None
    test = None
    try:
        # Write the submission to a temporary file so libmagic can load it.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            magic_file = tmp.name
            tmp.write(data)

        # Validate the submission by loading it into a throwaway handle.
        try:
            test = magic.magic_open(magic.MAGIC_CONTINUE + magic.MAGIC_RAW)
            magic.magic_load(test, magic_file)
        except magic.MagicException:
            return make_api_response(
                {'success': False},
                "The magic file you have submitted is invalid.", 400)
    finally:
        # Release the throwaway handle (it was previously never closed) and
        # always remove the temporary file, even if closing fails.
        try:
            if test is not None:
                magic.magic_close(test)
        finally:
            if magic_file and os.path.exists(magic_file):
                os.unlink(magic_file)

    with forge.get_cachestore('system', config=config,
                              datastore=STORAGE) as cache:
        # A submission identical to the bundled rule file means "no
        # customisation": drop the cached copy instead of storing it.
        if hashlib.sha256(data).hexdigest() == get_sha256_for_file(
                constants.MAGIC_RULE_PATH):
            cache.delete('custom_magic')
        else:
            cache.save('custom_magic', data, ttl=ADMIN_FILE_TTL, force=True)

    # Notify components watching to reload magic file
    event_sender.send('identify', 'magic')

    return make_api_response({'success': True})
Пример #8
0
def calc_magic(stream):
    """Return libmagic's description of the data behind *stream*.

    If ``get_fd_path(stream)`` yields a truthy path, libmagic inspects that
    path; otherwise the stream is rewound and its full content is inspected
    as an in-memory buffer. The libmagic handle is closed on every exit
    path, including a failure to load the magic database.

    Arguments:
        stream: A stream object; when no path is available it must support
            ``seek`` and ``read``.

    Returns:
        The description produced by libmagic, passed through
        ``magic.maybe_decode``.
    """
    # Missing python-magic features:
    # - magic_descriptor (https://github.com/ahupp/python-magic/pull/227)
    # - direct support for symlink flag
    magic_cookie = magic.magic_open(magic.MAGIC_SYMLINK)
    try:
        # Loading happens inside the ``try`` so a load failure still closes
        # the handle.
        magic.magic_load(magic_cookie, None)
        fd_path = get_fd_path(stream)
        if fd_path:
            return magic.maybe_decode(magic.magic_file(magic_cookie, fd_path))
        # Handle BytesIO in-memory streams
        stream.seek(0, os.SEEK_SET)
        return magic.maybe_decode(
            magic.magic_buffer(magic_cookie, stream.read()))
    finally:
        magic.magic_close(magic_cookie)
Пример #9
0
 def parse_file_charset(self):
     """Record an error unless the start of the upload is detected as ASCII.

     The first 4096 bytes of ``self.fd.file`` are handed to libmagic; if
     the resulting description does not contain ``'ascii'`` an error is
     added to ``self.parsed_info``.
     """
     import magic
     self.fd.file.seek(0)
     head = self.fd.file.read(4096)
     if not hasattr(magic, "open"):
         # No ``magic.open``: use a ``Magic`` object with a cookie opened
         # with the MIME and MIME-encoding flags.
         detector = magic.Magic()
         mime_flags = (magic.MAGIC_NONE
                       | magic.MAGIC_MIME
                       | magic.MAGIC_MIME_ENCODING)
         detector.cookie = magic.magic_open(mime_flags)
         magic.magic_load(detector.cookie, None)
         description = detector.from_buffer(head)
     else:
         detector = magic.open(magic.MAGIC_MIME)
         detector.load()
         description = detector.buffer(head)
     if 'ascii' not in description:
         self.parsed_info.add_error('A plain text document must be submitted.')
Пример #10
0
 def parse_file_charset(self):
     """Record an error unless the upload is detected as ASCII or UTF-8.

     The whole content of ``self.fd.file`` is handed to libmagic; if the
     resulting description contains neither ``'ascii'`` nor ``'utf-8'`` an
     error quoting that description is added to ``self.parsed_info``.
     """
     import magic
     self.fd.file.seek(0)
     payload = self.fd.file.read()
     if not hasattr(magic, "open"):
         # No ``magic.open``: use a ``Magic`` object with a cookie opened
         # with the MIME and MIME-encoding flags.
         detector = magic.Magic()
         mime_flags = (magic.MAGIC_NONE
                       | magic.MAGIC_MIME
                       | magic.MAGIC_MIME_ENCODING)
         detector.cookie = magic.magic_open(mime_flags)
         magic.magic_load(detector.cookie, None)
         description = detector.from_buffer(payload)
     else:
         detector = magic.open(magic.MAGIC_MIME)
         detector.load()
         description = detector.buffer(payload)
     if not ('ascii' in description or 'utf-8' in description):
         message = (
             'A plain text ASCII document is required.  '
             'Found an unexpected encoding: "%s".  '
             'You probably have one or more non-ascii characters in your file.'
         ) % description
         self.parsed_info.add_error(message)
Пример #11
0
def get_cleaned_text_file_content(uploaded_file):
    """Read an uploaded text file and return it normalised to UTF-8.

    The upload is read in full, its MIME type and charset are detected with
    libmagic, the bytes are decoded using the detected charset, line endings
    are converted to Unix style and the result is encoded as UTF-8.

    Arguments:
        uploaded_file: The uploaded file object; it must provide ``size``
            and ``chunks()``. A falsy value means "nothing uploaded".

    Returns:
        The cleaned content encoded as UTF-8, or ``u""`` when
        ``uploaded_file`` is falsy.

    Raises:
        django.forms.ValidationError: If the file is larger than
            10 * 1000 * 1000 bytes, is not detected as text, has no
            detectable charset, or cannot be decoded with that charset.
    """

    if not uploaded_file:
        return u""

    if uploaded_file.size and uploaded_file.size > 10 * 1000 * 1000:
        raise django.forms.ValidationError("Text file too large (size %s)." % uploaded_file.size)

    # Upload chunks are byte strings, so they are joined with a bytes
    # separator; a text separator fails where bytes and text are distinct.
    content = b"".join(uploaded_file.chunks())

    # Detect MIME type and charset; two flavours of the magic module exist.
    import magic
    if hasattr(magic, "open"):
        m = magic.open(magic.MAGIC_MIME)
        m.load()
        filetype = m.buffer(content)
    else:
        m = magic.Magic()
        m.cookie = magic.magic_open(magic.MAGIC_NONE | magic.MAGIC_MIME | magic.MAGIC_MIME_ENCODING)
        magic.magic_load(m.cookie, None)
        filetype = m.from_buffer(content)

    if not filetype.startswith("text"):
        raise django.forms.ValidationError("Uploaded file does not appear to be a text file.")

    # Raw string: ``\w`` must reach the regex engine as written.
    match = re.search(r"charset=([\w-]+)", filetype)
    if not match:
        raise django.forms.ValidationError("File has unknown encoding.")

    encoding = match.group(1)
    # Always decode (ASCII included) so the line-ending replacement below
    # operates on text. LookupError covers a charset name Python does not
    # know; UnicodeError covers bytes that are invalid in that charset.
    try:
        content = content.decode(encoding)
    except (LookupError, UnicodeError) as e:
        raise django.forms.ValidationError("Error decoding file (%s). Try submitting with UTF-8 encoding or remove non-ASCII characters." % str(e))

    # turn line-endings into Unix style
    content = content.replace("\r\n", "\n").replace("\r", "\n")

    return content.encode("utf-8")
Пример #12
0
from typing import List, Any, Callable
import magic

# tests: lint, mypy

# pylint: disable=invalid-name
# Type alias: a reader is a callable that takes a list of strings and may
# return anything (presumably the strings are file paths -- confirm against
# the callers).
Reader = Callable[[List[str]], Any]

# Module-wide file type detector, set up once at import time.
# It is constructed in MIME mode, MAGIC_SYMLINK is OR-ed into its flags, and
# its cookie is replaced by one opened with the updated flags. The database
# argument passed to magic_load is None (presumably selecting libmagic's
# default database -- confirm against the libmagic documentation).
FILETYPER = magic.Magic(mime=True)
FILETYPER.flags |= magic.MAGIC_SYMLINK
FILETYPER.cookie = magic.magic_open(FILETYPER.flags)
magic.magic_load(FILETYPER.cookie, None)
Пример #13
0
# Matches a case-insensitive "custom: " prefix at the start of a string.
custom = re.compile(r'^custom: ', re.IGNORECASE)

# Optional hashing helper; stays None unless ssdeep is imported below.
ssdeep_from_file = None

# libmagic state; all three stay None on Windows, where the setup below is
# skipped.
magic_lock = None
file_type = None
mime_type = None

if platform.system() != 'Windows':
    import magic

    # Lock created alongside the libmagic handles -- presumably meant to
    # serialize access to them; confirm against the code that uses it.
    magic_lock = threading.Lock()

    # Handle opened with MAGIC_CONTINUE + MAGIC_RAW, loaded from
    # constants.RULE_PATH.
    file_type = magic.magic_open(magic.MAGIC_CONTINUE + magic.MAGIC_RAW)
    magic.magic_load(file_type, constants.RULE_PATH)

    # Handle opened with MAGIC_CONTINUE + MAGIC_MIME (no MAGIC_RAW here),
    # loaded from the same rule path.
    mime_type = magic.magic_open(magic.MAGIC_CONTINUE + magic.MAGIC_MIME)
    magic.magic_load(mime_type, constants.RULE_PATH)

    try:
        # noinspection PyUnresolvedReferences
        import ssdeep  # ssdeep requires apt-get cython and pip install ssdeep
        ssdeep_from_file = ssdeep.hash_from_file
    except ImportError:
        pass  # ssdeep_from_file will be None if we fail to import ssdeep.


# Translate the match object into a sub-type label.
def subtype(label):
    for entry in sl_patterns:
Пример #14
0
 def __init__(self, magic_file):
     """Open a libmagic handle and load the given magic database.

     Arguments:
         magic_file: Path of the magic database as a text string; it is
             encoded to UTF-8 before being passed to ``magic_load``.

     The handle is stored in ``self.cookie`` and the thread that created it
     is recorded in ``self.thread`` (presumably so later calls can check
     they run on the same thread -- confirm against the rest of the class).
     """
     flags = (magic.MAGIC_COMPRESS
              | magic.MAGIC_MIME
              | magic.MAGIC_CONTINUE
              | magic.MAGIC_PRESERVE_ATIME
              | magic.MAGIC_ERROR
              | magic.MAGIC_MIME_ENCODING)
     self.cookie = magic.magic_open(flags)
     magic.magic_load(self.cookie, magic_file.encode('utf-8'))
     # ``currentThread`` is a deprecated alias of ``current_thread``.
     self.thread = threading.current_thread()