def _preprocessed_lines(reader, preprocess):
    """Lazily yield each line produced by ``reader.read()`` after preprocessing."""
    for line in reader.read():
        yield preprocess(line)


def create_dataset_series(path: str, preprocess: Callable[[str], str]) -> Iterable:
    """Create dataset series.

    The file type is detected from the file content via libmagic. Files
    whose MIME type starts with ``text/`` and gzip files produce a lazy
    iterator of preprocessed lines. Files detected as
    ``application/octet-stream`` are loaded with ``np.load`` and returned
    as loaded, without preprocessing.

    Type detection (and the ``np.load`` call) happens when this function is
    called; only the line-by-line reading is deferred until iteration.

    Arguments:
        path: The path of the file with the data
        preprocess: Preprocessor function

    Returns:
        The dataset series.

    Raises:
        Exception: If the detected MIME type is not supported.
    """
    filetyper = magic.Magic(mime=True)
    # Add MAGIC_SYMLINK to the flags and open a fresh cookie with the updated
    # flag set, then load the database into that cookie.
    filetyper.flags |= magic.MAGIC_SYMLINK
    filetyper.cookie = magic.magic_open(filetyper.flags)
    magic.magic_load(filetyper.cookie, None)

    log("Loading {}".format(path))
    file_type = filetyper.from_file(path)

    if file_type.startswith('text/'):
        return _preprocessed_lines(PlainTextFileReader(path), preprocess)

    if file_type in ('application/gzip', 'application/x-gzip'):
        return _preprocessed_lines(GZipReader(path), preprocess)

    if file_type == 'application/octet-stream':
        # This function must not contain ``yield`` itself: inside a generator
        # a ``return <value>`` only stops iteration and the loaded array
        # would never reach the caller.
        return np.load(path)

    raise Exception("Unsupported data type: {}, file {}".format(
        file_type, path))
def _load_magic_file(self):
    """Select the magic database path and (re)open the libmagic handles.

    ``self.magic_file`` is set to a colon-joined pair of paths: the rule file
    from ``constants.MAGIC_RULE_PATH`` followed by the system ``magic.mgc``.
    When ``self.use_cache`` is set, a 'custom_magic' entry is downloaded from
    the 'system' cache store to ``/tmp/custom.magic``; if that succeeds the
    downloaded file replaces the first path of the pair.

    Finally, while holding ``self.lock``, ``self.file_type`` and
    ``self.mime_type`` are opened and loaded from ``self.magic_file``.
    """
    system_db = '/usr/share/file/magic.mgc'
    self.magic_file = ':'.join((constants.MAGIC_RULE_PATH, system_db))

    if self.use_cache:
        self.log.info("Checking for custom magic file...")
        with get_cachestore('system', config=self.config,
                            datastore=self.datastore) as cache:
            custom_magic = "/tmp/custom.magic"
            try:
                cache.download('custom_magic', custom_magic)
            except FileStoreException:
                # The cache has no custom file: keep the default pair.
                self.log.info("No custom magic file found.")
            else:
                self.magic_file = ':'.join((custom_magic, system_db))
                self.log.info("Custom magic file loaded!")

    with self.lock:
        type_handle = magic.magic_open(magic.MAGIC_CONTINUE + magic.MAGIC_RAW)
        self.file_type = type_handle
        magic.magic_load(type_handle, self.magic_file)

        mime_handle = magic.magic_open(
            magic.MAGIC_CONTINUE + magic.MAGIC_RAW + magic.MAGIC_MIME)
        self.mime_type = mime_handle
        magic.magic_load(mime_handle, self.magic_file)
def get_file_mime_type(name):
    """Return libmagic's MIME description of the file at ``name``.

    A ``magic.Magic`` wrapper is created and its cookie is replaced with one
    opened using the MIME and MIME-encoding flags, so the value returned by
    ``from_file`` is produced with those flags.
    """
    detector = magic.Magic()
    mime_flags = magic.MAGIC_NONE | magic.MAGIC_MIME | magic.MAGIC_MIME_ENCODING
    detector.cookie = magic.magic_open(mime_flags)
    magic.magic_load(detector.cookie, None)
    return detector.from_file(name)
def get_mime_type(content):
    """Identify ``content`` with libmagic and split the answer on ``'; '``.

    Two different ``magic`` modules are supported: one exposing ``magic.open``
    and one exposing ``magic.Magic`` / ``magic.magic_open``.

    Returns:
        The list produced by splitting libmagic's answer at the first
        ``'; '`` (presumably ``[mime_type, "charset=..."]`` -- confirm against
        the libmagic output format in use).
    """
    if not hasattr(magic, "open"):
        # Module without ``open``: build a Magic wrapper and swap in a cookie
        # opened with the MIME flags.
        detector = magic.Magic()
        detector.cookie = magic.magic_open(
            magic.MAGIC_NONE | magic.MAGIC_MIME | magic.MAGIC_MIME_ENCODING)
        magic.magic_load(detector.cookie, None)
        filetype = detector.from_buffer(content)
    else:
        handle = magic.open(magic.MAGIC_MIME)
        handle.load()
        filetype = handle.buffer(content)

    return filetype.split('; ', 1)
def get_cleaned_text_file_content(uploaded_file):
    """Read uploaded file, try to fix up encoding to UTF-8 and
    transform line endings into Unix style, then return the content as
    a UTF-8 string. Errors are reported as
    django.core.exceptions.ValidationError exceptions.

    An empty/missing upload yields an empty string. Uploads whose reported
    size exceeds 10 * 1000 * 1000 bytes are rejected, as are uploads that
    libmagic does not describe as text or for which no charset is reported.
    """
    if not uploaded_file:
        return u""

    if uploaded_file.size and uploaded_file.size > 10 * 1000 * 1000:
        raise ValidationError("Text file too large (size %s)." % uploaded_file.size)

    # Join the chunks as bytes so this works on both Python 2 and Python 3.
    content = b"".join(uploaded_file.chunks())

    # try to fixup encoding
    import magic
    if hasattr(magic, "open"):
        m = magic.open(magic.MAGIC_MIME)
        m.load()
        filetype = m.buffer(content)
    else:
        m = magic.Magic()
        m.cookie = magic.magic_open(magic.MAGIC_NONE | magic.MAGIC_MIME | magic.MAGIC_MIME_ENCODING)
        magic.magic_load(m.cookie, None)
        filetype = m.from_buffer(content)

    if not filetype.startswith("text"):
        raise ValidationError(
            "Uploaded file does not appear to be a text file.")

    match = re.search(r"charset=([\w-]+)", filetype)
    if not match:
        raise ValidationError("File has unknown encoding.")

    encoding = match.group(1)
    # Always decode to text: previously ASCII content was left as a byte
    # string, which breaks the str-based replace() calls below on Python 3.
    codec = "ascii" if "ascii" in encoding else encoding
    try:
        content = content.decode(codec)
    except Exception as e:
        # Any decoding problem (unknown codec, invalid bytes) is reported
        # to the submitter rather than surfacing as a server error.
        raise ValidationError(
            "Error decoding file (%s). Try submitting with UTF-8 encoding or remove non-ASCII characters." % str(e))

    # turn line-endings into Unix style
    content = content.replace("\r\n", "\n").replace("\r", "\n")

    return content.encode("utf-8")
def parse_file_charset(self):
    """Check that the first 4096 bytes of the submitted file look like ASCII.

    The sample is identified with libmagic (either flavour of the ``magic``
    module). If the answer does not contain ``'ascii'``, an error is added to
    ``self.parsed_info``.
    """
    import magic

    self.fd.file.seek(0)
    sample = self.fd.file.read(4096)

    if not hasattr(magic, "open"):
        detector = magic.Magic()
        detector.cookie = magic.magic_open(
            magic.MAGIC_NONE | magic.MAGIC_MIME | magic.MAGIC_MIME_ENCODING)
        magic.magic_load(detector.cookie, None)
        filetype = detector.from_buffer(sample)
    else:
        handle = magic.open(magic.MAGIC_MIME)
        handle.load()
        filetype = handle.buffer(sample)

    if 'ascii' in filetype:
        return
    self.parsed_info.add_error('A plain text ASCII document must be submitted.')
def put_identify_custom_magic_file(**_):
    """
    Save a new version of identify's custom LibMagic file

    Variables:
    None

    Arguments:
    None

    Data Block:
    <current custom.magic file>

    Result example:
    {"success": True}
    """
    data = request.json.encode('utf-8')

    magic_file = None
    try:
        # Write the submitted rules to a temporary file so libmagic can try
        # to load them; the file is removed again in the outer ``finally``.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            magic_file = tmp.name
            tmp.write(data)

        test = None
        try:
            test = magic.magic_open(magic.MAGIC_CONTINUE + magic.MAGIC_RAW)
            magic.magic_load(test, magic_file)
        except magic.MagicException:
            return make_api_response(
                {'success': False},
                "The magic file you have submitted is invalid.", 400)
        finally:
            # The cookie is only needed for validation; release it so every
            # request does not leak a libmagic handle.
            if test is not None:
                magic.magic_close(test)
    finally:
        if magic_file and os.path.exists(magic_file):
            os.unlink(magic_file)

    with forge.get_cachestore('system', config=config, datastore=STORAGE) as cache:
        # Submitting content identical to the built-in rule file removes the
        # custom entry instead of storing a redundant copy.
        if hashlib.sha256(data).hexdigest() == get_sha256_for_file(
                constants.MAGIC_RULE_PATH):
            cache.delete('custom_magic')
        else:
            cache.save('custom_magic', data, ttl=ADMIN_FILE_TTL, force=True)

    # Notify components watching to reload magic file
    event_sender.send('identify', 'magic')

    return make_api_response({'success': True})
def calc_magic(stream):
    """Return libmagic's identification of ``stream``.

    When ``get_fd_path(stream)`` yields a path, that path is identified with
    ``magic_file``; otherwise the stream is rewound and its whole content is
    identified with ``magic_buffer``. The result is passed through
    ``magic.maybe_decode`` before being returned.

    The libmagic cookie is opened per call and always closed, including when
    loading the database or identifying the data raises.
    """
    # Missing python-magic features:
    # - magic_descriptor (https://github.com/ahupp/python-magic/pull/227)
    # - direct support for symlink flag
    magic_cookie = magic.magic_open(magic.MAGIC_SYMLINK)
    try:
        # Loading happens inside the try so a failed load still closes the cookie.
        magic.magic_load(magic_cookie, None)

        fd_path = get_fd_path(stream)
        if fd_path:
            return magic.maybe_decode(magic.magic_file(magic_cookie, fd_path))

        # Handle BytesIO in-memory streams
        stream.seek(0, os.SEEK_SET)
        return magic.maybe_decode(magic.magic_buffer(magic_cookie, stream.read()))
    finally:
        magic.magic_close(magic_cookie)
def parse_file_charset(self):
    """Check the charset libmagic reports for the first 4096 bytes of the file.

    Works with either flavour of the ``magic`` module. If the reported type
    does not contain ``'ascii'``, an error is recorded on
    ``self.parsed_info``.
    """
    import magic

    self.fd.file.seek(0)
    head = self.fd.file.read(4096)

    if not hasattr(magic, "open"):
        wrapper = magic.Magic()
        wrapper.cookie = magic.magic_open(
            magic.MAGIC_NONE | magic.MAGIC_MIME | magic.MAGIC_MIME_ENCODING)
        magic.magic_load(wrapper.cookie, None)
        filetype = wrapper.from_buffer(head)
    else:
        native = magic.open(magic.MAGIC_MIME)
        native.load()
        filetype = native.buffer(head)

    if 'ascii' in filetype:
        return
    self.parsed_info.add_error(
        'A plain text document must be submitted.')
def parse_file_charset(self):
    """Check the charset libmagic reports for the complete file content.

    Works with either flavour of the ``magic`` module. If the reported type
    contains neither ``'ascii'`` nor ``'utf-8'``, an error quoting the
    reported type is recorded on ``self.parsed_info``.
    """
    import magic

    self.fd.file.seek(0)
    everything = self.fd.file.read()

    if not hasattr(magic, "open"):
        wrapper = magic.Magic()
        wrapper.cookie = magic.magic_open(
            magic.MAGIC_NONE | magic.MAGIC_MIME | magic.MAGIC_MIME_ENCODING)
        magic.magic_load(wrapper.cookie, None)
        filetype = wrapper.from_buffer(everything)
    else:
        native = magic.open(magic.MAGIC_MIME)
        native.load()
        filetype = native.buffer(everything)

    if 'ascii' in filetype or 'utf-8' in filetype:
        return
    self.parsed_info.add_error('A plain text ASCII document is required.  '
                               'Found an unexpected encoding: "%s".  '
                               'You probably have one or more non-ascii characters in your file.' % filetype
                               )
def get_cleaned_text_file_content(uploaded_file):
    """Read uploaded file, try to fix up encoding to UTF-8 and
    transform line endings into Unix style, then return the content as
    a UTF-8 string. Errors are reported as
    django.forms.ValidationError exceptions.

    An empty/missing upload yields an empty string. Uploads whose reported
    size exceeds 10 * 1000 * 1000 bytes are rejected, as are uploads that
    libmagic does not describe as text or for which no charset is reported.
    """
    if not uploaded_file:
        return u""

    if uploaded_file.size and uploaded_file.size > 10 * 1000 * 1000:
        raise django.forms.ValidationError("Text file too large (size %s)." % uploaded_file.size)

    # Join the chunks as bytes so this works on both Python 2 and Python 3.
    content = b"".join(uploaded_file.chunks())

    # try to fixup encoding
    import magic
    if hasattr(magic, "open"):
        m = magic.open(magic.MAGIC_MIME)
        m.load()
        filetype = m.buffer(content)
    else:
        m = magic.Magic()
        m.cookie = magic.magic_open(magic.MAGIC_NONE | magic.MAGIC_MIME | magic.MAGIC_MIME_ENCODING)
        magic.magic_load(m.cookie, None)
        filetype = m.from_buffer(content)

    if not filetype.startswith("text"):
        raise django.forms.ValidationError("Uploaded file does not appear to be a text file.")

    match = re.search(r"charset=([\w-]+)", filetype)
    if not match:
        raise django.forms.ValidationError("File has unknown encoding.")

    encoding = match.group(1)
    # Always decode to text: previously ASCII content was left as a byte
    # string, which breaks the str-based replace() calls below on Python 3.
    codec = "ascii" if "ascii" in encoding else encoding
    try:
        content = content.decode(codec)
    except Exception as e:
        # Any decoding problem (unknown codec, invalid bytes) is reported
        # to the submitter rather than surfacing as a server error.
        raise django.forms.ValidationError("Error decoding file (%s). Try submitting with UTF-8 encoding or remove non-ASCII characters." % str(e))

    # turn line-endings into Unix style
    content = content.replace("\r\n", "\n").replace("\r", "\n")

    return content.encode("utf-8")
from typing import List, Any, Callable
import magic

# tests: lint, mypy

# pylint: disable=invalid-name
# Type alias for reader callables: they take a list of strings and may
# return anything.
Reader = Callable[[List[str]], Any]

# Module-wide libmagic wrapper, created in MIME mode when this module is
# imported.
FILETYPER = magic.Magic(mime=True)
# Add MAGIC_SYMLINK to the wrapper's flags, then open a new cookie with the
# updated flag set and install it on the wrapper.
FILETYPER.flags |= magic.MAGIC_SYMLINK
FILETYPER.cookie = magic.magic_open(FILETYPER.flags)
# No database path is given (None); presumably this selects libmagic's
# default database -- confirm against the libmagic documentation.
magic.magic_load(FILETYPER.cookie, None)
# Pattern matching a leading "custom: " prefix, ignoring case.
custom = re.compile(r'^custom: ', re.IGNORECASE)

# Optional handles, filled in below when the backing libraries are available;
# each one stays None otherwise.
ssdeep_from_file = None
magic_lock = None
file_type = None
mime_type = None

# The libmagic handles are only created when not running on Windows.
if platform.system() != 'Windows':
    import magic

    # Lock created next to the handles; presumably meant to serialise access
    # to them -- verify against the callers.
    magic_lock = threading.Lock()

    # Two separate cookies, both loaded from constants.RULE_PATH: one opened
    # with MAGIC_RAW and one with MAGIC_MIME (each combined with
    # MAGIC_CONTINUE).
    file_type = magic.magic_open(magic.MAGIC_CONTINUE + magic.MAGIC_RAW)
    magic.magic_load(file_type, constants.RULE_PATH)

    mime_type = magic.magic_open(magic.MAGIC_CONTINUE + magic.MAGIC_MIME)
    magic.magic_load(mime_type, constants.RULE_PATH)

    try:
        # noinspection PyUnresolvedReferences
        import ssdeep  # ssdeep requires apt-get cython and pip install ssdeep

        ssdeep_from_file = ssdeep.hash_from_file
    except ImportError:
        pass  # ssdeep_from_file will be None if we fail to import ssdeep.


# Translate the match object into a sub-type label.
def subtype(label):
    for entry in sl_patterns:
def __init__(self, magic_file):
    """Open a libmagic cookie, load ``magic_file`` into it and record the creating thread.

    Arguments:
        magic_file: Path of the magic database to load, given as text; it is
            encoded to UTF-8 bytes before being handed to ``magic_load``.

    Attributes set:
        cookie: The handle returned by ``magic.magic_open``.
        thread: The thread object that was current when the instance was
            created.
    """
    flags = (magic.MAGIC_COMPRESS
             | magic.MAGIC_MIME
             | magic.MAGIC_CONTINUE
             | magic.MAGIC_PRESERVE_ATIME
             | magic.MAGIC_ERROR
             | magic.MAGIC_MIME_ENCODING)
    self.cookie = magic.magic_open(flags)
    magic.magic_load(self.cookie, magic_file.encode('utf-8'))
    # current_thread() is the supported spelling; currentThread() is a
    # deprecated camelCase alias for the same function.
    self.thread = threading.current_thread()