def process(self, data):
    context = dict(metavars(data))
    seed = self.args.seed
    if isinstance(seed, str):
        seed = PythonExpression(seed, 'N', constants=metavars(data))
    if callable(seed):
        seed = seed(context, N=len(data))
    self._index.init(self.fmask)
    prologue = self.args.prologue.expression
    epilogue = self.args.epilogue.expression
    operator = self.args.operator.expression
    context.update(N=len(data), S=seed)

    def operate(block, index, *args):
        context.update(I=index, B=block, V=args)
        if args:
            context['A'] = args[0]
        context['S'] = eval(prologue, None, context)
        context['B'] = eval(operator, None, context)
        context['S'] = eval(epilogue, None, context)
        return context['B']

    placeholder = self.operate
    self.operate = operate
    result = super().process(data)
    self.operate = placeholder
    return result
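# A minimal, self-contained sketch (not part of the unit above) of the pattern
# that `operate` relies on: pre-compiled Python expressions are repeatedly
# evaluated against a shared context dictionary via `eval`, so state such as
# the accumulator S survives between invocations. All names are illustrative.

def _reduce_sketch():
    prologue = compile('S', '<prologue>', 'eval')
    operator = compile('S + B', '<operator>', 'eval')  # running sum
    epilogue = compile('B', '<epilogue>', 'eval')
    context = {'S': 0}
    for index, block in enumerate(b'refinery'):
        context.update(I=index, B=block)
        context['S'] = eval(prologue, None, context)
        context['B'] = eval(operator, None, context)
        context['S'] = eval(epilogue, None, context)
    return context['S']

assert _reduce_sketch() == sum(b'refinery')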
def process(self, data):
    if not self.exhausted:
        if self._paste:
            import codecs
            self._pyperclip.copy(
                codecs.decode(data, self.codec, errors='backslashreplace'))
        elif not self.stream:
            # This should happen only when the unit is called from Python code
            # rather than via the command line.
            try:
                path = next(self.paths)
            except StopIteration:
                raise RefineryCriticalException(
                    'the list of filenames was exhausted.')
            else:
                with self._open(path) as stream:
                    stream.write(data)
        else:
            self.stream.write(data)
            self.log_debug(F'wrote 0x{len(data):08X} bytes')
            self._close()
        forward_input_data = self.args.tee
    else:
        forward_input_data = self.args.tee or not self.isatty
        if not forward_input_data:
            meta = metavars(data)
            size = meta['size']
            self.log_warn(
                F'discarding unprocessed chunk of size {size!s}.')
    if forward_input_data:
        yield data
def filter(self, chunks):
    if self.exhausted:
        self._reset()
    if self._paste:
        it = iter(chunks)
        yield next(it)
        self.exhausted = True
        yield from it
        return
    chunkmode = not self.args.stream
    for index, chunk in enumerate(chunks, 0):
        if not chunk.visible:
            continue
        if not self.exhausted and (chunkmode or not self.stream):
            try:
                path = next(self.paths)
            except StopIteration:
                self.exhausted = True
            else:
                if self._has_format(path):
                    meta = metavars(chunk, ghost=True)
                    meta['index'] = index
                    path = meta.format_str(path, self.codec, [chunk])
                self.stream = self._open(path)
        yield chunk
    self._close(final=True)
    self.exhausted = True
def reverse(self, data):
    meta = metavars(data)
    path = meta.get('path', None)
    name = path and pathlib.Path(path).name
    with MemoryFile(data) as stream:
        with MemoryFile() as output:
            uu.encode(stream, output, name, backtick=True)
            return output.getvalue()
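# Sketch of the stdlib `uu` round trip used above, independent of refinery's
# MemoryFile. Note that the `uu` module was deprecated in Python 3.11 and
# removed in 3.13, so this sketch only runs on older interpreters.

import io
import uu

def uuencode_sketch(data: bytes, name: str = 'sample.bin') -> bytes:
    with io.BytesIO(data) as stream, io.BytesIO() as output:
        uu.encode(stream, output, name, backtick=True)
        return output.getvalue()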
def process(self, data):
    pad = self.args.indent * ' '
    etm = {}
    try:
        dom = ForgivingParse(data, etm)
    except Exception:
        from refinery.lib.meta import metavars
        msg = 'error parsing as XML, returning original content'
        path = metavars(data).get('path')
        if path:
            msg = F'{msg}: {path}'
        self.log_warn(msg)
        return data

    def indent(element, level=0, more_sibs=False):
        """
        The credit for this one goes to:
        https://stackoverflow.com/a/12940014
        """
        indentation = '\n'
        if level:
            indentation += (level - 1) * pad
        childcount = len(element)
        if childcount:
            if not element.text or not element.text.strip():
                element.text = indentation + pad
                if level:
                    element.text += pad
            for count, child in enumerate(element):
                indent(child, level + 1, count < childcount - 1)
            if level and (not element.tail or element.tail.isspace()):
                element.tail = indentation
                if more_sibs:
                    element.tail += pad
        elif level and (not element.tail or element.tail.isspace()):
            element.tail = indentation
            if more_sibs:
                element.tail += pad

    indent(dom.getroot())
    with io.BytesIO() as output:
        dom.write(output, encoding=self.codec, xml_declaration=self.args.header)
        result = output.getvalue()
    for uid, key in etm.items():
        entity = F'&{key};'.encode(self.codec)
        needle = uid.encode(self.codec)
        result = result.replace(needle, entity)
    return result
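# Standalone sketch of the recursive indentation idea on a plain
# xml.etree.ElementTree document; refinery's ForgivingParse and its entity
# bookkeeping are omitted here, and the routine is simplified.

import xml.etree.ElementTree as ET

def indent_sketch(element: ET.Element, pad: str = '  ', level: int = 0):
    prefix = '\n' + level * pad
    if len(element):
        if not element.text or not element.text.strip():
            element.text = prefix + pad
        for child in element:
            indent_sketch(child, pad, level + 1)
            child.tail = prefix + pad
        element[-1].tail = prefix  # dedent after the last child

doc = ET.fromstring('<a><b><c/></b><b/></a>')
indent_sketch(doc)
print(ET.tostring(doc, encoding='unicode'))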
def process(self, data):
    def repl(match: Match):
        return meta.format_bin(
            spec, self.codec, [match[0], *match.groups()], match.groupdict())
    self.log_info('pattern:', self.regex)
    self.log_info('replace:', self.args.subst)
    meta = metavars(data)
    spec = self.args.subst.decode('ascii', 'backslashreplace')
    substitute = self.regex.sub
    if self.args.count:
        from functools import partial
        substitute = partial(substitute, count=self.args.count)
    return substitute(repl, data)
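# Sketch of the core mechanism above: re.sub accepts a callable replacement
# that receives the Match object, and functools.partial can pin the optional
# `count` argument of a compiled pattern's sub method.

import re
from functools import partial

pattern = re.compile(rb'\d+')
substitute = partial(pattern.sub, count=2)

def repl_sketch(match: re.Match) -> bytes:
    return str(int(match[0]) * 2).encode()

assert substitute(repl_sketch, b'1 2 3') == b'2 4 3'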
def filter(self, chunks):
    names = self.args.names
    reset = self.args.reset
    for index, chunk in enumerate(chunks):
        chunk: Chunk
        if not chunk.visible:
            yield chunk
            continue
        meta = metavars(chunk)
        if reset:
            chunk.meta.clear()
        if 'index' in names:
            meta['index'] = index
        for name in names:
            chunk[name] = meta[name]
        yield chunk
def process(self, data):
    meta = metavars(data, ghost=True)
    for mask in self.args.filenames:
        mask = meta.format_str(mask, self.codec, [data])
        self.log_debug('scanning for mask:', mask)
        kwargs = dict()
        for path in self._glob(mask):
            if not path.is_file():
                continue
            if self.args.meta:
                stat = path.stat()
                kwargs.update(
                    size=stat.st_size,
                    atime=datetime.fromtimestamp(stat.st_atime).isoformat(' ', 'seconds'),
                    ctime=datetime.fromtimestamp(stat.st_ctime).isoformat(' ', 'seconds'),
                    mtime=datetime.fromtimestamp(stat.st_mtime).isoformat(' ', 'seconds'),
                )
            if self.args.list:
                try:
                    yield self.labelled(str(path).encode(self.codec), **kwargs)
                except OSError:
                    self.log_warn(F'os error while scanning: {path!s}')
                continue
            try:
                with path.open('rb') as stream:
                    if self.args.linewise:
                        yield from self._read_lines(stream)
                    elif self.args.size:
                        yield from self._read_chunks(stream)
                    else:
                        data = stream.read()
                        self.log_info(lambda: F'reading: {path!s} ({len(data)} bytes)')
                        yield self.labelled(data, path=path.as_posix(), **kwargs)
            except PermissionError:
                self.log_warn('permission denied:', path.as_posix())
            except FileNotFoundError:
                self.log_warn('file is missing:', path.as_posix())
            except Exception:
                self.log_warn('unknown error while reading:', path.as_posix())
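# Sketch of the timestamp metadata gathered above, using only pathlib and
# datetime: stat() yields POSIX timestamps, which isoformat(' ', 'seconds')
# renders as 'YYYY-MM-DD HH:MM:SS'.

import pathlib
from datetime import datetime

def file_metadata_sketch(path: pathlib.Path) -> dict:
    stat = path.stat()
    return dict(
        size=stat.st_size,
        atime=datetime.fromtimestamp(stat.st_atime).isoformat(' ', 'seconds'),
        ctime=datetime.fromtimestamp(stat.st_ctime).isoformat(' ', 'seconds'),
        mtime=datetime.fromtimestamp(stat.st_mtime).isoformat(' ', 'seconds'),
    )

print(file_metadata_sketch(pathlib.Path(__file__)))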
def process(self, data):
    meta = metavars(data, ghost=True)
    args = [data]
    variable = self.args.variable
    if self.args.binary:
        formatter = partial(meta.format_bin, codec=self.codec, args=args)
    else:
        def formatter(spec):
            return meta.format_str(spec, self.codec, args, escaped=True).encode(self.codec)
    for spec in self.args.formats:
        result = formatter(spec)
        if variable is not None:
            result = self.labelled(data, **{variable: result})
        yield result
def match(self, chunk):
    meta = metavars(chunk)
    lhs: Optional[str] = self.args.lhs
    rhs: Optional[Any] = self.args.rhs
    cmp: Optional[Callable[[Any, Any], bool]] = self.args.cmp
    try:
        lhs = lhs and PythonExpression.evaluate(lhs, meta)
    except ParserVariableMissing:
        return lhs in meta
    if cmp is None and rhs is not None:
        rhs = DelayedNumSeqArgument(rhs)(chunk)
        return lhs == rhs
    try:
        rhs = rhs and PythonExpression.evaluate(rhs, meta)
    except ParserVariableMissing:
        raise
    except Exception:
        rhs = rhs.encode(self.codec)
    if lhs is None:
        return bool(chunk)
    if rhs is None:
        return bool(lhs)
    return cmp(lhs, rhs)
def process(self, data):
    meta = metavars(data)
    self.log_debug('regular expression:', self.regex)
    transformations = []
    specs: List[bytes] = list(self.args.transformation)
    if not specs:
        specs.append(B'{0}')
    for spec in specs:
        def transformation(match: Match, s=spec.decode(self.codec)):
            symb: dict = match.groupdict()
            args: list = [match.group(0), *match.groups()]
            used = set()
            item = meta.format(s, self.codec, args, symb, True, True, used)
            for variable in used:
                symb.pop(variable, None)
            symb.update(offset=match.start())
            for name, value in meta.items():
                symb.setdefault(name, value)
            return self.labelled(item, **symb)
        transformations.append(transformation)
    yield from self.matches_filtered(memoryview(data), self.regex, *transformations)
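# Sketch of the default-argument idiom used above: closures created in a loop
# capture loop variables late, so `s=spec.decode(...)` snapshots the current
# spec per iteration. Without the default argument, every closure would see
# only the last value of `spec`.

late, early = [], []
for spec in (b'a', b'b'):
    late.append(lambda: spec.decode())        # late binding: always 'b'
    early.append(lambda s=spec.decode(): s)   # early binding via default

assert [f() for f in late] == ['b', 'b']
assert [f() for f in early] == ['a', 'b']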
def test_binary_formatter_fallback(self):
    data = self.generate_random_buffer(3210)
    meta = metavars(data)
    self.assertEqual(meta.format_bin('{size!r}', 'utf8', data).strip(), b'03.210 kB')
def process(self, data):
    colorama = self._colorama
    colorama.init(autoreset=False, convert=True)
    from sys import stderr
    nobg = not self.args.background
    meta = metavars(data)
    label = meta.format_str(self.args.label, self.codec, [data])
    if label and not label.endswith(' '):
        label += ' '
    bgmap = [
        colorama.Back.BLACK,
        colorama.Back.WHITE,
        colorama.Back.YELLOW,
        colorama.Back.CYAN,
        colorama.Back.BLUE,
        colorama.Back.GREEN,
        colorama.Back.LIGHTRED_EX,
        colorama.Back.MAGENTA,
    ]
    fgmap = [
        colorama.Fore.LIGHTBLACK_EX,
        colorama.Fore.LIGHTWHITE_EX,
        colorama.Fore.LIGHTYELLOW_EX,
        colorama.Fore.LIGHTCYAN_EX,
        colorama.Fore.LIGHTBLUE_EX,
        colorama.Fore.LIGHTGREEN_EX,
        colorama.Fore.LIGHTRED_EX,
        colorama.Fore.LIGHTMAGENTA_EX,
    ]
    _reset = colorama.Back.BLACK + colorama.Fore.WHITE + colorama.Style.RESET_ALL
    clrmap = fgmap if nobg else bgmap
    footer = '{}] [{:>7}]\n'.format(_reset, repr(meta['entropy']))
    header = '[{1}{0}] ['.format(
        _reset, ''.join(F'{bg}{k}' for k, bg in enumerate(clrmap, 1)))
    header_length = 4 + len(clrmap)
    footer_length = 4 + len(str(meta['entropy']))
    width = get_terminal_size() - header_length - footer_length - 1
    if width < 16:
        raise RuntimeError(
            F'computed terminal width {width} is too small for heatmap')

    def entropy_select(value, map):
        index = min(len(map) - 1, math.floor(value * len(map)))
        return map[index]

    view = memoryview(data)
    size = len(data)
    chunk_size = 0
    for block_size in range(1, width + 1):
        block_count = width // block_size
        chunk_size = size // block_count
        if chunk_size > 1024:
            break
    q, remainder = divmod(width, block_size)
    assert q == block_count
    indices = list(range(q))
    random.seed(sum(view[:1024]))
    random.shuffle(indices)
    block_sizes = [block_size] * q
    q, r = divmod(remainder, block_count)
    for i in indices:
        block_sizes[i] += q
    for i in indices[:r]:
        block_sizes[i] += 1
    assert sum(block_sizes) == width
    q, remainder = divmod(size, block_count)
    assert q == chunk_size
    chunk_sizes = [chunk_size] * block_count
    for i in indices[:remainder]:
        chunk_sizes[i] += 1
    assert sum(chunk_sizes) == size
    stream = MemoryFile(view)
    filler = self.args.block_char if nobg else ' '
    try:
        stderr.write(header)
        if label is not None:
            stderr.write(colorama.Fore.WHITE)
            stderr.flush()
        it = itertools.chain(label, itertools.cycle(filler))
        for chunk_size, block_size in zip(chunk_sizes, block_sizes):
            chunk = stream.read(chunk_size)
            chunk_entropy = entropy(chunk)
            string = entropy_select(chunk_entropy, clrmap) + ''.join(
                itertools.islice(it, block_size))
            stderr.write(string)
            stderr.flush()
    except BaseException:
        eraser = ' ' * width
        stderr.write(F'\r{_reset}{eraser}\r')
        raise
    else:
        stderr.write(footer)
        stderr.flush()
    if not self.isatty:
        yield data
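# Sketch of the layout arithmetic above: a row of `width` character cells is
# split into `block_count` blocks, and the divmod remainder is spread over
# randomly chosen blocks so that the block sizes sum up exactly to the width.

import random

def distribute_sketch(width: int, block_count: int) -> list:
    base, remainder = divmod(width, block_count)
    sizes = [base] * block_count
    indices = list(range(block_count))
    random.shuffle(indices)
    for i in indices[:remainder]:
        sizes[i] += 1
    assert sum(sizes) == width
    return sizes

print(distribute_sketch(80, 7))  # e.g. [12, 11, 11, 12, 11, 12, 11]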
def _peeklines(self, data: bytearray, colorize: bool) -> Generator[str, None, None]:
    meta = metavars(data)
    codec = None
    lines = None
    final = data.temp or False
    empty = True
    if not self.args.index:
        meta.discard('index')
        index = None
    else:
        index = meta.get('index', None)
    if not self.args.brief:
        padding = 0
    else:
        padding = SizeInt.width + 2
        if index is not None:
            padding += 6
    metrics = self._get_metrics(len(data), self.args.lines, padding)
    if self.args.brief:
        metrics.address_width = 0
        metrics.fit_to_width(allow_increase=True)
    sepsize = metrics.hexdump_width
    txtsize = self.args.width or sepsize
    if self.args.lines and data:
        if self.args.escape:
            lines = self._trydecode(data, None, txtsize, metrics.line_count)
        if self.args.decode:
            for codec in ('utf8', 'utf-16le', 'utf-16', 'utf-16be'):
                lines = self._trydecode(data, codec, txtsize, metrics.line_count)
                if lines:
                    break
            else:
                codec = None
        if lines is None:
            lines = list(self.hexdump(data, metrics, colorize))
        else:
            sepsize = txtsize

    def separator(title=None):
        if title is None or sepsize <= len(title) + 8:
            return sepsize * '-'
        return '-' * (sepsize - len(title) - 5) + F'[{title}]---'

    if self.args.brief:
        final = False
    elif not self.args.bare:
        peek = repr(meta.size)
        if len(data) <= 5_000_000:
            peek = F'{peek}; {meta.entropy!r} entropy'
        peek = F'{peek}; {meta.magic!s}'
        if self.args.meta > 0:
            meta.derive('size')
            meta.derive('magic')
            meta.derive('entropy')
            peek = None
        if self.args.meta > 1:
            meta.derive('crc32')
            meta.derive('sha256')
        if self.args.meta > 2:
            for name in meta.DERIVATION_MAP:
                meta[name]
        for line in self._peekmeta(metrics.hexdump_width, separator(), _x_peek=peek, **meta):
            empty = False
            yield line
    if lines:
        empty = False
        if not self.args.brief:
            yield separator(codec or None)
            yield from lines
        else:
            brief = next(iter(lines))
            brief = F'{SizeInt(len(data))!r}: {brief}'
            if index is not None:
                brief = F'#{index:03d}: {brief}'
            yield brief
    if final and not empty:
        yield separator()
def _key(chunk):
    return expression(metavars(chunk)), chunk
def process(self, data: bytearray):
    formatter = string.Formatter()
    until = self.args.until
    until = until and PythonExpression(until, all_variables_allowed=True)
    reader = StructReader(memoryview(data))
    mainspec = self.args.spec
    byteorder = mainspec[:1]
    if byteorder in '<!=@>':
        mainspec = mainspec[1:]
    else:
        byteorder = '='

    def fixorder(spec):
        if spec[0] not in '<!=@>':
            spec = byteorder + spec
        return spec

    it = itertools.count() if self.args.multi else (0, )
    for index in it:
        if reader.eof:
            break
        if index >= self.args.count:
            break
        meta = metavars(data, ghost=True)
        meta['index'] = index
        args = []
        last = None
        checkpoint = reader.tell()
        try:
            for prefix, name, spec, conversion in formatter.parse(mainspec):
                if prefix:
                    args.extend(reader.read_struct(fixorder(prefix)))
                if name is None:
                    continue
                if conversion:
                    reader.byte_align(PythonExpression.evaluate(conversion, meta))
                if spec:
                    spec = meta.format_str(spec, self.codec, args)
                if spec != '':
                    try:
                        spec = PythonExpression.evaluate(spec, meta)
                    except ParserError:
                        pass
                if spec == '':
                    last = value = reader.read()
                elif isinstance(spec, int):
                    last = value = reader.read_bytes(spec)
                else:
                    value = reader.read_struct(fixorder(spec))
                    if not value:
                        self.log_warn(F'field {name} was empty, ignoring.')
                        continue
                    if len(value) > 1:
                        self.log_info(F'parsing field {name} produced {len(value)} items reading a tuple')
                    else:
                        value = value[0]
                args.append(value)
                if name == _SHARP:
                    raise ValueError('Extracting a field with name # is forbidden.')
                elif name.isdecimal():
                    index = int(name)
                    limit = len(args) - 1
                    if index > limit:
                        self.log_warn(F'cannot assign index field {name}, the highest index is {limit}')
                    else:
                        args[index] = value
                    continue
                elif name:
                    meta[name] = value
            if until and not until(meta):
                self.log_info(F'the expression ({until}) evaluated to zero; aborting.')
                break
            with StreamDetour(reader, checkpoint) as detour:
                full = reader.read(detour.cursor - checkpoint)
            if last is None:
                last = full
            outputs = []
            for template in self.args.outputs:
                used = set()
                outputs.append(meta.format(
                    template, self.codec, [full, *args], {_SHARP: last}, True, used=used))
                for key in used:
                    meta.pop(key, None)
            for output in outputs:
                chunk = self.labelled(output, **meta)
                chunk.set_next_batch(index)
                yield chunk
        except EOF:
            leftover = repr(SizeInt(len(reader) - checkpoint)).strip()
            self.log_info(F'discarding {leftover} left in buffer')
            break
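# Sketch of the parsing backbone above: string.Formatter().parse splits a
# format string into (literal, field, spec, conversion) tuples, which the
# unit maps onto struct prefixes, field names, size specs, and alignment.
# Note that nested braces in a format spec are preserved verbatim.

import string

tuples = list(string.Formatter().parse('H{magic:4}{body:{size}}'))
assert tuples == [
    ('H', 'magic', '4', None),
    ('', 'body', '{size}', None),
]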
def process(self, data):
    def shlexjoin():
        import shlex
        return ' '.join(shlex.quote(cmd) for cmd in commandline)

    meta = metavars(data, ghost=True)
    used = set()
    commandline = [
        meta.format(cmd, self.codec, [data], None, False, used=used)
        for cmd in self.args.commandline
    ]
    if 0 in used:
        self.log_info('input used as command-line argument; sending no input to process stdin')
        data = None
    self.log_debug(shlexjoin)

    posix = 'posix' in sys.builtin_module_names
    process = Popen(commandline,
        stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=False, close_fds=posix)

    if self.args.buffer and not self.args.timeout:
        out, err = process.communicate(data)
        for line in err.splitlines():
            self.log_info(line)
        yield out
        return

    import io
    from threading import Thread, Event
    from queue import Queue, Empty
    from time import process_time, sleep

    start = 0
    result = None

    qerr = Queue()
    qout = Queue()
    done = Event()

    def adapter(stream, queue: Queue, event: Event):
        while not event.is_set():
            out = stream.read1()
            if out:
                queue.put(out)
            else:
                break
        stream.close()

    recvout = Thread(target=adapter, args=(process.stdout, qout, done), daemon=True)
    recverr = Thread(target=adapter, args=(process.stderr, qerr, done), daemon=True)

    recvout.start()
    recverr.start()

    if data:
        process.stdin.write(data)
    process.stdin.close()
    start = process_time()

    if self.args.buffer or self.args.timeout:
        result = io.BytesIO()

    def queue_read(q: Queue):
        try:
            return q.get_nowait()
        except Empty:
            return None

    errbuf = io.BytesIO()

    while True:
        out = queue_read(qout)
        err = None

        if self.args.noerror:
            err = queue_read(qerr)
        else:
            out = out or queue_read(qerr)

        if err and self.log_info():
            errbuf.write(err)
            errbuf.seek(0)
            lines = errbuf.readlines()
            errbuf.seek(0)
            errbuf.truncate()
            if lines:
                if not (done.is_set() or lines[~0].endswith(B'\n')):
                    errbuf.write(lines.pop())
                for line in lines:
                    msg = line.rstrip(B'\n')
                    if msg:
                        self.log_info(msg)

        if out:
            if self.args.buffer or self.args.timeout:
                result.write(out)
            if not self.args.buffer:
                yield out

        if done.is_set():
            if recverr.is_alive():
                self.log_warn('stderr receiver thread zombied')
            if recvout.is_alive():
                self.log_warn('stdout receiver thread zombied')
            break
        elif not err and not out and process.poll() is not None:
            recverr.join(self._JOIN_TIME)
            recvout.join(self._JOIN_TIME)
            done.set()
        elif self.args.timeout:
            if process_time() - start > self.args.timeout:
                self.log_info('terminating process after timeout expired')
                done.set()
                process.terminate()
                for wait in range(4):
                    if process.poll() is not None:
                        break
                    sleep(self._JOIN_TIME)
                else:
                    self.log_warn('process termination may have failed')
                recverr.join(self._JOIN_TIME)
                recvout.join(self._JOIN_TIME)
                if not len(result.getbuffer()):
                    result = RuntimeError('timeout reached, process had no output')
                else:
                    result = RefineryPartialResult(
                        'timeout reached, returning all collected output',
                        partial=result.getvalue())

    if isinstance(result, Exception):
        raise result
    elif self.args.buffer:
        yield result.getvalue()
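# Sketch of the non-blocking output collection above: one daemon thread per
# pipe pushes chunks into a queue, so neither stdout nor stderr can stall the
# other. This is a simplified, self-contained variant without the timeout and
# stderr-logging machinery.

import subprocess
from queue import Queue, Empty
from threading import Thread

def run_sketch(argv: list) -> bytes:
    process = subprocess.Popen(
        argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    queue = Queue()

    def drain(stream):
        for chunk in iter(lambda: stream.read1(), b''):
            queue.put(chunk)
        stream.close()

    threads = [Thread(target=drain, args=(s,), daemon=True)
               for s in (process.stdout, process.stderr)]
    for t in threads:
        t.start()
    collected = bytearray()
    while process.poll() is None or not queue.empty():
        try:
            collected.extend(queue.get(timeout=0.1))
        except Empty:
            pass
    for t in threads:
        t.join()
    while not queue.empty():  # pick up anything queued after the last poll
        collected.extend(queue.get_nowait())
    return bytes(collected)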
def test_binary_printer_for_integer_arrays(self):
    data = Chunk()
    data['k'] = [t for t in b'refinery']
    meta = metavars(data)
    self.assertEqual(meta.format_bin('{k:itob}', 'utf8', data), b'refinery')
def test_binary_formatter_literal(self):
    meta = metavars(B'')
    self.assertEqual(meta.format_bin('{726566696E657279!H}', 'utf8'), b'refinery')
    self.assertEqual(meta.format_bin('{refinery!a}', 'utf8'), 'refinery'.encode('latin1'))
    self.assertEqual(meta.format_bin('{refinery!s}', 'utf8'), 'refinery'.encode('utf8'))
    self.assertEqual(meta.format_bin('{refinery!u}', 'utf8'), 'refinery'.encode('utf-16le'))
def process(self, data: ByteString) -> ByteString:
    results: List[UnpackResult] = list(self.unpack(data))
    strict = self._strict_path_matching
    paths = self.args.paths or (['.*'] if self.args.regex else ['*'])
    patterns = [PathPattern(p, self.args.regex, strict) for p in paths]
    if strict:
        self.log_debug('using string path matching')

    metavar = self.args.path.decode(self.codec)
    occurrences = collections.defaultdict(int)
    checksums = collections.defaultdict(set)
    root = Path('.')
    meta = metavars(data)

    def normalize(_path: str) -> str:
        path = Path(_path.replace('\\', '/'))
        try:
            path = path.relative_to('/')
        except ValueError:
            pass
        path = root / path
        path = path.as_posix()
        if self._custom_path_separator:
            path = path.replace('/', self._custom_path_separator)
        return path

    if self.args.join:
        try:
            root = ByteStringWrapper(meta[metavar], self.codec)
        except KeyError:
            pass

    for result in results:
        path = normalize(result.path)
        if not path:
            from refinery.lib.mime import FileMagicInfo
            ext = FileMagicInfo(result.get_data()).extension
            name = uuid.uuid4()
            path = F'{name}.{ext}'
            self.log_warn(F'read chunk with empty path; using generated name {path}')
        result.path = path
        occurrences[path] += 1

    for result in results:
        path = result.path
        if occurrences[path] > 1:
            checksum = adler32(result.get_data())
            if checksum in checksums[path]:
                continue
            checksums[path].add(checksum)
            counter = len(checksums[path])
            base, extension = os.path.splitext(path)
            width = len(str(occurrences[path]))
            if any(F'{base}.v{c:0{width}d}{extension}' in occurrences
                    for c in range(occurrences[path])):
                result.path = F'{base}.{uuid.uuid4()}{extension}'
            else:
                result.path = F'{base}.v{counter:0{width}d}{extension}'
            self.log_warn(F'read chunk with duplicate path; deduplicating to {result.path}')

    for p in patterns:
        for result in results:
            path = result.path
            if not p.check(path):
                continue
            if self.args.list:
                yield self.labelled(path.encode(self.codec), **result.meta)
                continue
            if not self.args.drop:
                result.meta[metavar] = path
            try:
                data = result.get_data()
            except Exception as error:
                if self.log_debug():
                    raise
                self.log_warn(F'extraction failure for {path}: {error!s}')
            else:
                self.log_debug(F'extraction success for {path}')
                yield self.labelled(data, **result.meta)
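# Sketch of the duplicate handling above: adler32 checksums decide whether two
# archive members with the same path also carry identical content, in which
# case only one of them is kept. Renaming of distinct duplicates is omitted.

import collections
from zlib import adler32

def dedupe_sketch(members: list) -> list:
    checksums = collections.defaultdict(set)
    unique = []
    for path, data in members:
        checksum = adler32(data)
        if checksum in checksums[path]:
            continue  # same path, same content: drop the duplicate
        checksums[path].add(checksum)
        unique.append((path, data))
    return unique

members = [('a.txt', b'foo'), ('a.txt', b'foo'), ('a.txt', b'bar')]
assert dedupe_sketch(members) == [('a.txt', b'foo'), ('a.txt', b'bar')]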