def main(argv, cfg): parser = ArgumentParser( usage="%(prog)s [options] pattern ds [ds [...]] [column [column [...]]", prog=argv.pop(0), ) parser.add_argument( '-c', '--chain', action='store_true', help="follow dataset chains", ) parser.add_argument( '--colour', '--color', nargs='?', const='always', choices=['auto', 'never', 'always'], type=str.lower, help="colour matched text. can be auto, never or always", metavar='WHEN', ) parser.add_argument( '-i', '--ignore-case', action='store_true', help="case insensitive pattern", ) parser.add_argument( '-H', '--headers', action='store_true', help="print column names before output (and on each change)", ) parser.add_argument( '-O', '--ordered', action='store_true', help="output in order (one slice at a time)", ) parser.add_argument( '-g', '--grep', action='append', help="grep this column only, can be specified multiple times", metavar='COLUMN') parser.add_argument( '-s', '--slice', action='append', help="grep this slice only, can be specified multiple times", type=int) parser.add_argument( '-D', '--show-dataset', action='store_true', help="show dataset on matching lines", ) parser.add_argument( '-S', '--show-sliceno', action='store_true', help="show sliceno on matching lines", ) parser.add_argument( '-L', '--show-lineno', action='store_true', help="show lineno (per slice) on matching lines", ) supported_formats = ( 'csv', 'raw', 'json', ) parser.add_argument( '-f', '--format', default='csv', choices=supported_formats, help="output format, csv (default) / " + ' / '.join(supported_formats[1:]), metavar='FORMAT', ) parser.add_argument( '-t', '--separator', help="field separator, default tab / tab-like spaces", ) parser.add_argument('pattern') parser.add_argument( 'dataset', help='can be specified in the same ways as for "ax ds"') parser.add_argument('columns', nargs='*', default=[]) args = parser.parse_intermixed_args(argv) pat_s = re.compile(args.pattern, re.IGNORECASE if args.ignore_case else 0) datasets = [name2ds(cfg, args.dataset)] columns = [] for ds_or_col in args.columns: if columns: columns.append(ds_or_col) else: try: datasets.append(name2ds(cfg, ds_or_col)) except Exception: columns.append(ds_or_col) if not datasets: parser.print_help(file=sys.stderr) return 1 grep_columns = set(args.grep or ()) if grep_columns == set(columns): grep_columns = None if args.slice: want_slices = [] for s in args.slice: assert 0 <= s < g.slices, "Slice %d not available" % (s, ) if s not in want_slices: want_slices.append(s) else: want_slices = list(range(g.slices)) if args.chain: datasets = list(chain.from_iterable(ds.chain() for ds in datasets)) if columns or grep_columns: bad = False need_cols = set(columns) if grep_columns: need_cols.update(grep_columns) for ds in datasets: missing = need_cols - set(ds.columns) if missing: print('ERROR: %s does not have columns %r' % ( ds, missing, ), file=sys.stderr) bad = True if bad: return 1 # never and always override env settings, auto (default) sets from env/tty if args.colour == 'never': colour.disable() highlight_matches = False elif args.colour == 'always': colour.enable() highlight_matches = True else: highlight_matches = colour.enabled # Don't highlight everything when just trying to cat if args.pattern == '': highlight_matches = False separator = args.separator if separator is None and not sys.stdout.isatty(): separator = '\t' if separator is None: # special case where we try to be like a tab, but with spaces. # this is useful because terminals typically don't style tabs. 
def separate(items, lens): things = [] for item, item_len in zip(items, lens): things.append(item) spaces = 8 - (item_len % 8) things.append(colour(' ' * spaces, 'cyan', 'underline')) return ''.join(things[:-1]) separator = '\t' else: separator_coloured = colour(separator, 'cyan', 'underline') def separate(items, lens): return separator_coloured.join(items) def json_default(obj): if isinstance(obj, (datetime.datetime, datetime.date, datetime.time)): return str(obj) elif isinstance(obj, complex): return [obj.real, obj.imag] else: return repr(obj) if args.format == 'csv': def escape_item(item): if item and (separator in item or item[0] in '\'"' or item[-1] in '\'"'): return '"' + item.replace('\n', '\\n').replace('"', '""') + '"' else: return item.replace('\n', '\\n') errors = 'surrogatepass' else: escape_item = None errors = 'replace' if PY2 else 'surrogateescape' def grep(ds, sliceno): def no_conv(v): return v def mk_conv(col): if ds.columns[col].type in ( 'bytes', 'unicode', 'ascii', ): if not ds.columns[col].none_support: return no_conv return unicode chk = pat_s.search def mk_iter(col): if ds.columns[col].type == 'ascii': it = ds._column_iterator(sliceno, col, _type='unicode') else: it = ds._column_iterator(sliceno, col) if ds.columns[col].type == 'bytes': errors = 'replace' if PY2 else 'surrogateescape' if ds.columns[col].none_support: it = (None if v is None else v.decode('utf-8', errors) for v in it) else: it = (v.decode('utf-8', errors) for v in it) return it def colour_item(item): pos = 0 parts = [] for m in pat_s.finditer(item): a, b = m.span() parts.extend((item[pos:a], colour.red(item[a:b]))) pos = b parts.append(item[pos:]) return ''.join(parts) if args.format == 'json': prefix = {} dumps = json.JSONEncoder(ensure_ascii=False, default=json_default).encode if args.show_dataset: prefix['dataset'] = ds if args.show_sliceno: prefix['sliceno'] = sliceno def show(): d = dict(zip(used_columns, items)) if args.show_lineno: prefix['lineno'] = lineno if prefix: prefix['data'] = d d = prefix return dumps(d).encode('utf-8', 'surrogatepass') else: prefix = [] if args.show_dataset: prefix.append(ds) if args.show_sliceno: prefix.append(str(sliceno)) prefix = tuple(prefix) def show(): data = list(prefix) if args.show_lineno: data.append(unicode(lineno)) if PY2: show_items = (v if isinstance(v, unicode) else str(v).decode('utf-8', 'replace') for v in items) else: show_items = map(str, items) show_items = list(show_items) lens = (len(item) for item in data + show_items) if highlight_matches: show_items = list(map(colour_item, show_items)) if escape_item: lens_unesc = (len(item) for item in data + show_items) show_items = list(map(escape_item, show_items)) lens_esc = (len(item) for item in data + show_items) lens = ( l + esc - unesc for l, unesc, esc in zip(lens, lens_unesc, lens_esc)) data.extend(show_items) return separate(data, lens).encode('utf-8', errors) used_columns = columns or sorted(ds.columns) if grep_columns and grep_columns != set(used_columns): grep_iter = izip(*(mk_iter(col) for col in grep_columns)) conv_items = [mk_conv(col) for col in grep_columns] else: grep_iter = repeat(None) conv_items = [mk_conv(col) for col in used_columns] lines_iter = izip(*(mk_iter(col) for col in used_columns)) for lineno, (grep_items, items) in enumerate(izip(grep_iter, lines_iter)): if any( chk(conv(item)) for conv, item in izip(conv_items, grep_items or items)): # This will be atomic if the line is not too long # (at least up to PIPE_BUF bytes, should be at least 512). 
				write(1, show() + b'\n')

	def one_slice(sliceno, q, wait_for):
		try:
			if q:
				q.get()
			for ds in datasets:
				if ds in wait_for:
					q.task_done()
					q.get()
				grep(ds, sliceno)
		except KeyboardInterrupt:
			return
		except IOError as e:
			if e.errno == errno.EPIPE:
				return
			else:
				raise
		finally:
			# Make sure we are joinable
			try:
				q.task_done()
			except Exception:
				pass

	headers_prefix = []
	if args.show_dataset:
		headers_prefix.append('[DATASET]')
	if args.show_sliceno:
		headers_prefix.append('[SLICE]')
	if args.show_lineno:
		headers_prefix.append('[LINE]')

	headers = {}
	if args.headers:
		if columns:
			current_headers = columns
		else:
			current_headers = None
			for ds in datasets:
				candidate_headers = sorted(ds.columns)
				if candidate_headers != current_headers:
					headers[ds] = current_headers = candidate_headers
			current_headers = headers.pop(datasets[0])
		def show_headers(headers):
			if args.format != 'json':
				show_items = headers_prefix + headers
				if escape_item:
					show_items = list(map(escape_item, show_items))
				print(separate(map(colour.blue, show_items), map(len, show_items)))
		show_headers(current_headers)

	queues = []
	children = []
	if not args.ordered:
		q = None
		wait_for = set(headers)
		for sliceno in want_slices[1:]:
			if wait_for:
				q = JoinableQueue()
				q.put(None)
				queues.append(q)
			p = Process(
				target=one_slice,
				args=(sliceno, q, wait_for),
				name='slice-%d' % (sliceno,),
			)
			p.daemon = True
			p.start()
			children.append(p)
		want_slices = want_slices[:1]

	try:
		for ds in datasets:
			if ds in headers:
				for q in queues:
					q.join()
				show_headers(headers.pop(ds))
				for q in queues:
					q.put(None)
			for sliceno in want_slices:
				grep(ds, sliceno)
		for c in children:
			c.join()
	except KeyboardInterrupt:
		print()
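# --- Editor's illustrative sketch (not part of the original module) ---
# The grep() above emits every matched line with a single os.write() to fd 1,
# and its comment notes this is atomic up to PIPE_BUF bytes. That is why the
# per-slice worker processes can share stdout without interleaving within a
# line. A minimal standalone demonstration of the same idea, assuming a POSIX
# platform; demo_worker and the line contents are hypothetical:
import os
from multiprocessing import Process as _DemoProcess

try:
	from select import PIPE_BUF  # POSIX guarantees at least 512 bytes
except ImportError:
	PIPE_BUF = 512

def demo_worker(tag):
	# One complete line per write() call; safe from interleaving up to PIPE_BUF bytes.
	line = ('%s: matched something\n' % (tag,)).encode('utf-8')
	assert len(line) <= PIPE_BUF
	os.write(1, line)

if __name__ == '__main__':
	_procs = [_DemoProcess(target=demo_worker, args=('slice-%d' % (n,),)) for n in range(3)]
	for _p in _procs:
		_p.start()
	for _p in _procs:
		_p.join()
# --- end sketch ---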
def main():
	# As of python 3.8 the default start_method is 'spawn' on macOS.
	# This doesn't work for us. 'fork' is fairly unsafe on macOS,
	# but it's better than not working at all. See
	# https://bugs.python.org/issue33725
	# for more information.
	import multiprocessing
	if hasattr(multiprocessing, 'set_start_method'):
		# If possible, make the forkserver (used by database updates) pre-import everything
		if hasattr(multiprocessing, 'set_forkserver_preload'):
			multiprocessing.set_forkserver_preload(['accelerator', 'accelerator.server'])
		multiprocessing.set_start_method('fork')
	from accelerator import g
	g.running = 'shell'
	from accelerator.autoflush import AutoFlush
	main_argv, argv = split_args(sys.argv[1:])
	sys.stdout = AutoFlush(sys.stdout)
	sys.stderr = AutoFlush(sys.stderr)
	aliases = {
		'cat': 'grep ""',
	}
	aliases.update(parse_user_config() or ())
	while argv and argv[0] in aliases:
		try:
			expanded = shlex.split(aliases[argv[0]])
		except ValueError as e:
			raise ValueError('Failed to expand alias %s (%r): %s' % (argv[0], aliases[argv[0]], e,))
		more_main_argv, argv = split_args(expanded + argv[1:])
		main_argv.extend(more_main_argv)
	epilog = ['commands:', '']
	cmdlen = max(len(cmd) for cmd in COMMANDS)
	template = ' %%%ds %%s' % (cmdlen,)
	for cmd, func in sorted(COMMANDS.items()):
		epilog.append(template % (cmd, func.help,))
	epilog.append('')
	epilog.append('aliases:')
	epilog.extend('%s = %s' % item for item in sorted(aliases.items()))
	epilog.append('')
	epilog.append('use %(prog)s <command> --help for <command> usage')
	parser = ArgumentParser(
		usage='%(prog)s [--config CONFIG_FILE] command [args]',
		epilog='\n'.join(epilog),
		formatter_class=RawDescriptionHelpFormatter,
	)
	parser.add_argument('--config', metavar='CONFIG_FILE', help='configuration file')
	parser.add_argument('--version', action='store_true', help='alias for the version command')
	args = parser.parse_args(main_argv)
	if args.version:
		sys.exit(cmd_version(()))
	args.command = argv.pop(0) if argv else None
	if args.command not in COMMANDS:
		parser.print_help(file=sys.stderr)
		print(file=sys.stderr)
		if args.command is not None:
			print('Unknown command "%s"' % (args.command,), file=sys.stderr)
		sys.exit(2)
	config_fn = args.config
	if args.command == 'init':
		config_fn = False
	cmd = COMMANDS[args.command]
	debug_cmd = getattr(cmd, 'is_debug', False)
	try:
		setup(config_fn, debug_cmd)
		argv.insert(0, '%s %s' % (basename(sys.argv[0]), args.command,))
		return cmd(argv)
	except UserError as e:
		print(e, file=sys.stderr)
		return 1
	except IOError as e:
		if e.errno == errno.EPIPE and debug_cmd:
			return
		else:
			raise
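# --- Editor's illustrative sketch (not part of the original module) ---
# The epilog built in main() above right-aligns the command table by first
# measuring the longest command name and then baking that width into a
# %-format template (' %%%ds %%s' becomes e.g. ' %7s %s'). The same trick as
# a standalone helper; format_command_table and the example command names are
# hypothetical:
def format_command_table(commands):
	# commands: {name: help text}
	width = max(len(name) for name in commands)
	template = ' %%%ds %%s' % (width,)
	return [template % (name, help_text) for name, help_text in sorted(commands.items())]

if __name__ == '__main__':
	for _line in format_command_table({
		'grep': 'search in datasets',
		'ds': 'dataset information',
		'run': 'run a build script',
	}):
		print(_line)
# --- end sketch ---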
def main(argv, cfg): usage = "%(prog)s [options] pattern ds [ds [...]] [column [column [...]]" parser = ArgumentParser(usage=usage, prog=argv.pop(0)) parser.add_argument( '-c', '--chain', action='store_true', help="follow dataset chains", ) parser.add_argument( '-C', '--color', action='store_true', help="color matched text", ) parser.add_argument( '-i', '--ignore-case', action='store_true', help="case insensitive pattern", ) parser.add_argument( '-H', '--headers', action='store_true', help="print column names before output (and on each change)", ) parser.add_argument( '-o', '--ordered', action='store_true', help="output in order (one slice at a time)", ) parser.add_argument( '-g', '--grep', action='append', help="grep this column only, can be specified multiple times", metavar='COLUMN') parser.add_argument( '-s', '--slice', action='append', help="grep this slice only, can be specified multiple times", type=int) parser.add_argument('-t', '--separator', help="field separator (default tab)", default='\t') parser.add_argument( '-D', '--show-dataset', action='store_true', help="show dataset on matching lines", ) parser.add_argument( '-S', '--show-sliceno', action='store_true', help="show sliceno on matching lines", ) parser.add_argument( '-L', '--show-lineno', action='store_true', help="show lineno (per slice) on matching lines", ) parser.add_argument('pattern') parser.add_argument( 'dataset', help='can be specified in the same ways as for "ax ds"') parser.add_argument('columns', nargs='*', default=[]) args = parser.parse_intermixed_args(argv) pat_s = re.compile(args.pattern, re.IGNORECASE if args.ignore_case else 0) pat_b = re.compile(args.pattern.encode('utf-8'), re.IGNORECASE if args.ignore_case else 0) datasets = [name2ds(cfg, args.dataset)] columns = [] separator_s = args.separator separator_b = separator_s.encode('utf-8') for ds_or_col in args.columns: if columns: columns.append(ds_or_col) else: try: datasets.append(name2ds(cfg, ds_or_col)) except Exception: columns.append(ds_or_col) if not datasets: parser.print_help(file=sys.stderr) return 1 grep_columns = set(args.grep or ()) if grep_columns == set(columns): grep_columns = None if args.slice: want_slices = [] for s in args.slice: assert 0 <= s < g.slices, "Slice %d not available" % (s, ) if s not in want_slices: want_slices.append(s) else: want_slices = list(range(g.slices)) if args.chain: datasets = list(chain.from_iterable(ds.chain() for ds in datasets)) if columns: bad = False for ds in datasets: missing = set(columns) - set(ds.columns) if missing: print('ERROR: %s does not have columns %r' % ( ds, missing, ), file=sys.stderr) bad = True if bad: return 1 def grep(ds, sliceno): # Use bytes for everything if anything is bytes, str otherwise. (For speed.) 
if any(ds.columns[col].backing_type == 'bytes' for col in (grep_columns or columns or ds.columns)): def strbytes(v): return str(v).encode('utf-8', 'replace') def mk_iter(col): if ds.columns[col].backing_type in ( 'bytes', 'unicode', 'ascii', ): return ds._column_iterator(sliceno, col, _type='bytes') else: return imap(strbytes, ds._column_iterator(sliceno, col)) chk = pat_b.search else: def mk_iter(col): if ds.columns[col].backing_type in ( 'unicode', 'ascii', ): return ds._column_iterator(sliceno, col, _type='unicode') else: return imap(str, ds._column_iterator(sliceno, col)) chk = pat_s.search def fmt(v): if not isinstance(v, (unicode, bytes)): v = str(v) if isinstance(v, unicode): v = v.encode('utf-8', 'replace') return v def color(item): pos = 0 parts = [] for m in pat_b.finditer(item): a, b = m.span() parts.extend((item[pos:a], b'\x1b[31m', item[a:b], b'\x1b[m')) pos = b parts.append(item[pos:]) return b''.join(parts) prefix = [] if args.show_dataset: prefix.append(ds.encode('utf-8')) if args.show_sliceno: prefix.append(str(sliceno).encode('utf-8')) prefix = tuple(prefix) def show(prefix, items): items = map(fmt, items) if args.color: items = map(color, items) # This will be atomic if the line is not too long # (at least up to PIPE_BUF bytes, should be at least 512). write(1, separator_b.join(prefix + tuple(items)) + b'\n') if grep_columns and grep_columns != set(columns or ds.columns): grep_iter = izip(*(mk_iter(col) for col in grep_columns)) lines_iter = ds.iterate(sliceno, columns) else: grep_iter = repeat(None) lines_iter = izip(*(mk_iter(col) for col in (columns or sorted(ds.columns)))) lines = izip(grep_iter, lines_iter) if args.show_lineno: for lineno, (grep_items, items) in enumerate(lines): if any(imap(chk, grep_items or items)): show(prefix + (str(lineno).encode('utf-8'), ), items) else: for grep_items, items in lines: if any(imap(chk, grep_items or items)): show(prefix, items) def one_slice(sliceno, q, wait_for): try: if q: q.get() for ds in datasets: if ds in wait_for: q.task_done() q.get() grep(ds, sliceno) except KeyboardInterrupt: return except IOError as e: if e.errno == errno.EPIPE: return else: raise finally: # Make sure we are joinable try: q.task_done() except Exception: pass headers_prefix = [] if args.show_dataset: headers_prefix.append('[DATASET]') if args.show_sliceno: headers_prefix.append('[SLICE]') if args.show_lineno: headers_prefix.append('[LINE]') headers = {} if args.headers: if columns: current_headers = columns else: current_headers = None for ds in datasets: candidate_headers = sorted(ds.columns) if candidate_headers != current_headers: headers[ds] = current_headers = candidate_headers current_headers = headers.pop(datasets[0]) def show_headers(headers): print('\x1b[34m' + separator_s.join(headers_prefix + headers) + '\x1b[m') show_headers(current_headers) queues = [] children = [] if not args.ordered: q = None wait_for = set(headers) for sliceno in want_slices[1:]: if wait_for: q = JoinableQueue() q.put(None) queues.append(q) p = Process( target=one_slice, args=(sliceno, q, wait_for), name='slice-%d' % (sliceno, ), ) p.daemon = True p.start() children.append(p) want_slices = want_slices[:1] try: for ds in datasets: if ds in headers: for q in queues: q.join() show_headers(headers.pop(ds)) for q in queues: q.put(None) for sliceno in want_slices: grep(ds, sliceno) for c in children: c.join() except KeyboardInterrupt: print()
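# --- Editor's illustrative sketch (not part of the original module) ---
# The one_slice()/show_headers() pairs above use one JoinableQueue per worker
# as a fence: each worker calls task_done() when it reaches a dataset whose
# headers differ, then blocks in get() until the main process has printed the
# new headers and put() the next token. A reduced, self-contained version of
# that handshake; the fence names and demo_fence_worker are hypothetical:
from multiprocessing import JoinableQueue as _DemoJoinableQueue, Process as _DemoFenceProcess

def demo_fence_worker(q, fences):
	q.get()                # permission to run up to the first fence
	for fence in fences:
		print('worker reached', fence)
		q.task_done()      # tell the coordinator we are waiting at the fence
		q.get()            # block until the coordinator releases us
	q.task_done()          # make sure we stay joinable

if __name__ == '__main__':
	_fences = ['header-change-1', 'header-change-2']
	_q = _DemoJoinableQueue()
	_q.put(None)
	_p = _DemoFenceProcess(target=demo_fence_worker, args=(_q, _fences))
	_p.start()
	for _fence in _fences:
		_q.join()          # all outstanding tokens done: the worker is at the fence
		print('coordinator prints new headers before', _fence)
		_q.put(None)       # release the worker past the fence
	_p.join()
# --- end sketch ---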
def main(argv, cfg): # -C overrides -A and -B (which in turn override -C) class ContextAction(Action): def __call__(self, parser, namespace, values, option_string=None): namespace.before_context = namespace.after_context = values parser = ArgumentParser( usage= "%(prog)s [options] [-e] pattern [...] [-d] ds [...] [[-n] column [...]]", description="""positional arguments: pattern (-e, --regexp) dataset (-d, --dataset) can be specified as for "ax ds" columns (-n, --column)""", prog=argv.pop(0), formatter_class=RawTextHelpFormatter, ) parser.add_argument( '-c', '--chain', action='store_true', help="follow dataset chains", ) parser.add_argument( '--colour', '--color', nargs='?', const='always', choices=['auto', 'never', 'always'], type=str.lower, help="colour matched text. can be auto, never or always", metavar='WHEN', ) parser.add_argument( '-i', '--ignore-case', action='store_true', help="case insensitive pattern", ) parser.add_argument( '-v', '--invert-match', action='store_true', help="select non-matching lines", ) parser.add_argument( '-o', '--only-matching', action='store_true', help="only print matching part (or columns with -l)", ) parser.add_argument( '-l', '--list-matching', action='store_true', help= "only print matching datasets (or slices with -S)\nwhen used with -o, only print matching columns", ) parser.add_argument( '-H', '--headers', action='store_true', help="print column names before output (and on each change)", ) parser.add_argument( '-O', '--ordered', action='store_true', help="output in order (one slice at a time)", ) parser.add_argument( '-M', '--allow-missing-columns', action='store_true', help="datasets are allowed to not have (some) columns", ) parser.add_argument( '-g', '--grep', action='append', help="grep this column only, can be specified multiple times", metavar='COLUMN') parser.add_argument( '-s', '--slice', action='append', help="grep this slice only, can be specified multiple times", type=int) parser.add_argument( '-D', '--show-dataset', action='store_true', help="show dataset on matching lines", ) parser.add_argument( '-S', '--show-sliceno', action='store_true', help="show sliceno on matching lines", ) parser.add_argument( '-L', '--show-lineno', action='store_true', help="show lineno (per slice) on matching lines", ) supported_formats = ( 'csv', 'raw', 'json', ) parser.add_argument( '-f', '--format', default='csv', choices=supported_formats, help="output format, csv (default) / " + ' / '.join(supported_formats[1:]), metavar='FORMAT', ) parser.add_argument( '-t', '--separator', help="field separator, default tab / tab-like spaces", ) parser.add_argument( '-T', '--tab-length', type=int, metavar='LENGTH', help="field alignment, always uses spaces as separator", ) parser.add_argument( '-B', '--before-context', type=int, default=0, metavar='NUM', help="print NUM lines of leading context", ) parser.add_argument( '-A', '--after-context', type=int, default=0, metavar='NUM', help="print NUM lines of trailing context", ) parser.add_argument( '-C', '--context', type=int, default=0, metavar='NUM', action=ContextAction, help="print NUM lines of context\n" + "context is only taken from the same slice of the same\n" + "dataset, and may intermix with output from other\n" + "slices. 
Use -O to avoid that, or -S -L to see it.", ) parser.add_argument('-e', '--regexp', default=[], action='append', dest='patterns', help=SUPPRESS) parser.add_argument('-d', '--dataset', default=[], action='append', dest='datasets', help=SUPPRESS) parser.add_argument('-n', '--column', default=[], action='append', dest='columns', help=SUPPRESS) parser.add_argument('words', nargs='*', help=SUPPRESS) args = parser.parse_intermixed_args(argv) if args.before_context < 0 or args.after_context < 0: print('Context must be >= 0', file=sys.stderr) return 1 columns = args.columns try: args.datasets = [name2ds(cfg, ds) for ds in args.datasets] except NoSuchWhateverError as e: print(e, file=sys.stderr) return 1 for word in args.words: if not args.patterns: args.patterns.append(word) elif columns and args.datasets: columns.append(word) else: try: args.datasets.append(name2ds(cfg, word)) except NoSuchWhateverError as e: if not args.datasets: print(e, file=sys.stderr) return 1 columns.append(word) if not args.patterns or not args.datasets: parser.print_help(file=sys.stderr) return 1 datasets = args.datasets patterns = [] for pattern in args.patterns: try: patterns.append( re.compile(pattern, re.IGNORECASE if args.ignore_case else 0)) except re.error as e: print("Bad pattern %r:\n%s" % ( pattern, e, ), file=sys.stderr) return 1 grep_columns = set(args.grep or ()) if grep_columns == set(columns): grep_columns = set() if args.slice: want_slices = [] for s in args.slice: assert 0 <= s < g.slices, "Slice %d not available" % (s, ) if s not in want_slices: want_slices.append(s) else: want_slices = list(range(g.slices)) if len(want_slices) == 1: # it will be automatically ordered, so let's not work for it. args.ordered = False if args.only_matching: if args.list_matching: args.list_matching = False only_matching = 'columns' else: only_matching = 'part' else: only_matching = False if args.chain: datasets = list(chain.from_iterable(ds.chain() for ds in datasets)) def columns_for_ds(ds, columns=columns): if columns: return [n for n in columns if n in ds.columns] else: return sorted(ds.columns) if columns or grep_columns: if args.allow_missing_columns: keep_datasets = [] for ds in datasets: if not columns_for_ds(ds): continue if grep_columns and not columns_for_ds(ds, grep_columns): continue keep_datasets.append(ds) if not keep_datasets: return 0 datasets = keep_datasets else: bad = False need_cols = set(columns) if grep_columns: need_cols.update(grep_columns) for ds in datasets: missing = need_cols - set(ds.columns) if missing: print('ERROR: %s does not have columns %r' % ( ds, missing, ), file=sys.stderr) bad = True if bad: return 1 # For the status reporting, this gives how many lines have been processed # when reaching each ds ix, per slice. Ends with an extra fictional ds, # i.e. the total number of lines for that slice. And then the same again, # to simplify the code in the status shower. 
total_lines_per_slice_at_ds = [[0] * g.slices] for ds in datasets: total_lines_per_slice_at_ds.append( [a + b for a, b in zip(total_lines_per_slice_at_ds[-1], ds.lines)]) total_lines_per_slice_at_ds.append(total_lines_per_slice_at_ds[-1]) status_interval = { # twice per percent, but not too often or too seldom sliceno: min(max(total_lines_per_slice_at_ds[-1][sliceno] // 200, 10), 5000) for sliceno in want_slices } # never and always override env settings, auto (default) sets from env/tty if args.colour == 'never': colour.disable() highlight_matches = False elif args.colour == 'always': colour.enable() highlight_matches = True else: args.colour = 'auto' highlight_matches = colour.enabled # Don't highlight everything when just trying to cat if args.patterns == ['']: highlight_matches = False # Don't highlight anything with -l if args.list_matching: highlight_matches = False if args.format == 'json': # headers was just a mistake, ignore it args.headers = False separator = args.separator if args.tab_length: separator = None elif separator is None and not sys.stdout.isatty(): separator = '\t' if separator is None: # special case where we try to be like a tab, but with spaces. # this is useful because terminals typically don't style tabs. # and also so you can change the length of tabs. if (args.tab_length or 0) < 1: args.tab_length = 8 def separate(items, lens): things = [] for item, item_len in zip(items, lens): things.append(item) spaces = args.tab_length - (item_len % args.tab_length) things.append(colour(' ' * spaces, 'grep/separator')) return ''.join(things[:-1]) separator = '\t' else: separator_coloured = colour(separator, 'grep/separator') def separate(items, lens): return separator_coloured.join(items) def json_default(obj): if isinstance(obj, (datetime.datetime, datetime.date, datetime.time)): return str(obj) elif isinstance(obj, complex): return [obj.real, obj.imag] else: return repr(obj) if args.format == 'csv': def escape_item(item): if item and (separator in item or item[0] in '\'"' or item[-1] in '\'"'): return '"' + item.replace('\n', '\\n').replace('"', '""') + '"' else: return item.replace('\n', '\\n') errors = 'surrogatepass' else: escape_item = None errors = 'replace' if PY2 else 'surrogateescape' # This is for the ^T handling. Each slice sends an update when finishing # a dataset, and every status_interval[sliceno] lines while iterating. # To minimise the data sent the only information sent over the queue # is (sliceno, finished_dataset). # Status printing is triggered by ^T (or SIGINFO if that is available) # or by SIGUSR1. # Pressing it again within two seconds prints stats per slice too. q_status = mp.LockFreeQueue() def status_collector(): q_status.make_reader() status = {sliceno: [0, 0] for sliceno in want_slices} # [ds_ix, done_lines] total_lines = sum(total_lines_per_slice_at_ds[-1]) previous = [0] # base colour conf in if stderr is a tty, not stdout. 
if args.colour == 'auto': colour.configure_from_environ(stdout=sys.stderr) def show(sig, frame): t = monotonic() verbose = (previous[0] + 2 > t) # within 2 seconds of previous previous[0] = t ds_ixes = [] progress_lines = [] progress_fraction = [] for sliceno in want_slices: ds_ix, done_lines = status[sliceno] ds_ixes.append(ds_ix) max_possible = min( done_lines + status_interval[sliceno], total_lines_per_slice_at_ds[ds_ix + 1][sliceno]) done_lines = (done_lines + max_possible) / 2 # middle of the possibilities progress_lines.append(done_lines) total = total_lines_per_slice_at_ds[-1][sliceno] if total == 0: progress_fraction.append(1) else: progress_fraction.append(done_lines / total) progress_total = sum(progress_lines) / (total_lines or 1) bad_cutoff = progress_total - 0.1 if verbose: show_ds = (len(datasets) > 1 and min(ds_ixes) != max(ds_ixes)) for sliceno, ds_ix, p in zip(want_slices, ds_ixes, progress_fraction): if ds_ix == len(datasets): msg = 'DONE' else: msg = '{0:d}% of {1:n} lines'.format( round(p * 100), total_lines_per_slice_at_ds[-1][sliceno]) if show_ds: msg = '%s (in %s)' % ( msg, datasets[ds_ix].quoted, ) msg = '%9d: %s' % ( sliceno, msg, ) if p < bad_cutoff: msg = colour(msg, 'grep/infohighlight') else: msg = colour(msg, 'grep/info') write(2, msg.encode('utf-8') + b'\n') msg = '{0:d}% of {1:n} lines'.format(round(progress_total * 100), total_lines) if len(datasets) > 1: min_ds = min(ds_ixes) max_ds = max(ds_ixes) if min_ds < len(datasets): ds_name = datasets[min_ds].quoted extra = '' if min_ds == max_ds else ' ++' msg = '%s (in %s%s)' % ( msg, ds_name, extra, ) worst = min(progress_fraction) if worst < bad_cutoff: msg = '%s, worst %d%%' % ( msg, round(worst * 100), ) msg = colour(' SUMMARY: %s' % (msg, ), 'grep/info') write(2, msg.encode('utf-8') + b'\n') for signame in ('SIGINFO', 'SIGUSR1'): if hasattr(signal, signame): sig = getattr(signal, signame) signal.signal(sig, show) if hasattr(signal, 'pthread_sigmask'): signal.pthread_sigmask(signal.SIG_UNBLOCK, {sig}) tc_original = None using_stdin = False if not hasattr(signal, 'SIGINFO') and sys.stdin.isatty(): # ^T wont work automatically on this OS, so we need to handle it as terminal input import termios from accelerator.compat import selectors sel = selectors.DefaultSelector() sel.register(0, selectors.EVENT_READ) sel.register(q_status.r, selectors.EVENT_READ) try: tc_original = termios.tcgetattr(0) tc_changed = list(tc_original) tc_changed[3] &= ~(termios.ICANON | termios.IEXTEN) termios.tcsetattr(0, termios.TCSADRAIN, tc_changed) using_stdin = True except Exception: pass # we can't set stdin nonblocking, because it's probably the same # file description as stdout, so work around that with alarms. 
def got_alarm(sig, frame): raise IOError() signal.signal(signal.SIGALRM, got_alarm) try: while True: if using_stdin: do_q = False for key, _ in sel.select(): if key.fd == 0: try: signal.alarm( 1 ) # in case something else read it we block for max 1 second try: pressed = ord(os.read(0, 1)) finally: signal.alarm(0) if pressed == 20: write(2, b'\n') # "^T" shows in the terminal os.kill(os.getpid(), signal.SIGUSR1) except Exception: pass elif key.fd == q_status.r: do_q = True if not do_q: continue try: sliceno, finished_dataset = q_status.get() except QueueEmpty: return if finished_dataset: ds_ix = status[sliceno][0] + 1 status[sliceno] = [ ds_ix, total_lines_per_slice_at_ds[ds_ix][sliceno] ] else: status[sliceno][1] += status_interval[sliceno] finally: if tc_original is not None: try: termios.tcsetattr(0, termios.TCSADRAIN, tc_original) except Exception: pass status_process = mp.SimplifiedProcess(target=status_collector, name='ax grep status') # everything else will write, so make it a writer right away q_status.make_writer() # Output is only allowed while holding this lock, so that long lines # do not get intermixed. (Or when alone in producing output.) io_lock = Lock() # This contains some extra stuff to be a better base for the other # outputters. # When used directly it enforces no ordering, but merges smaller writes # to keep the number of syscalls down. class Outputter: def __init__(self, q_in, q_out): self.q_in = q_in self.q_out = q_out self.buffer = [] self.merge_buffer = b'' def put(self, data): self.merge_buffer += data if len(self.merge_buffer) >= 1024: self.move_merge() def move_merge(self): if self.merge_buffer: with io_lock: write(1, self.merge_buffer) self.merge_buffer = b'' def start(self, ds): pass def end(self, ds): self.move_merge() def finish(self): pass def full(self): return len(self.buffer) > 5000 def excite(self): self.move_merge() if self.buffer: self.pump(False) # Partially ordered output, each header change acts as a fence. # This is used in all slices except the first. # # The queue gets True when the previous slice is ready for the next # header change, and None when the header is printed (and it's ok # to resume output). class HeaderWaitOutputter(Outputter): def start(self, ds): if ds in headers: self.add_wait() else: self.excite() def add_wait(self): # Each sync point is separated by None in the buffer self.buffer.append(None) self.buffer.append(b'') # Avoid need for special case in .drain self.pump() def move_merge(self): data = self.merge_buffer self.merge_buffer = b'' if self.buffer: self.pump() if self.buffer: self.buffer.append(data) return with io_lock: write(1, data) def pump(self, wait=None): if wait is None: wait = self.full() try: got = self.q_in.get(wait) except QueueEmpty: if wait: # previous slice has exited without sending all messages raise return if got is True: # since pump is only called when we have outputted all # currently allowed output or when the next message is an # unblock for such output we can just unconditionally send # the True on to the next slice here. self.q_out.put(True) self.pump(wait) return else: self.q_out.put(None) self.drain() def drain(self): assert self.buffer[ 0] is None, 'The buffer must always stop at a sync point (or empty)' with io_lock: for pos, data in enumerate(self.buffer[1:], 1): if data is None: break elif data: write(1, data) else: # We did not reach the next fence, so last item is real data # and needs to be removed. 
(The buffer will then be empty and # output will continue directly until reaching the sync point.) pos += 1 self.buffer[:pos] = () def finish(self): while self.buffer: self.pump(True) # Partially ordered output, each header change acts as a fence. # This is used only in the first slice, and outputs the headers. # # When it is ready to output headers it sends True in the queue. # When the True has travelled around the queue ring all slices are # ready, the headers are printed, and None is sent to let the other # slices resume output. # (When the None returns it is ignored, because output is resumed # as soon as the headers are printed.) class HeaderOutputter(HeaderWaitOutputter): def add_wait(self): if not self.buffer: self.q_out.put(True) self.buffer.append(None) self.buffer.append( b'') # Avoid need for special case in .drain/.put self.pump() def drain(self): assert self.buffer[ 0] is None, 'The buffer must always stop at a sync point (or empty)' with io_lock: for pos, data in enumerate(self.buffer[1:], 1): if data is None: self.q_out.put(True) break elif data: write(1, data) else: pos += 1 self.buffer[:pos] = () def pump(self, wait=None): if wait is None: wait = self.full() try: got = self.q_in.get(wait) except QueueEmpty: if wait: # previous slice has exited without sending all messages raise return if got is True: # The True we put in when reaching the fence has travelled # all the way around the queue ring, it's time to print the # new headers write(1, next(headers_iter)) # and then unblock the other slices self.q_out.put(None) self.drain() # No else, when the None comes back we just drop it. if not wait: self.pump(False) # Fully ordered output, each slice waits for the previous slice. # For each ds, waits for None (anything really) before starting, # sends None when done. class OrderedOutputter(Outputter): def start(self, ds): # Each ds is separated by None in the buffer self.buffer.append(None) self.buffer.append(b'') # Avoid need for special case in .drain self.pump() def end(self, ds): self.move_merge() if not self.buffer: # We are done with this ds, so let next slice continue self.q_out.put(None) def pump(self, wait=None): if wait is None: wait = self.full() try: self.q_in.get(wait) except QueueEmpty: if wait: # previous slice has exited without sending all messages raise return self.drain() def move_merge(self): data = self.merge_buffer self.merge_buffer = b'' if self.buffer: self.pump() if self.buffer: self.buffer.append(data) return # No need for a lock, the other slices aren't writing concurrently. write(1, data) def drain(self): assert self.buffer[0] is None for pos, data in enumerate(self.buffer[1:], 1): if data is None: # We are done with this ds, so let next slice continue self.q_out.put(None) break elif data: write(1, data) else: # We did not reach the next ds, so last item is real data and # needs to be removed. (The buffer will then be empty and # output will continue directly until reaching the next ds.) pos += 1 self.buffer[:pos] = () def finish(self): not_finished = bool(self.buffer) while self.buffer: self.pump(True) if not_finished: self.q_out.put(None) # Same as above but for the first slice so it prints headers when needed. class OrderedHeaderOutputter(OrderedOutputter): def start(self, ds): # Each ds is separated by None in the buffer self.buffer.append(None) if ds in headers: # Headers changed, start with those. 
self.buffer.append(next(headers_iter)) else: self.buffer.append( b'') # Avoid need for special case in .drain self.pump() # Choose the right outputter for the kind of sync we need. def outputter(q_in, q_out, first_slice=False): if args.list_matching: cls = Outputter elif args.ordered: if first_slice: cls = OrderedHeaderOutputter else: cls = OrderedOutputter elif headers: if first_slice: cls = HeaderOutputter else: cls = HeaderWaitOutputter else: cls = Outputter return cls(q_in, q_out) # Make printer for the selected output options def make_show(prefix, used_columns): def matching_ranges(item): ranges = [] for p in patterns: ranges.extend(m.span() for m in p.finditer(item)) if not ranges: return # merge overlapping/adjacent ranges ranges.sort() ranges = iter(ranges) start, stop = next(ranges) for a, b in ranges: if a <= stop: stop = max(stop, b) else: yield start, stop start, stop = a, b yield start, stop def filter_item(item): return ''.join(item[a:b] for a, b in matching_ranges(item)) if args.format == 'json': dumps = json.JSONEncoder(ensure_ascii=False, default=json_default).encode def show(lineno, items): if only_matching == 'part': items = [filter_item(unicode(item)) for item in items] if only_matching == 'columns': d = { k: v for k, v in zip(used_columns, items) if filter_item(unicode(v)) } else: d = dict(zip(used_columns, items)) if args.show_lineno: prefix['lineno'] = lineno if prefix: prefix['data'] = d d = prefix return dumps(d).encode('utf-8', 'surrogatepass') + b'\n' else: def colour_item(item): pos = 0 parts = [] for a, b in matching_ranges(item): parts.extend( (item[pos:a], colour(item[a:b], 'grep/highlight'))) pos = b parts.append(item[pos:]) return ''.join(parts) def show(lineno, items): data = list(prefix) if args.show_lineno: data.append(unicode(lineno)) show_items = map(unicode, items) if only_matching: if only_matching == 'columns': show_items = (item if filter_item(item) else '' for item in show_items) else: show_items = map(filter_item, show_items) show_items = list(show_items) lens = (len(item) for item in data + show_items) if highlight_matches: show_items = list(map(colour_item, show_items)) if escape_item: lens_unesc = (len(item) for item in data + show_items) show_items = list(map(escape_item, show_items)) lens_esc = (len(item) for item in data + show_items) lens = ( l + esc - unesc for l, unesc, esc in zip(lens, lens_unesc, lens_esc)) data.extend(show_items) return separate(data, lens).encode('utf-8', errors) + b'\n' return show # This is called for each slice in each dataset. # Each slice has a separate process (the same for all datasets). # The first slice runs in the main process (unless -l), everything # else runs from one_slice. 
def grep(ds, sliceno, out): out.start(ds) if len(patterns) == 1: chk = patterns[0].search else: def chk(s): return any(p.search(s) for p in patterns) first = [True] def mk_iter(col): kw = {} if first[0]: first[0] = False lines = ds.lines[sliceno] if lines > status_interval[sliceno]: def cb(n): q_status.put((sliceno, False)) out.excite() kw['callback'] = cb kw['callback_interval'] = status_interval[sliceno] if ds.columns[col].type == 'ascii': kw['_type'] = 'unicode' it = ds._column_iterator(sliceno, col, **kw) if ds.columns[col].type == 'bytes': errors = 'replace' if PY2 else 'surrogateescape' if ds.columns[col].none_support: it = (None if v is None else v.decode('utf-8', errors) for v in it) else: it = (v.decode('utf-8', errors) for v in it) return it used_columns = columns_for_ds(ds) used_grep_columns = grep_columns and columns_for_ds(ds, grep_columns) if grep_columns and set(used_grep_columns) != set(used_columns): grep_iter = izip(*(mk_iter(col) for col in used_grep_columns)) else: grep_iter = repeat(None) lines_iter = izip(*(mk_iter(col) for col in used_columns)) if args.before_context: before = deque((), args.before_context) else: before = None if args.format == 'json': prefix = {} if args.show_dataset: prefix['dataset'] = ds if args.show_sliceno: prefix['sliceno'] = sliceno show = make_show(prefix, used_columns) else: prefix = [] if args.show_dataset: prefix.append(ds) if args.show_sliceno: prefix.append(str(sliceno)) prefix = tuple(prefix) show = make_show(prefix, used_columns) if args.invert_match: maybe_invert = operator.not_ else: maybe_invert = bool to_show = 0 for lineno, (grep_items, items) in enumerate(izip(grep_iter, lines_iter)): if maybe_invert( any(chk(unicode(item)) for item in grep_items or items)): if q_list: q_list.put((ds, sliceno)) return while before: out.put(show(*before.popleft())) to_show = 1 + args.after_context if to_show: out.put(show(lineno, items)) to_show -= 1 elif before is not None: before.append((lineno, items)) out.end(ds) # This runs in a separate process for each slice except the first # one (unless -l), which is handled specially in the main process. def one_slice(sliceno, q_in, q_out, q_to_close): if q_to_close: q_to_close.close() if q_in: q_in.make_reader() if q_out: q_out.make_writer() if q_list: q_list.make_writer() try: out = outputter(q_in, q_out) for ds in datasets: if seen_list is None or ds not in seen_list: grep(ds, sliceno, out) q_status.put((sliceno, True)) out.finish() except QueueEmpty: # some other process died, no need to print an error here sys.exit(1) headers_prefix = [] if args.show_dataset: headers_prefix.append('[DATASET]') if args.show_sliceno: headers_prefix.append('[SLICE]') if args.show_lineno: headers_prefix.append('[LINE]') # {ds: headers} for each ds where headers change (not including the first). # this is every ds where sync between slices has to happen when not --ordered. headers = OrderedDict() if args.headers: current_headers = None for ds in datasets: candidate_headers = columns_for_ds(ds) if candidate_headers != current_headers: headers[ds] = current_headers = candidate_headers def gen_headers(headers): show_items = headers_prefix + headers if escape_item: show_items = list(map(escape_item, show_items)) coloured = (colour(item, 'grep/header') for item in show_items) txt = separate(coloured, map(len, show_items)) return txt.encode('utf-8', 'surrogatepass') + b'\n' # remove the starting ds, so no header changes means no special handling. 
current_headers = headers.pop(datasets[0]) if not args.list_matching: write(1, gen_headers(current_headers)) headers_iter = iter(map(gen_headers, headers.values())) q_in = q_out = first_q_out = q_to_close = q_list = None children = [status_process] seen_list = None if args.list_matching: # in this case all slices get their own process # and the main process just prints the maching slices q_list = mp.LockFreeQueue() separate_process_slices = want_slices if not args.show_sliceno: seen_list = mp.MpSet() else: separate_process_slices = want_slices[1:] if args.ordered or headers: # needs to sync in some way q_in = first_q_out = mp.LockFreeQueue() for sliceno in separate_process_slices: if q_in: q_out = mp.LockFreeQueue() p = mp.SimplifiedProcess( target=one_slice, args=( sliceno, q_in, q_out, q_to_close, ), name='slice-%d' % (sliceno, ), ) children.append(p) if q_in and q_in is not first_q_out: q_in.close() q_to_close = first_q_out q_in = q_out if q_in: q_out = first_q_out q_in.make_reader() q_out.make_writer() if args.ordered: q_in.put_local(None) del q_to_close del first_q_out try: if args.list_matching: if args.headers: headers_prefix = ['[DATASET]'] if seen_list is None: headers_prefix.append('[SLICE]') write(1, gen_headers([])) ordered_res = defaultdict(set) q_list.make_reader() if seen_list is None: used_columns = ['dataset', 'sliceno'] else: used_columns = ['dataset'] inner_show = make_show({} if args.format == 'json' else [], used_columns) def show(ds, sliceno=None): if sliceno is None: items = [ds] else: items = [ds, sliceno] write(1, inner_show(None, items)) while True: try: ds, sliceno = q_list.get() except QueueEmpty: break if seen_list is None: if args.ordered: ordered_res[ds].add(sliceno) else: show(ds, sliceno) elif ds not in seen_list: seen_list.add(ds) if not args.ordered: show(ds) if args.ordered: for ds in datasets: if seen_list is None: for sliceno in sorted(ordered_res[ds]): show(ds, sliceno) else: if ds in seen_list: show(ds) else: out = outputter(q_in, q_out, first_slice=True) sliceno = want_slices[0] for ds in datasets: grep(ds, sliceno, out) q_status.put((sliceno, True)) out.finish() except QueueEmpty: # don't print an error, probably a subprocess died from EPIPE before # the main process. (or the subprocess already printed an error.) return 1 q_status.close() for c in children: c.join() if c.exitcode: return 1
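# --- Editor's illustrative sketch (not part of the original module) ---
# matching_ranges() in make_show() above collects (start, stop) spans from
# every compiled pattern and merges overlapping or adjacent ones, so that
# -o/--only-matching and highlighting never emit the same characters twice.
# The same merge logic as a standalone function; merged_match_spans is a
# hypothetical name:
import re as _demo_re

def merged_match_spans(patterns, text):
	spans = []
	for pattern in patterns:
		spans.extend(m.span() for m in pattern.finditer(text))
	spans.sort()
	merged = []
	for start, stop in spans:
		if merged and start <= merged[-1][1]:
			# Overlaps or touches the previous span: extend it.
			merged[-1][1] = max(merged[-1][1], stop)
		else:
			merged.append([start, stop])
	return [tuple(span) for span in merged]

if __name__ == '__main__':
	_pats = [_demo_re.compile('ab+'), _demo_re.compile('b+c')]
	print(merged_match_spans(_pats, 'xxabbbcxx'))  # [(2, 7)] - the two matches overlap
# --- end sketch ---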
def main():
	# Several commands use SIGUSR1 which (naturally...) defaults to killing the
	# process, so start by blocking that to minimise the race time.
	if hasattr(signal, 'pthread_sigmask'):
		signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGUSR1})
	else:
		# Or if we can't block it, just ignore it.
		signal.signal(signal.SIGUSR1, signal.SIG_IGN)
	# As of python 3.8 the default start_method is 'spawn' on macOS.
	# This doesn't work for us. 'fork' is fairly unsafe on macOS,
	# but it's better than not working at all. See
	# https://bugs.python.org/issue33725
	# for more information.
	import multiprocessing
	if hasattr(multiprocessing, 'set_start_method'):
		# If possible, make the forkserver (used by database updates) pre-import everything
		if hasattr(multiprocessing, 'set_forkserver_preload'):
			multiprocessing.set_forkserver_preload(['accelerator', 'accelerator.server'])
		multiprocessing.set_start_method('fork')
	from accelerator import g
	g.running = 'shell'
	from accelerator.autoflush import AutoFlush
	main_argv, argv = split_args(sys.argv[1:])
	sys.stdout = AutoFlush(sys.stdout)
	sys.stderr = AutoFlush(sys.stderr)
	# configuration defaults
	aliases = {
		'cat': 'grep -e ""',
	}
	colour_d = {
		'warning': ('RED',),
		'highlight': ('BOLD',),
		'grep/highlight': ('RED',),
		'info': ('BRIGHTBLUE',),
		'infohighlight': ('BOLD', 'BRIGHTBLUE',),
		'separator': ('CYAN', 'UNDERLINE',),
		'header': ('BRIGHTBLUE', 'BOLD',),
	}
	parse_user_config(aliases, colour_d)
	colour._names.update(colour_d)
	used_aliases = []
	while argv and argv[0] in aliases:
		alias = argv[0]
		if alias == 'noalias':  # save the user from themselves
			break
		try:
			expanded = shlex.split(aliases[alias])
		except ValueError as e:
			raise ValueError('Failed to expand alias %s (%r): %s' % (argv[0], aliases[argv[0]], e,))
		more_main_argv, argv = split_args(expanded + argv[1:])
		main_argv.extend(more_main_argv)
		if expanded and alias == expanded[0]:
			break
		used_aliases.append(alias)
		if alias in used_aliases[:-1]:
			raise ValueError('Alias loop: %r' % (used_aliases,))
	while argv and argv[0] == 'noalias':
		argv.pop(0)
	epilog = ['commands:', '']
	cmdlen = max(len(cmd) for cmd in COMMANDS)
	template = ' %%%ds %%s' % (cmdlen,)
	for cmd, func in sorted(COMMANDS.items()):
		epilog.append(template % (cmd, func.help,))
	epilog.append('')
	epilog.append('aliases:')
	epilog.extend(' %s = %s' % item for item in sorted(aliases.items()))
	epilog.append('')
	epilog.append('use "' + colour('%(prog)s <command> --help', 'help/highlight') + '" for <command> usage')
	epilog.append('try "' + colour('%(prog)s intro', 'help/highlight') + '" for an introduction')
	parser = ArgumentParser(
		usage='%(prog)s [--config CONFIG_FILE] command [args]',
		epilog='\n'.join(epilog),
		formatter_class=RawDescriptionHelpFormatter,
	)
	parser.add_argument('--config', metavar='CONFIG_FILE', help='configuration file')
	parser.add_argument('--version', action='store_true', help='alias for the version command')
	args = parser.parse_args(main_argv)
	if args.version:
		sys.exit(cmd_version(()))
	args.command = argv.pop(0) if argv else None
	if args.command not in COMMANDS:
		parser.print_help(file=sys.stderr)
		if args.command is not None:
			print(file=sys.stderr)
			print('Unknown command "%s"' % (args.command,), file=sys.stderr)
		sys.exit(2)
	config_fn = args.config
	if args.command in ('init', 'intro', 'version',):
		config_fn = False
	cmd = COMMANDS[args.command]
	debug_cmd = getattr(cmd, 'is_debug', False)
	try:
		setup(config_fn, debug_cmd)
		argv.insert(0, '%s %s' % (basename(sys.argv[0]), args.command,))
		return cmd(argv)
	except UserError as e:
		print(e, file=sys.stderr)
		return 1
	except OSError as e:
		if e.errno == errno.EPIPE:
			return 1
		else:
			raise
	except KeyboardInterrupt:
		# Exiting with KeyboardInterrupt causes python to print a traceback.
		# We don't want that, but we do want to exit from SIGINT (so the
		# calling process can know that happened).
		signal.signal(signal.SIGINT, signal.SIG_DFL)
		os.kill(os.getpid(), signal.SIGINT)
		# If that didn't work let's re-raise the KeyboardInterrupt.
		raise
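# --- Editor's illustrative sketch (not part of the original module) ---
# The alias handling in main() above repeatedly expands the first word with
# shlex.split(), stops when an alias expands to a command of the same name or
# when 'noalias' is used, and raises on expansion cycles. The same idea
# reduced to a standalone function (expand_aliases and the alias table are
# hypothetical, and the main_argv/split_args handling is omitted):
import shlex as _demo_shlex

def expand_aliases(argv, aliases):
	used = []
	while argv and argv[0] in aliases:
		alias = argv[0]
		if alias == 'noalias':
			break
		expanded = _demo_shlex.split(aliases[alias])
		argv = expanded + argv[1:]
		if expanded and alias == expanded[0]:
			break  # the alias maps to a real command of the same name
		used.append(alias)
		if alias in used[:-1]:
			raise ValueError('Alias loop: %r' % (used,))
	return argv

if __name__ == '__main__':
	_aliases = {'cat': 'grep -e ""', 'g': 'grep -i'}
	print(expand_aliases(['cat', 'some/ds'], _aliases))       # ['grep', '-e', '', 'some/ds']
	print(expand_aliases(['g', 'foo', 'some/ds'], _aliases))  # ['grep', '-i', 'foo', 'some/ds']
# --- end sketch ---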