def get_possible_actions( self, parser: argparse.ArgumentParser, arg_strings: List[str] ) -> Tuple[List[argparse.Action], List[argparse.Action]]: possible = [] if self._comp_data["comprep_prefix"] != "": arg_strings[:] = arg_strings[:-1] positionals = self.get_positional_actions(parser) arg_pat, opt_idx = self.get_args_pattern(arg_strings) if len(opt_idx) == 0: last_opt_idx = -1 else: last_opt_idx = max(opt_idx) cursor = 0 arg_end = len(arg_pat) # consume all optional actions opts = { o.dest: o for o in parser._get_optional_actions() if o.help != argparse.SUPPRESS } if "init_completion" in opts: opts.pop("init_completion") while cursor <= last_opt_idx: next_opt_idx = min([i for i in opt_idx if cursor <= i]) if cursor == next_opt_idx: opt = opt_idx[next_opt_idx][0] opt_pat = self._get_nargs_pattern(opt) match = re.match(opt_pat, arg_pat[cursor + 1:]) opts.pop(opt.dest) if match is None: if next_opt_idx == last_opt_idx: possible.append(opt) return possible, [] else: raise ValueError cursor += match.end() + 1 if cursor == arg_end and opt.nargs in ["*", "+"]: possible.append(opt) else: for i in range(len(positionals), 0, -1): pos_pat = "".join( self._get_nargs_pattern(x) for x in positionals[:i]) match = re.match(pos_pat, arg_pat[cursor:]) pos_action = positionals[i - 1] if match is None: continue else: if isinstance(pos_action, _SubParsersAction): sub_span = match.span(i) subcommand = arg_strings[cursor + sub_span[0]] if subcommand in pos_action.choices: return self.get_possible_actions( pos_action.choices[subcommand], arg_strings[cursor + sub_span[0] + 1:]) else: raise ValueError cursor += match.end() positionals[:] = positionals[i:] break possible_opts = [o for _, o in opts.items()] # consume rest of postional actions for i in range(1, len(positionals) + 1): pos_pat = "".join( self._get_nargs_pattern(x) for x in positionals[:i]) match = re.match(pos_pat, arg_pat[cursor:]) pos_action = positionals[i - 1] if match is None: possible.append(pos_action) break elif cursor + match.end() == arg_end: min_nargs = self.get_min_nargs(positionals[:i]) if pos_action.nargs in ["*", "+"]: possible.append(pos_action) # if match.end() - match.start() < min_nargs: if min_nargs < match.end() - match.start(): possible.append(pos_action) if isinstance(pos_action, _SubParsersAction): sub_span = match.span(i) subcommand = arg_strings[cursor + sub_span[0]] if subcommand in pos_action.choices: return self.get_possible_actions( pos_action.choices[subcommand], arg_strings[cursor + sub_span[0] + 1:]) return possible, possible_opts
class MRJobLauncher(object): """Handle running a MapReduce job on an executable from the command line. This class will eventually support running arbitrary executables; for now it only supports :py:class:`~mrjob.job.MRJob` subclasses. Up to v0.5 it is effectively part of the :py:class:`~mrjob.job.MRJob` class itself and should not be used externally in any way. """ # only MRJobLauncher expects the first argument to be script_path _FIRST_ARG_IS_SCRIPT_PATH = True def __init__(self, script_path=None, args=None, from_cl=False): """ :param script_path: Path to script unless it's the first item of *args* :param args: Command line arguments :param from_cl: If not using sys.argv but still comming from the command line (as opposed to a script, e.g. from mrjob.cmd), don't override the option parser error function (exit instead of throwing ValueError). """ if script_path is not None: script_path = os.path.abspath(script_path) self._script_path = script_path # make sure we respect the $TZ (time zone) environment variable if hasattr(time, 'tzset'): time.tzset() # argument dests for args to pass through self._passthru_arg_dests = set() self._file_arg_dests = set() # there is no equivalent in argparse # remove this in v0.7.0 if hasattr(self, 'OPTION_CLASS'): log.warning('OPTION_CLASS attribute is ignored; ' 'mrjob now uses argparse instead of optparse') self.arg_parser = ArgumentParser(usage=self._usage(), add_help=False) self.configure_args() if (_im_func(self.configure_options) != _im_func( MRJobLauncher.configure_options)): log.warning('configure_options() is deprecated and will be' ' removed in v0.7.0; please use configure_args()' ' instead.') self.configure_options() # don't pass None to parse_args unless we're actually running # the MRJob script if args is _READ_ARGS_FROM_SYS_ARGV: self._cl_args = sys.argv[1:] else: # don't pass sys.argv to self.arg_parser, and have it # raise an exception on error rather than printing to stderr # and exiting. self._cl_args = args or [] def error(msg): raise ValueError(msg) if not from_cl: self.arg_parser.error = error self.load_args(self._cl_args) if (_im_func(self.load_options) != _im_func( MRJobLauncher.load_options)): log.warning('load_options() is deprecated and will be' ' removed in v0.7.0; please use load_args()' ' instead.') self.load_options(self._cl_args) # Make it possible to redirect stdin, stdout, and stderr, for testing # See sandbox(), below. # # These should always read/write bytes, not unicode. Generally, # on Python 2, sys.std* can read and write bytes, whereas on Python 3, # you need to use sys.std*.buffer (which doesn't exist on Python 2). # # However, certain Python 3 environments, such as Jupyter notebook, # act more like Python 2. See #1441. self.stdin = getattr(sys.stdin, 'buffer', sys.stdin) self.stdout = getattr(sys.stdout, 'buffer', sys.stdout) self.stderr = getattr(sys.stderr, 'buffer', sys.stderr) @classmethod def _usage(cls): """Command line usage string for this class""" return ("usage: mrjob run [script path|executable path|--help]" " [options]") def _print_help(self, options): """Print help for this job. This will either print runner or basic help. Override to allow other kinds of help.""" if options.runner: _print_help_for_runner(self._runner_opt_names(), options.deprecated) else: _print_basic_help(self.arg_parser, self._usage(), options.deprecated) @classmethod def run(cls, args=_READ_ARGS_FROM_SYS_ARGV): """Entry point for running job from the command-line. This is also the entry point when a mapper or reducer is run by Hadoop Streaming. Does one of: * Print step information (:option:`--steps`). See :py:meth:`show_steps` * Run a mapper (:option:`--mapper`). See :py:meth:`run_mapper` * Run a combiner (:option:`--combiner`). See :py:meth:`run_combiner` * Run a reducer (:option:`--reducer`). See :py:meth:`run_reducer` * Run the entire job. See :py:meth:`run_job` """ # load options from the command line launcher = cls(args=args) launcher.run_job() def execute(self): # Launcher only runs jobs, doesn't do any Hadoop Streaming stuff self.run_job() def make_runner(self): """Make a runner based on command-line arguments, so we can launch this job on EMR, on Hadoop, or locally. :rtype: :py:class:`mrjob.runner.MRJobRunner` """ return self._runner_class()(**self._runner_kwargs()) @classmethod def set_up_logging(cls, quiet=False, verbose=False, stream=None): """Set up logging when running from the command line. This is also used by the various command-line utilities. :param bool quiet: If true, don't log. Overrides *verbose*. :param bool verbose: If true, set log level to ``DEBUG`` (default is ``INFO``) :param bool stream: Stream to log to (default is ``sys.stderr``) This will also set up a null log handler for boto3, so we don't get warnings if boto3 tries to log about throttling and whatnot. """ if quiet: log_to_null(name='mrjob') log_to_null(name='__main__') else: log_to_stream(name='mrjob', debug=verbose, stream=stream) log_to_stream(name='__main__', debug=verbose, stream=stream) def run_job(self): """Run the all steps of the job, logging errors (and debugging output if :option:`--verbose` is specified) to STDERR and streaming the output to STDOUT. Called from :py:meth:`run`. You'd probably only want to call this directly from automated tests. """ # self.stderr is strictly binary, need to wrap it so it's possible # to log to it in Python 3 log_stream = codecs.getwriter('utf_8')(self.stderr) self.set_up_logging(quiet=self.options.quiet, verbose=self.options.verbose, stream=log_stream) with self.make_runner() as runner: try: runner.run() except StepFailedException as e: # no need for a runner stacktrace if step failed; runners will # log more useful information anyway log.error(str(e)) sys.exit(1) if not self.options.no_output: for chunk in runner.cat_output(): self.stdout.write(chunk) self.stdout.flush() ### Command-line arguments ### def configure_args(self): """Define arguments for this script. Called from :py:meth:`__init__()`. Re-define to define custom command-line arguments or pass through existing ones:: def configure_args(self): super(MRYourJob, self).configure_args() self.add_passthru_arg(...) self.add_file_arg(...) self.pass_arg_through(...) ... """ self.arg_parser.add_argument('-h', '--help', dest='help', action='store_true', help='show this message and exit') self.arg_parser.add_argument( '--deprecated', dest='deprecated', action='store_true', help='include help for deprecated options') # if script path isn't set, expect it on the command line if self._FIRST_ARG_IS_SCRIPT_PATH: self.arg_parser.add_argument(dest='script_path', help='path of script to launch') self.arg_parser.add_argument( dest='args', nargs='*', help=('input paths to read (or stdin if not set). If --spark' ' is set, the input and output path for the spark job.')) _add_basic_args(self.arg_parser) _add_job_args(self.arg_parser) _add_runner_args(self.arg_parser) def load_args(self, args): """Load command-line options into ``self.options`` and ``self._script_path``. Called from :py:meth:`__init__()` after :py:meth:`configure_args`. :type args: list of str :param args: a list of command line arguments. ``None`` will be treated the same as ``[]``. Re-define if you want to post-process command-line arguments:: def load_args(self, args): super(MRYourJob, self).load_args(args) self.stop_words = self.options.stop_words.split(',') ... """ self.options = self.arg_parser.parse_args(args) if self.options.help: self._print_help(self.options) sys.exit(0) if self._FIRST_ARG_IS_SCRIPT_PATH: # should always be set, just hedging self._script_path = self.options.script_path def add_file_arg(self, *args, **kwargs): """Add a command-line option that sends an external file (e.g. a SQLite DB) to Hadoop:: def configure_options(self): super(MRYourJob, self).configure_options() self.add_file_option('--scoring-db', help=...) This does the right thing: the file will be uploaded to the working dir of the script on Hadoop, and the script will be passed the same option, but with the local name of the file in the script's working directory. We suggest against sending Berkeley DBs to your job, as Berkeley DB is not forwards-compatible (so a Berkeley DB that you construct on your computer may not be readable from within Hadoop). Use SQLite databases instead. If all you need is an on-disk hash table, try out the :py:mod:`sqlite3dbm` module. """ if kwargs.get('type') not in (None, 'string'): raise ArgumentError('file options must take strings') if kwargs.get('action') not in (None, 'append', 'store'): raise ArgumentError( "file options must use the actions 'store' or 'append'") pass_opt = self.arg_parser.add_argument(*args, **kwargs) self._file_arg_dests.add(pass_opt.dest) def add_passthru_arg(self, *args, **kwargs): """Function to create options which both the job runner and the job itself respect (we use this for protocols, for example). Use it like you would use :py:func:`argparse.ArgumentParser.add_argument`:: def configure_args(self): super(MRYourJob, self).configure_args() self.add_passthru_arg( '--max-ngram-size', type='int', default=4, help='...') If you want to pass files through to the mapper/reducer, use :py:meth:`add_file_arg` instead. If you want to pass through a built-in option (e.g. ``--runner``, use :py:meth:`pass_arg_through` instead. """ pass_opt = self.arg_parser.add_argument(*args, **kwargs) self._passthru_arg_dests.add(pass_opt.dest) def pass_arg_through(self, opt_str): """Pass the given argument through to the job.""" # _get_optional_actions() is hidden but the interface appears # to be stable, and theres no non-hidden interface for action in self.arg_parser._get_optional_actions(): if opt_str in action.option_strings or opt_str == action.dest: self._passthru_arg_dests.add(action.dest) break else: raise ValueError('unknown arg: %s', opt_str) def is_task(self): """True if this is a mapper, combiner, or reducer. This is mostly useful inside :py:meth:`load_args`, to disable loading args when we aren't running inside Hadoop Streaming. """ return False ### old optparse shims ### @property def args(self): class_name = self.__class__.__name__ log.warning( '%s.args is a deprecated alias for %s.options.args, and will' ' be removed in v0.7.0' % (class_name, class_name)) return self.options def configure_options(self): """.. deprecated:: 0.6.0 Use `:py:meth:`configure_args` instead. """ pass # deprecation warning is in __init__() def load_options(self, args): """.. deprecated:: 0.6.0 Use `:py:meth:`load_args` instead. """ pass # deprecation warning is in __init__() def add_file_option(self, *args, **kwargs): """.. deprecated:: 0.6.0 Like :py:meth:`add_file_arg` except that it emulates the old :py:mod:`optparse` interface (which is almost identical). """ log.warning('add_file_option() is deprecated and will be removed in' ' v0.7.0. Use add_file_arg() instead.') self.add_file_arg(*args, **_optparse_kwargs_to_argparse(**kwargs)) def add_passthrough_option(self, *args, **kwargs): """.. deprecated:: 0.6.0 Like :py:meth:`add_passthru_arg` except that it emulates the old :py:mod:`optparse` interface (which is almost identical). """ log.warning( 'add_passthrough_option() is deprecated and will be removed in' ' v0.7.0. Use add_passthru_arg() instead.') self.add_passthru_arg(*args, **_optparse_kwargs_to_argparse(**kwargs)) def pass_through_option(self, opt_str): """.. deprecated:: 0.6.0 Like :py:meth:`pass_arg_througj` except that it emulates the old :py:mod:`optparse` interface (which is almost identical). """ log.warning( 'pass_through_option() is deprecated and will be removed in' ' v0.7.0. Use pass_arg_through() instead.') self.pass_arg_through(opt_str) ### runners ### def _runner_class(self): """Runner class, as indicated by ``--runner``. This uses conditional imports to avoid importing runner modules that we don't need (and may not have libraries for). Defaults to ``'local'`` and disallows use of inline runner. """ if self.options.runner == 'dataproc': from mrjob.dataproc import DataprocJobRunner return DataprocJobRunner elif self.options.runner == 'emr': from mrjob.emr import EMRJobRunner return EMRJobRunner elif self.options.runner == 'hadoop': from mrjob.hadoop import HadoopJobRunner return HadoopJobRunner elif self.options.runner == 'inline': raise ValueError("inline is not supported in the multi-lingual" " launcher.") else: # run locally by default from mrjob.local import LocalMRJobRunner return LocalMRJobRunner def _runner_kwargs(self): return combine_dicts( self._non_option_kwargs(), self._kwargs_from_switches(self._runner_opt_names()), self._job_kwargs(), ) def _runner_opt_names(self): return self._runner_class().OPT_NAMES def _non_option_kwargs(self): """Keyword arguments to runner constructor that can't be set in mrjob.conf. These should match the (named) arguments to :py:meth:`~mrjob.runner.MRJobRunner.__init__`. """ # build extra_args and file_upload_args # # TODO: deprecate file_upload_args, represent paths to upload # as dictionaries in extra_args raw_args = _parse_raw_args(self.arg_parser, self._cl_args) extra_args = [] file_upload_args = [] for dest, option_string, args in raw_args: if dest in self._passthru_arg_dests: # special case for --hadoop-arg=-verbose etc. if (option_string and len(args) == 1 and args[0].startswith('-')): extra_args.append('%s=%s' % (option_string, args[0])) else: if option_string: extra_args.append(option_string) extra_args.extend(args) elif dest in self._file_arg_dests: file_upload_args.append((option_string, args[0])) return dict( conf_paths=self.options.conf_paths, extra_args=extra_args, file_upload_args=file_upload_args, hadoop_input_format=self.hadoop_input_format(), hadoop_output_format=self.hadoop_output_format(), input_paths=self.options.args, mr_job_script=self._script_path, output_dir=self.options.output_dir, partitioner=self.partitioner(), stdin=self.stdin, step_output_dir=self.options.step_output_dir, ) def _kwargs_from_switches(self, keys): return dict((key, getattr(self.options, key)) for key in keys if hasattr(self.options, key)) def _job_kwargs(self): """Keyword arguments to the runner class that can be specified by the job/launcher itself.""" return dict( jobconf=self.jobconf(), libjars=self.libjars(), partitioner=self.partitioner(), sort_values=self.sort_values(), ) ### Hooks for options defined by the job ### def hadoop_input_format(self): """See :py:meth:`mrjob.job.MRJob.hadoop_input_format`.""" return None def hadoop_output_format(self): """See :py:meth:`mrjob.job.MRJob.hadoop_output_format`.""" return None def jobconf(self): """See :py:meth:`mrjob.job.MRJob.jobconf`.""" return {} def libjars(self): """See :py:meth:`mrjob.job.MRJob.libjars`.""" return [] def partitioner(self): """See :py:meth:`mrjob.job.MRJob.partitioner`.""" return None def sort_values(self): """See :py:meth:`mrjob.job.MRJob.sort_values`.""" return None ### Testing ### def sandbox(self, stdin=None, stdout=None, stderr=None): """Redirect stdin, stdout, and stderr for automated testing. You can set stdin, stdout, and stderr to file objects. By default, they'll be set to empty ``BytesIO`` objects. You can then access the job's file handles through ``self.stdin``, ``self.stdout``, and ``self.stderr``. See :ref:`testing` for more information about testing. You may call sandbox multiple times (this will essentially clear the file handles). ``stdin`` is empty by default. You can set it to anything that yields lines:: mr_job.sandbox(stdin=BytesIO(b'some_data\\n')) or, equivalently:: mr_job.sandbox(stdin=[b'some_data\\n']) For convenience, this sandbox() returns self, so you can do:: mr_job = MRJobClassToTest().sandbox() Simple testing example:: mr_job = MRYourJob.sandbox() self.assertEqual(list(mr_job.reducer('foo', ['a', 'b'])), [...]) More complex testing example:: from BytesIO import BytesIO from mrjob.parse import parse_mr_job_stderr from mrjob.protocol import JSONProtocol mr_job = MRYourJob(args=[...]) fake_input = '"foo"\\t"bar"\\n"foo"\\t"baz"\\n' mr_job.sandbox(stdin=BytesIO(fake_input)) mr_job.run_reducer(link_num=0) self.assertEqual(mrjob.stdout.getvalue(), ...) self.assertEqual(parse_mr_job_stderr(mr_job.stderr), ...) """ self.stdin = stdin or BytesIO() self.stdout = stdout or BytesIO() self.stderr = stderr or BytesIO() return self