def _make_arg_parser():
    """Build the argument parser for the spark-submit front-end.

    This parser never produces help output (``add_help`` is off); it
    exists to validate arguments and to print usage on error.
    """
    arg_parser = ArgumentParser(usage=_USAGE, add_help=False)

    # positionals: an optional script/jar, then everything after it
    arg_parser.add_argument(dest='script_or_jar', nargs='?')
    arg_parser.add_argument(dest='args', nargs='*')

    _add_basic_args(arg_parser)
    _add_runner_alias_arg(arg_parser)
    _add_help_arg(arg_parser)
    _add_deprecated_arg(arg_parser)

    # runner options, minus the ones this tool hard-codes itself
    _add_runner_args(
        arg_parser, opt_names=set(_RUNNER_OPTS) - set(_HARD_CODED_OPTS))

    # spark-submit switches, skipping any that would collide with a
    # runner option (unless the switch is a known alias)
    for opt_name, switch in _SPARK_SUBMIT_SWITCHES.items():
        if opt_name not in _RUNNER_OPTS or switch in _SWITCH_ALIASES:
            _add_spark_submit_arg(arg_parser, opt_name)

    return arg_parser
def _make_arg_parser():
    """Build the argument parser for the report-long-jobs command.

    :return: an :py:class:`~argparse.ArgumentParser` with
             ``--min-hours``, ``-x``/``--exclude``, the basic mrjob
             switches, and the EMR connection options.
    """
    usage = '%(prog)s report-long-jobs [options]'
    description = ('Report jobs running for more than a certain number of'
                   ' hours (by default, %.1f). This can help catch buggy jobs'
                   ' and Hadoop/EMR operational issues.' % DEFAULT_MIN_HOURS)

    arg_parser = ArgumentParser(usage=usage, description=description)
    arg_parser.add_argument(
        '--min-hours', dest='min_hours', type=float,
        default=DEFAULT_MIN_HOURS,
        help=('Minimum number of hours a job can run before we report it.'
              ' Default: %(default)s'))
    arg_parser.add_argument(
        '-x', '--exclude', action='append',
        # fixed typo in help text: "Specifed" -> "Specified"
        help=('Exclude clusters that match the specified tags.'
              ' Specified in the form TAG_KEY,TAG_VALUE.')
    )

    _add_basic_args(arg_parser)
    # only the options needed to connect to EMR
    _add_runner_args(
        arg_parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect')
    )

    _alphabetize_actions(arg_parser)

    return arg_parser
def _print_help_for_runner(runner_class, include_deprecated=False):
    """Print an argparse-formatted listing of *runner_class*'s options.

    Hard-coded options are left out, switches that already appear in
    the basic help are suppressed, and aliases of basic-help switches
    get a one-line cross-reference instead of a full description.
    """
    help_parser = ArgumentParser(usage=SUPPRESS, add_help=False)

    arg_group = help_parser.add_argument_group(
        'optional arguments for %s runner' % runner_class.alias)

    # point aliases at the canonical switch rather than re-describing them
    customize_switches = {}
    for canonical, alias in _SWITCH_ALIASES.items():
        customize_switches[alias] = dict(help='Alias for %s' % canonical)

    _add_runner_args(
        arg_group,
        runner_class.OPT_NAMES - set(_HARD_CODED_OPTS),
        include_deprecated=include_deprecated,
        customize_switches=customize_switches,
        suppress_switches=set(_SPARK_SUBMIT_SWITCHES.values()))

    help_parser.print_help()
def configure_args(self):
    """Define arguments for this script. Called from :py:meth:`__init__()`.

    Re-define to define custom command-line arguments or pass
    through existing ones::

        def configure_args(self):
            super(MRYourJob, self).configure_args()

            self.add_passthru_arg(...)
            self.add_file_arg(...)
            self.pass_arg_through(...)
            ...
    """
    parser = self.arg_parser

    # unless the script path is baked in, expect it as the first
    # positional argument on the command line
    if self._FIRST_ARG_IS_SCRIPT_PATH:
        parser.add_argument(
            dest='script_path', help='path of script to launch')

    parser.add_argument(
        dest='args', nargs='*',
        help=('input paths to read (or stdin if not set). If --spark'
              ' is set, the input and output path for the spark job.'))

    _add_basic_args(parser)
    _add_job_args(parser)
    _add_runner_args(parser)
def _make_arg_parser():
    """Build the argument parser for the s3-tmpwatch command."""
    arg_parser = ArgumentParser(
        usage='%(prog)s s3-tmpwatch [options] TIME_UNTOUCHED URI [URI ...]',
        description=('Delete all files at one or more URIs that are older'
                     ' than a specified time.'))

    arg_parser.add_argument(
        '-t', '--test', dest='test', default=False, action='store_true',
        help="Don't actually delete any files; just log that we would")
    arg_parser.add_argument(
        dest='time_untouched',
        help=('The time threshold for removing files. A number with an'
              ' optional single-character suffix specifying the units:'
              ' m for minutes, h for hours, d for days. If no suffix is'
              ' specified, time is in hours.'))
    arg_parser.add_argument(
        dest='uris', nargs='+',
        help='s3:// URIs specifying where to delete old files')

    _add_basic_args(arg_parser)
    # only need enough runner opts to reach S3
    _add_runner_args(
        arg_parser,
        set(['region', 's3_endpoint']),
    )

    _alphabetize_actions(arg_parser)

    return arg_parser
def _make_arg_parser():
    """Build the argument parser for the S3 tmpwatch tool."""
    arg_parser = ArgumentParser(
        description=('Delete all files at one or more URIs that are older'
                     ' than a specified time.'))

    arg_parser.add_argument(
        '-t', '--test', dest='test', default=False, action='store_true',
        help="Don't actually delete any files; just log that we would")
    arg_parser.add_argument(
        dest='time_untouched',
        help=('The time threshold for removing files. A number with an'
              ' optional single-character suffix specifying the units:'
              ' m for minutes, h for hours, d for days. If no suffix is'
              ' specified, time is in hours.'))
    arg_parser.add_argument(
        dest='uris', nargs='+',
        help='s3:// URIs specifying where to delete old files')

    _add_basic_args(arg_parser)
    # only need enough runner opts to reach S3
    _add_runner_args(
        arg_parser,
        set(['region', 's3_endpoint']),
    )

    return arg_parser
def _make_arg_parser():
    """Build the argument parser for the report-long-jobs command.

    :return: an :py:class:`~argparse.ArgumentParser` with
             ``--min-hours``, ``-x``/``--exclude``, the basic mrjob
             switches, and the EMR connection options.
    """
    usage = '%(prog)s report-long-jobs [options]'
    description = ('Report jobs running for more than a certain number of'
                   ' hours (by default, %.1f). This can help catch buggy jobs'
                   ' and Hadoop/EMR operational issues.' % DEFAULT_MIN_HOURS)

    arg_parser = ArgumentParser(usage=usage, description=description)
    arg_parser.add_argument(
        '--min-hours', dest='min_hours', type=float,
        default=DEFAULT_MIN_HOURS,
        help=('Minimum number of hours a job can run before we report it.'
              ' Default: %(default)s'))
    arg_parser.add_argument(
        '-x', '--exclude', action='append',
        # fixed typo in help text: "Specifed" -> "Specified"
        help=('Exclude clusters that match the specified tags.'
              ' Specified in the form TAG_KEY,TAG_VALUE.')
    )

    _add_basic_args(arg_parser)
    # only the options needed to connect to EMR
    _add_runner_args(
        arg_parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect')
    )

    _alphabetize_actions(arg_parser)

    return arg_parser
def _make_arg_parser():
    """Build the (usage-only) argument parser for spark-submit.

    Help text is handled elsewhere; this parser just validates the
    command line and prints usage on error (``add_help=False``).
    """
    arg_parser = ArgumentParser(usage=_USAGE, add_help=False)

    # positionals: an optional script/jar, then everything after it
    arg_parser.add_argument(dest='script_or_jar', nargs='?')
    arg_parser.add_argument(dest='args', nargs='*')

    _add_basic_args(arg_parser)
    _add_runner_alias_arg(arg_parser)
    _add_help_arg(arg_parser)
    _add_deprecated_arg(arg_parser)

    # runner options, except those this tool hard-codes itself
    _add_runner_args(
        arg_parser, opt_names=set(_RUNNER_OPTS) - set(_HARD_CODED_OPTS))

    # spark-submit switches that don't collide with a runner option
    # (colliding switches are allowed only when they're known aliases)
    for opt_name, switch in _SPARK_SUBMIT_SWITCHES.items():
        if opt_name not in _RUNNER_OPTS or switch in _SWITCH_ALIASES:
            _add_spark_submit_arg(arg_parser, opt_name)

    return arg_parser
def configure_args(self):
    """Define arguments for this script. Called from :py:meth:`__init__()`.

    Re-define to define custom command-line arguments or pass
    through existing ones::

        def configure_args(self):
            super(MRYourJob, self).configure_args()

            self.add_passthru_arg(...)
            self.add_file_arg(...)
            self.pass_arg_through(...)
            ...
    """
    parser = self.arg_parser

    # unless the script path is baked in, expect it as the first
    # positional argument on the command line
    if self._FIRST_ARG_IS_SCRIPT_PATH:
        parser.add_argument(
            dest='script_path', help='path of script to launch')

    parser.add_argument(
        dest='args', nargs='*',
        help=('input paths to read (or stdin if not set). If --spark'
              ' is set, the input and output path for the spark job.'))

    _add_basic_args(parser)
    _add_job_args(parser)
    _add_runner_args(parser)
def _make_arg_parser():
    """Build the argument parser for the idle-cluster terminator.

    Bug fix: the three numeric options used ``type='float'`` (a string,
    an optparse leftover); argparse requires a *callable*, so parsing
    any of these switches raised an error. Changed to ``type=float``.
    """
    usage = '%(prog)s [options]'
    description = ('Terminate idle EMR clusters that meet the criteria'
                   ' passed in on the command line (or, by default,'
                   ' clusters that have been idle for one hour).')

    arg_parser = ArgumentParser(usage=usage, description=description)

    arg_parser.add_argument(
        '--max-hours-idle', dest='max_hours_idle',
        default=None, type=float,
        help=('Max number of hours a cluster can go without bootstrapping,'
              ' running a step, or having a new step created. This will fire'
              ' even if there are pending steps which EMR has failed to'
              ' start. Make sure you set this higher than the amount of time'
              ' your jobs can take to start instances and bootstrap.'))
    arg_parser.add_argument(
        '--max-mins-locked', dest='max_mins_locked',
        default=_DEFAULT_MAX_MINUTES_LOCKED, type=float,
        help='Max number of minutes a cluster can be locked while idle.')
    arg_parser.add_argument(
        '--mins-to-end-of-hour', dest='mins_to_end_of_hour',
        default=None, type=float,
        help=('Terminate clusters that are within this many minutes of'
              ' the end of a full hour since the job started running'
              ' AND have no pending steps.'))
    arg_parser.add_argument(
        '--unpooled-only', dest='unpooled_only', action='store_true',
        default=False,
        help='Only terminate un-pooled clusters')
    arg_parser.add_argument(
        '--pooled-only', dest='pooled_only', action='store_true',
        default=False,
        help='Only terminate pooled clusters')
    arg_parser.add_argument(
        '--pool-name', dest='pool_name', default=None,
        help='Only terminate clusters in the given named pool.')
    arg_parser.add_argument(
        '--dry-run', dest='dry_run', default=False,
        action='store_true',
        help="Don't actually kill idle jobs; just log that we would")

    _add_basic_args(arg_parser)
    # only the options needed to connect to EMR
    _add_runner_args(
        arg_parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect'))

    return arg_parser
def _make_arg_parser():
    """Build the argument parser for the idle-cluster terminator."""
    parser = ArgumentParser(
        usage='%(prog)s [options]',
        description=('Terminate idle EMR clusters that meet the criteria'
                     ' passed in on the command line (or, by default,'
                     ' clusters that have been idle for one hour).'))

    # deprecated switch, kept so old crontabs don't break
    parser.add_argument(
        '--max-hours-idle', dest='max_hours_idle',
        default=None, type=float,
        help=('Please use --max-mins-idle instead.'))
    parser.add_argument(
        '--max-mins-idle', dest='max_mins_idle',
        default=None, type=float,
        help=('Max number of minutes a cluster can go without bootstrapping,'
              ' running a step, or having a new step created. This will fire'
              ' even if there are pending steps which EMR has failed to'
              ' start. Make sure you set this higher than the amount of time'
              ' your jobs can take to start instances and bootstrap.'))
    parser.add_argument(
        '--max-mins-locked', dest='max_mins_locked',
        default=_DEFAULT_MAX_MINUTES_LOCKED, type=float,
        help='Max number of minutes a cluster can be locked while idle.')
    # deprecated no-op, kept for backwards compatibility
    parser.add_argument(
        '--mins-to-end-of-hour', dest='mins_to_end_of_hour',
        default=None, type=float,
        help=('Deprecated, does nothing.'))
    parser.add_argument(
        '--unpooled-only', dest='unpooled_only',
        action='store_true', default=False,
        help='Only terminate un-pooled clusters')
    parser.add_argument(
        '--pooled-only', dest='pooled_only',
        action='store_true', default=False,
        help='Only terminate pooled clusters')
    parser.add_argument(
        '--pool-name', dest='pool_name', default=None,
        help='Only terminate clusters in the given named pool.')
    parser.add_argument(
        '--dry-run', dest='dry_run', default=False,
        action='store_true',
        help="Don't actually kill idle jobs; just log that we would")

    _add_basic_args(parser)
    # only need the options that let us connect to EMR
    _add_runner_args(
        parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect'))

    _alphabetize_actions(parser)

    return parser
def _make_arg_parser():
    """Build the argument parser for the create-cluster subcommand."""
    arg_parser = ArgumentParser(
        usage='%(prog)s create-cluster [options]',
        description=('Create a persistent EMR cluster to run jobs in, and'
                     ' print its ID to stdout.'))

    _add_basic_args(arg_parser)
    # need both connection and cluster-launch options
    _add_runner_args(
        arg_parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect', 'launch'))

    _alphabetize_actions(arg_parser)

    return arg_parser
def _make_arg_parser():
    """Build the argument parser for ``mrjob create-cluster``."""
    parser = ArgumentParser(
        usage='%(prog)s create-cluster [options]',
        description=('Create a persistent EMR cluster to run jobs in, and'
                     ' print its ID to stdout.'))

    _add_basic_args(parser)
    # need both connection and cluster-launch options
    _add_runner_args(
        parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect', 'launch'))

    _alphabetize_actions(parser)

    return parser
def _make_arg_parser():
    """Build the argument parser for the create-cluster tool."""
    arg_parser = ArgumentParser(
        usage='%(prog)s [options]',
        description=('Create a persistent EMR cluster to run jobs in, and'
                     ' print its ID to'
                     ' stdout. WARNING: Do not run'
                     ' this without mrjob terminate-idle-clusters in your'
                     ' crontab; clusters left idle can quickly become'
                     ' expensive!'))

    _add_basic_args(arg_parser)
    # need both connection and cluster-launch options
    _add_runner_args(
        arg_parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect', 'launch'))

    return arg_parser
def _make_arg_parser():
    """Build the argument parser for the create-cluster tool."""
    parser = ArgumentParser(
        usage='%(prog)s [options]',
        description=('Create a persistent EMR cluster to run jobs in, and'
                     ' print its ID to'
                     ' stdout. WARNING: Do not run'
                     ' this without mrjob terminate-idle-clusters in your'
                     ' crontab; clusters left idle can quickly become'
                     ' expensive!'))

    _add_basic_args(parser)
    # need both connection and cluster-launch options
    _add_runner_args(
        parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect', 'launch'))

    _alphabetize_actions(parser)

    return parser
def _make_arg_parser():
    """Build the argument parser for ``mrjob terminate-idle-clusters``."""
    parser = ArgumentParser(
        usage='%(prog)s terminate-idle-clusters [options]',
        description=('Terminate idle EMR clusters that meet the criteria'
                     ' passed in on the command line (or, by default,'
                     ' clusters that have been idle for one hour).'))

    parser.add_argument(
        '--max-mins-idle', dest='max_mins_idle',
        default=None, type=float,
        help=('Max number of minutes a cluster can go without bootstrapping,'
              ' running a step, or having a new step created. This will fire'
              ' even if there are pending steps which EMR has failed to'
              ' start. Make sure you set this higher than the amount of time'
              ' your jobs can take to start instances and bootstrap.'))
    # deprecated no-op, kept so existing crontabs don't break
    parser.add_argument(
        '--max-mins-locked', dest='max_mins_locked',
        type=float,
        help='Deprecated, does nothing')
    parser.add_argument(
        '--unpooled-only', dest='unpooled_only',
        action='store_true', default=False,
        help='Only terminate un-pooled clusters')
    parser.add_argument(
        '--pooled-only', dest='pooled_only',
        action='store_true', default=False,
        help='Only terminate pooled clusters')
    parser.add_argument(
        '--pool-name', dest='pool_name', default=None,
        help='Only terminate clusters in the given named pool.')
    parser.add_argument(
        '--dry-run', dest='dry_run', default=False,
        action='store_true',
        help="Don't actually kill idle jobs; just log that we would")

    _add_basic_args(parser)
    # only need the options that let us connect to EMR
    _add_runner_args(
        parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect'))

    _alphabetize_actions(parser)

    return parser
def _make_arg_parser():
    """Build the argument parser for the EMR usage audit tool."""
    arg_parser = ArgumentParser(
        usage='%(prog)s [options]',
        description='Print a giant report on EMR usage.')

    arg_parser.add_argument(
        '--max-days-ago', dest='max_days_ago',
        type=float, default=None,
        help=('Max number of days ago to look at jobs. By default, we go back'
              ' as far as EMR supports (currently about 2 months)'))

    _add_basic_args(arg_parser)
    # only need the options that let us connect to EMR
    _add_runner_args(
        arg_parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect'))

    return arg_parser
def _make_arg_parser():
    """Build the argument parser for ``mrjob audit-emr-usage``."""
    parser = ArgumentParser(
        usage='%(prog)s audit-emr-usage [options]',
        description='Print a giant report on EMR usage.')

    parser.add_argument(
        '--max-days-ago', dest='max_days_ago',
        type=float, default=None,
        help=('Max number of days ago to look at jobs. By default, we go back'
              ' as far as EMR supports (currently about 2 months)'))

    _add_basic_args(parser)
    # only need the options that let us connect to EMR
    _add_runner_args(
        parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect'))

    _alphabetize_actions(parser)

    return parser
def _make_arg_parser():
    """Build the argument parser for the diagnose tool."""
    parser = ArgumentParser(
        usage='%(prog)s diagnose [opts] [--step-id STEP_ID] CLUSTER_ID',
        description=('Get probable cause of failure for step on CLUSTER_ID.'
                     ' By default we look at the last failed step'))

    _add_basic_args(parser)
    # only need the options that let us connect to EMR
    _add_runner_args(
        parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect'))

    parser.add_argument(
        dest='cluster_id',
        help='ID of cluster with failed step')
    parser.add_argument(
        '--step-id', dest='step_id',
        help='ID of a particular failed step to diagnose')

    _alphabetize_actions(parser)

    return parser
def _make_arg_parser():
    """Build the argument parser for the terminate-cluster tool.

    Fixed the ``-t``/``--test`` help text, which was copy-pasted from
    the S3 tmpwatch tool and talked about deleting files; this tool
    terminates a cluster.
    """
    description = 'Terminate an existing EMR cluster.'

    arg_parser = ArgumentParser(description=description)

    arg_parser.add_argument(
        '-t', '--test', dest='test', default=False, action='store_true',
        help="Don't actually terminate the cluster; just log that we would")
    arg_parser.add_argument(
        dest='cluster_id',
        help='ID of cluster to terminate')

    _add_basic_args(arg_parser)
    # only the options needed to connect to EMR
    _add_runner_args(
        arg_parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect'))

    return arg_parser
def main(cl_args=None):
    """Entry point: run a shell command on every node of an EMR cluster.

    Parses ``cl_args`` (or ``sys.argv``), connects to the cluster, and
    stores per-node stdout/stderr under the output directory.
    """
    arg_parser = ArgumentParser(
        usage='usage: %(prog)s CLUSTER_ID [options] "command string"',
        description=('Run a command on the master and all worker nodes of an'
                     ' EMR cluster. Store stdout/stderr for results in'
                     ' OUTPUT_DIR.'))

    arg_parser.add_argument(
        '-o', '--output-dir', dest='output_dir', default=None,
        help="Specify an output directory (default: CLUSTER_ID)")
    arg_parser.add_argument(
        dest='cluster_id',
        help='ID of cluster to run command on')
    arg_parser.add_argument(
        dest='cmd_string',
        help='command to run, as a single string')

    _add_basic_args(arg_parser)
    # SSH options on top of the usual EMR connection options
    _add_runner_args(
        arg_parser,
        {'ec2_key_pair_file', 'ssh_bin'} | _filter_by_role(
            EMRJobRunner.OPT_NAMES, 'connect')
    )

    _alphabetize_actions(arg_parser)

    options = arg_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # everything except our own switches becomes a runner kwarg
    non_runner_args = (
        'cluster_id', 'cmd_string', 'output_dir', 'quiet', 'verbose')
    runner_kwargs = {
        k: v for k, v in options.__dict__.items()
        if k not in non_runner_args}

    cmd_args = shlex_split(options.cmd_string)
    output_dir = os.path.abspath(options.output_dir or options.cluster_id)

    with EMRJobRunner(
            cluster_id=options.cluster_id, **runner_kwargs) as runner:
        _run_on_all_nodes(runner, output_dir, cmd_args)
def _make_arg_parser():
    """Build the argument parser for ``mrjob diagnose``."""
    parser = ArgumentParser(
        usage='%(prog)s diagnose [opts] [--step-id STEP_ID] CLUSTER_ID',
        description=('Get probable cause of failure for step on CLUSTER_ID.'
                     ' By default we look at the last failed step'))

    _add_basic_args(parser)
    # only need the options that let us connect to EMR
    _add_runner_args(
        parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect'))

    parser.add_argument(
        dest='cluster_id',
        help='ID of cluster with failed step')
    parser.add_argument(
        '--step-id', dest='step_id',
        help='ID of a particular failed step to diagnose')

    _alphabetize_actions(parser)

    return parser
def _make_arg_parser():
    """Build the argument parser for ``mrjob terminate-cluster``.

    Fixed the ``-t``/``--test`` help text, which was copy-pasted from
    the S3 tmpwatch tool and talked about deleting files; this tool
    terminates a cluster.
    """
    description = 'Terminate an existing EMR cluster.'

    arg_parser = ArgumentParser(description=description)

    arg_parser.add_argument(
        '-t', '--test', dest='test', default=False, action='store_true',
        help="Don't actually terminate the cluster; just log that we would")
    arg_parser.add_argument(
        dest='cluster_id',
        help='ID of cluster to terminate')

    _add_basic_args(arg_parser)
    # only the options needed to connect to EMR
    _add_runner_args(
        arg_parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect'))

    _alphabetize_actions(arg_parser)

    return arg_parser
def _print_help_for_runner(runner_class, include_deprecated=False):
    """Print the option listing for a single runner class.

    Skips hard-coded options, suppresses switches the basic help
    already covers, and describes aliases of basic-help switches with
    a short cross-reference.
    """
    help_parser = ArgumentParser(usage=SUPPRESS, add_help=False)

    arg_group = help_parser.add_argument_group(
        'optional arguments for %s runner' % runner_class.alias)

    # point aliases at the canonical switch rather than re-describing them
    customize_switches = {}
    for canonical, alias in _SWITCH_ALIASES.items():
        customize_switches[alias] = dict(help='Alias for %s' % canonical)

    _add_runner_args(
        arg_group,
        runner_class.OPT_NAMES - set(_HARD_CODED_OPTS),
        include_deprecated=include_deprecated,
        customize_switches=customize_switches,
        suppress_switches=set(_SPARK_SUBMIT_SWITCHES.values()))

    help_parser.print_help()