Пример #1
0
def _make_arg_parser():
    """Build the command-line parser for the ``report-long-jobs`` tool.

    Defines ``--min-hours`` (a float defaulting to ``DEFAULT_MIN_HOURS``)
    and a repeatable ``-x``/``--exclude`` switch, then adds the basic args
    and the runner opts that ``_filter_by_role()`` selects for the
    ``'connect'`` role. The parser is passed through
    ``_alphabetize_actions()`` before being returned.

    :return: the configured :py:class:`ArgumentParser`
    """
    usage = '%(prog)s report-long-jobs [options]'
    description = ('Report jobs running for more than a certain number of'
                   ' hours (by default, %.1f). This can help catch buggy jobs'
                   ' and Hadoop/EMR operational issues.' % DEFAULT_MIN_HOURS)

    arg_parser = ArgumentParser(usage=usage, description=description)

    arg_parser.add_argument(
        '--min-hours', dest='min_hours', type=float,
        default=DEFAULT_MIN_HOURS,
        help=('Minimum number of hours a job can run before we report it.'
              ' Default: %(default)s'))

    # action='append' collects one list entry per -x/--exclude occurrence
    arg_parser.add_argument(
        '-x', '--exclude', action='append',
        help=('Exclude clusters that match the specified tags.'
              ' Specified in the form TAG_KEY,TAG_VALUE.')
    )

    _add_basic_args(arg_parser)
    _add_runner_args(
        arg_parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect')
    )

    _alphabetize_actions(arg_parser)

    return arg_parser
Пример #2
0
def _make_basic_help_parser(include_deprecated=False):
    """Build the parser whose only job is to print the basic help message.

    The output is modeled closely on spark-submit's own help. Runner args
    appear only when they double as spark-submit args (e.g. --py-files).

    :param include_deprecated: if true, also call ``_add_deprecated_arg()``
                               on the parser
    :return: the help-only :py:class:`ArgumentParser`
    """
    parser = ArgumentParser(
        usage=_USAGE,
        description=_DESCRIPTION,
        epilog=_BASIC_HELP_EPILOG,
        add_help=False,
    )

    _add_runner_alias_arg(parser)

    for group_desc, opt_names in _SPARK_SUBMIT_ARG_GROUPS:
        ungrouped = group_desc is None

        # switches without a group description go directly on the parser
        if ungrouped:
            target = parser
        else:
            target = parser.add_argument_group(group_desc)

        for opt_name in opt_names:
            _add_spark_submit_arg(target, opt_name)

        if not ungrouped:
            continue

        # the generic switches follow the ungrouped spark-submit switches
        _add_basic_args(parser)
        _add_help_arg(parser)
        if include_deprecated:
            _add_deprecated_arg(parser)

    return parser
Пример #3
0
def _make_arg_parser():
    """Build the parser that actually parses the command line.

    This parser is never used to print help messages, but its usage
    string is shown when parsing fails.

    :return: the configured :py:class:`ArgumentParser`
    """
    arg_parser = ArgumentParser(usage=_USAGE, add_help=False)

    # positionals: an optional script/jar, then any number of arguments
    arg_parser.add_argument(dest='script_or_jar', nargs='?')
    arg_parser.add_argument(dest='args', nargs='*')

    _add_basic_args(arg_parser)
    _add_runner_alias_arg(arg_parser)
    _add_help_arg(arg_parser)
    _add_deprecated_arg(arg_parser)

    # runner opts, minus the hard-coded ones
    _add_runner_args(
        arg_parser,
        opt_names=set(_RUNNER_OPTS).difference(_HARD_CODED_OPTS))

    # spark-submit opts: a name that is also a runner opt is only added
    # here when its switch is listed in _SWITCH_ALIASES
    for opt_name, switch in _SPARK_SUBMIT_SWITCHES.items():
        if opt_name not in _RUNNER_OPTS or switch in _SWITCH_ALIASES:
            _add_spark_submit_arg(arg_parser, opt_name)

    return arg_parser
Пример #4
0
def _make_arg_parser():
    """Create the argument parser for ``report-long-jobs``.

    Switches defined here:

    * ``--min-hours``: float, defaults to ``DEFAULT_MIN_HOURS``
    * ``-x``/``--exclude``: may be given several times; each value is
      appended to a list

    The basic args and the runner opts that ``_filter_by_role()`` returns
    for the ``'connect'`` role are added afterwards, and the parser is
    handed to ``_alphabetize_actions()`` before it is returned.

    :return: the configured :py:class:`ArgumentParser`
    """
    usage = '%(prog)s report-long-jobs [options]'
    description = ('Report jobs running for more than a certain number of'
                   ' hours (by default, %.1f). This can help catch buggy jobs'
                   ' and Hadoop/EMR operational issues.' % DEFAULT_MIN_HOURS)

    arg_parser = ArgumentParser(usage=usage, description=description)

    arg_parser.add_argument(
        '--min-hours', dest='min_hours', type=float,
        default=DEFAULT_MIN_HOURS,
        help=('Minimum number of hours a job can run before we report it.'
              ' Default: %(default)s'))

    arg_parser.add_argument(
        '-x', '--exclude', action='append',
        help=('Exclude clusters that match the specified tags.'
              ' Specified in the form TAG_KEY,TAG_VALUE.')
    )

    _add_basic_args(arg_parser)
    _add_runner_args(
        arg_parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect')
    )

    _alphabetize_actions(arg_parser)

    return arg_parser
Пример #5
0
    def configure_args(self):
        """Set up this script's command-line arguments.

        :py:meth:`__init__()` calls this. Override it to add custom
        command-line arguments or to pass through existing ones::

            def configure_args(self):
                super(MRYourJob, self).configure_args()

                self.add_passthru_arg(...)
                self.add_file_arg(...)
                self.pass_arg_through(...)
                ...
        """
        parser = self.arg_parser

        # the script path is a positional argument only when the class
        # declares that the first argument is the script path
        if self._FIRST_ARG_IS_SCRIPT_PATH:
            parser.add_argument(
                dest='script_path', help='path of script to launch')

        args_help = ('input paths to read (or stdin if not set). If --spark'
                     ' is set, the input and output path for the spark job.')
        parser.add_argument(dest='args', nargs='*', help=args_help)

        _add_basic_args(parser)
        _add_job_args(parser)
        _add_runner_args(parser)
Пример #6
0
def _make_arg_parser():
    """Build the command-line parser for the ``s3-tmpwatch`` tool.

    Takes a ``-t``/``--test`` flag and two positionals: ``time_untouched``
    and one or more ``uris``. The basic args and the ``region`` and
    ``s3_endpoint`` runner opts are added as well, and the parser is passed
    through ``_alphabetize_actions()`` before being returned.

    :return: the configured :py:class:`ArgumentParser`
    """
    parser = ArgumentParser(
        usage='%(prog)s s3-tmpwatch [options] TIME_UNTOUCHED URI [URI ...]',
        description=(
            'Delete all files at one or more URIs that are older than a'
            ' specified time.'),
    )

    parser.add_argument(
        '-t', '--test',
        dest='test',
        default=False,
        action='store_true',
        help="Don't actually delete any files; just log that we would",
    )

    time_untouched_help = (
        'The time threshold for removing'
        ' files. A number with an optional'
        ' single-character suffix specifying the units: m for minutes, h for'
        ' hours, d for days. If no suffix is specified, time is in hours.')
    parser.add_argument(dest='time_untouched', help=time_untouched_help)

    # nargs='+' requires at least one URI
    parser.add_argument(
        dest='uris',
        nargs='+',
        help='s3:// URIs specifying where to delete old files',
    )

    _add_basic_args(parser)
    _add_runner_args(parser, {'region', 's3_endpoint'})

    _alphabetize_actions(parser)

    return parser
Пример #7
0
def _make_arg_parser():
    """Build the parser for the tool that deletes old files at given URIs.

    Accepts a ``-t``/``--test`` flag plus two positionals
    (``time_untouched`` and one or more ``uris``), then adds the basic args
    and the ``region`` and ``s3_endpoint`` runner opts.

    :return: the configured :py:class:`ArgumentParser`
    """
    parser = ArgumentParser(
        description=('Delete all files at one or more URIs that are older'
                     ' than a'
                     ' specified time.'))

    parser.add_argument(
        '-t', '--test', dest='test', default=False, action='store_true',
        help="Don't actually delete any files; just log that we would")

    threshold_help = (
        'The time threshold for removing'
        ' files. A number with an optional'
        ' single-character suffix specifying the units: m for minutes, h for'
        ' hours, d for days.  If no suffix is specified, time is in hours.')
    parser.add_argument(dest='time_untouched', help=threshold_help)

    # nargs='+' requires at least one URI
    parser.add_argument(
        dest='uris', nargs='+',
        help='s3:// URIs specifying where to delete old files')

    _add_basic_args(parser)
    _add_runner_args(parser, {'region', 's3_endpoint'})

    return parser
Пример #8
0
def _make_arg_parser():
    """Create the parser that does the real command-line parsing.

    It never prints help messages; its usage string is only displayed
    when an error occurs.

    :return: the configured :py:class:`ArgumentParser`
    """
    result = ArgumentParser(usage=_USAGE, add_help=False)

    # positional arguments: optional script/jar, then zero or more args
    result.add_argument(dest='script_or_jar', nargs='?')
    result.add_argument(dest='args', nargs='*')

    for add_args in (_add_basic_args, _add_runner_alias_arg,
                     _add_help_arg, _add_deprecated_arg):
        add_args(result)

    # every runner opt except the hard-coded ones
    non_hard_coded = set(_RUNNER_OPTS) - set(_HARD_CODED_OPTS)
    _add_runner_args(result, opt_names=non_hard_coded)

    # spark-specific opts; skip any that would collide with a runner opt,
    # unless the switch is one of the known aliases
    for opt_name, switch in _SPARK_SUBMIT_SWITCHES.items():
        collides = opt_name in _RUNNER_OPTS and switch not in _SWITCH_ALIASES
        if not collides:
            _add_spark_submit_arg(result, opt_name)

    return result
Пример #9
0
def _make_basic_help_parser(include_deprecated=False):
    """Create an arg parser that exists purely to print basic help.

    Its help output closely resembles that of spark-submit. Runner args
    are left out unless they are spark-submit args too (e.g. --py-files).

    :param include_deprecated: when true, ``_add_deprecated_arg()`` is
                               also applied to the parser
    :return: the help-only :py:class:`ArgumentParser`
    """
    parser_kwargs = dict(
        usage=_USAGE,
        description=_DESCRIPTION,
        epilog=_BASIC_HELP_EPILOG,
        add_help=False,
    )
    basic_parser = ArgumentParser(**parser_kwargs)

    _add_runner_alias_arg(basic_parser)

    for group_desc, opt_names in _SPARK_SUBMIT_ARG_GROUPS:
        if group_desc is not None:
            # described groups get their own argument group
            group = basic_parser.add_argument_group(group_desc)
            for opt_name in opt_names:
                _add_spark_submit_arg(group, opt_name)
            continue

        # undescribed switches go straight onto the parser, followed by
        # the generic switches
        for opt_name in opt_names:
            _add_spark_submit_arg(basic_parser, opt_name)

        _add_basic_args(basic_parser)
        _add_help_arg(basic_parser)
        if include_deprecated:
            _add_deprecated_arg(basic_parser)

    return basic_parser
Пример #10
0
    def configure_args(self):
        """Declare the command-line arguments this script accepts.

        Invoked by :py:meth:`__init__()`. Redefine it to declare custom
        command-line arguments or to pass existing ones through::

            def configure_args(self):
                super(MRYourJob, self).configure_args()

                self.add_passthru_arg(...)
                self.add_file_arg(...)
                self.pass_arg_through(...)
                ...
        """
        # when the first argument is the script path, take it as a
        # positional argument from the command line
        if self._FIRST_ARG_IS_SCRIPT_PATH:
            script_path_kwargs = dict(
                dest='script_path',
                help='path of script to launch',
            )
            self.arg_parser.add_argument(**script_path_kwargs)

        args_kwargs = dict(
            dest='args',
            nargs='*',
            help=('input paths to read (or stdin if not set). If --spark'
                  ' is set, the input and output path for the spark job.'),
        )
        self.arg_parser.add_argument(**args_kwargs)

        _add_basic_args(self.arg_parser)
        _add_job_args(self.arg_parser)
        _add_runner_args(self.arg_parser)
Пример #11
0
def _make_arg_parser():
    """Build the command-line parser for terminating idle EMR clusters.

    Defines the idle/lock time limits, the pooled/unpooled filters,
    ``--pool-name`` and ``--dry-run``, then adds the basic args and the
    runner opts that ``_filter_by_role()`` selects for the ``'connect'``
    role.

    :return: the configured :py:class:`ArgumentParser`
    """
    usage = '%(prog)s [options]'
    description = ('Terminate idle EMR clusters that meet the criteria'
                   ' passed in on the command line (or, by default,'
                   ' clusters that have been idle for one hour).')

    arg_parser = ArgumentParser(usage=usage, description=description)

    # NOTE: argparse needs *type* to be a callable. The optparse-style
    # string 'float' makes add_argument() raise ValueError, so the
    # builtin float is passed instead.
    arg_parser.add_argument(
        '--max-hours-idle',
        dest='max_hours_idle',
        default=None,
        type=float,
        help=('Max number of hours a cluster can go without bootstrapping,'
              ' running a step, or having a new step created. This will fire'
              ' even if there are pending steps which EMR has failed to'
              ' start. Make sure you set this higher than the amount of time'
              ' your jobs can take to start instances and bootstrap.'))
    arg_parser.add_argument(
        '--max-mins-locked',
        dest='max_mins_locked',
        default=_DEFAULT_MAX_MINUTES_LOCKED,
        type=float,
        help='Max number of minutes a cluster can be locked while idle.')
    arg_parser.add_argument(
        '--mins-to-end-of-hour',
        dest='mins_to_end_of_hour',
        default=None,
        type=float,
        help=('Terminate clusters that are within this many minutes of'
              ' the end of a full hour since the job started running'
              ' AND have no pending steps.'))
    arg_parser.add_argument('--unpooled-only',
                            dest='unpooled_only',
                            action='store_true',
                            default=False,
                            help='Only terminate un-pooled clusters')
    arg_parser.add_argument('--pooled-only',
                            dest='pooled_only',
                            action='store_true',
                            default=False,
                            help='Only terminate pooled clusters')
    arg_parser.add_argument(
        '--pool-name',
        dest='pool_name',
        default=None,
        help='Only terminate clusters in the given named pool.')
    arg_parser.add_argument(
        '--dry-run',
        dest='dry_run',
        default=False,
        action='store_true',
        help="Don't actually kill idle jobs; just log that we would")

    _add_basic_args(arg_parser)
    _add_runner_args(arg_parser,
                     _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect'))

    return arg_parser
Пример #12
0
def _make_arg_parser():
    """Build the command-line parser for terminating idle EMR clusters.

    Besides the idle/lock limits and pool filters, this still accepts
    ``--max-hours-idle`` and ``--mins-to-end-of-hour``, whose help text
    points to ``--max-mins-idle`` or marks them as deprecated. The basic
    args and the ``'connect'``-role runner opts are added, and the parser
    is passed through ``_alphabetize_actions()`` before being returned.

    :return: the configured :py:class:`ArgumentParser`
    """
    parser = ArgumentParser(
        usage='%(prog)s [options]',
        description=('Terminate idle EMR clusters that meet the criteria'
                     ' passed in on the command line (or, by default,'
                     ' clusters that have been idle for one hour).'),
    )

    # --- time limits (all parsed as floats) ---
    parser.add_argument(
        '--max-hours-idle',
        dest='max_hours_idle',
        default=None,
        type=float,
        help=('Please use --max-mins-idle instead.'),
    )
    parser.add_argument(
        '--max-mins-idle',
        dest='max_mins_idle',
        default=None,
        type=float,
        help=('Max number of minutes a cluster can go without bootstrapping,'
              ' running a step, or having a new step created. This will fire'
              ' even if there are pending steps which EMR has failed to'
              ' start. Make sure you set this higher than the amount of time'
              ' your jobs can take to start instances and bootstrap.'),
    )
    parser.add_argument(
        '--max-mins-locked',
        dest='max_mins_locked',
        default=_DEFAULT_MAX_MINUTES_LOCKED,
        type=float,
        help='Max number of minutes a cluster can be locked while idle.',
    )
    parser.add_argument(
        '--mins-to-end-of-hour',
        dest='mins_to_end_of_hour',
        default=None,
        type=float,
        help=('Deprecated, does nothing.'),
    )

    # --- which clusters to consider ---
    parser.add_argument(
        '--unpooled-only',
        dest='unpooled_only',
        action='store_true',
        default=False,
        help='Only terminate un-pooled clusters',
    )
    parser.add_argument(
        '--pooled-only',
        dest='pooled_only',
        action='store_true',
        default=False,
        help='Only terminate pooled clusters',
    )
    parser.add_argument(
        '--pool-name',
        dest='pool_name',
        default=None,
        help='Only terminate clusters in the given named pool.',
    )

    parser.add_argument(
        '--dry-run',
        dest='dry_run',
        default=False,
        action='store_true',
        help="Don't actually kill idle jobs; just log that we would",
    )

    _add_basic_args(parser)
    connect_opts = _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect')
    _add_runner_args(parser, connect_opts)

    _alphabetize_actions(parser)

    return parser
Пример #13
0
def _make_arg_parser():
    """Build the command-line parser for creating a persistent EMR cluster.

    No tool-specific switches are defined; the parser only carries the
    basic args and the runner opts that ``_filter_by_role()`` selects for
    the ``'connect'`` and ``'launch'`` roles.

    :return: the configured :py:class:`ArgumentParser`
    """
    parser = ArgumentParser(
        usage='%(prog)s [options]',
        description=(
            'Create a persistent EMR cluster to run jobs in, and print its'
            ' ID to'
            ' stdout. WARNING: Do not run'
            ' this without mrjob terminate-idle-clusters in your'
            ' crontab; clusters left idle can quickly become expensive!'),
    )

    _add_basic_args(parser)

    runner_opt_names = _filter_by_role(
        EMRJobRunner.OPT_NAMES, 'connect', 'launch')
    _add_runner_args(parser, runner_opt_names)

    return parser
Пример #14
0
def _make_arg_parser():
    """Build the command-line parser for the ``create-cluster`` tool.

    Adds the basic args and the runner opts that ``_filter_by_role()``
    selects for the ``'connect'`` and ``'launch'`` roles, then passes the
    parser through ``_alphabetize_actions()``.

    :return: the configured :py:class:`ArgumentParser`
    """
    parser = ArgumentParser(
        usage='%(prog)s create-cluster [options]',
        description=(
            'Create a persistent EMR cluster to run jobs in, and print its'
            ' ID to'
            ' stdout.'),
    )

    _add_basic_args(parser)

    launch_opt_names = _filter_by_role(
        EMRJobRunner.OPT_NAMES, 'connect', 'launch')
    _add_runner_args(parser, launch_opt_names)

    _alphabetize_actions(parser)

    return parser
Пример #15
0
def _make_arg_parser():
    """Create the argument parser for ``create-cluster``.

    The parser has no switches of its own: it receives the basic args and
    the runner opts returned by ``_filter_by_role()`` for the
    ``'connect'`` and ``'launch'`` roles, and is handed to
    ``_alphabetize_actions()`` before it is returned.

    :return: the configured :py:class:`ArgumentParser`
    """
    parser_kwargs = dict(
        usage='%(prog)s create-cluster [options]',
        description=('Create a persistent EMR cluster to run jobs in, and'
                     ' print its ID to'
                     ' stdout.'),
    )
    cluster_parser = ArgumentParser(**parser_kwargs)

    _add_basic_args(cluster_parser)
    _add_runner_args(
        cluster_parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect', 'launch'),
    )

    _alphabetize_actions(cluster_parser)

    return cluster_parser
Пример #16
0
def _make_arg_parser():
    """Build the command-line parser for creating a persistent EMR cluster.

    The description carries a warning about running this without
    ``mrjob terminate-idle-clusters`` in the crontab. Only the basic args
    and the ``'connect'``/``'launch'``-role runner opts are added, after
    which the parser is passed through ``_alphabetize_actions()``.

    :return: the configured :py:class:`ArgumentParser`
    """
    warning_description = (
        'Create a persistent EMR cluster to run jobs in, and print its ID to'
        ' stdout. WARNING: Do not run'
        ' this without mrjob terminate-idle-clusters in your'
        ' crontab; clusters left idle can quickly become expensive!')

    parser = ArgumentParser(
        usage='%(prog)s [options]', description=warning_description)

    _add_basic_args(parser)

    opt_names = _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect', 'launch')
    _add_runner_args(parser, opt_names)

    _alphabetize_actions(parser)

    return parser
Пример #17
0
def _make_arg_parser():
    """Build the command-line parser for ``terminate-idle-clusters``.

    Defines ``--max-mins-idle``, the pooled/unpooled filters,
    ``--pool-name`` and ``--dry-run``. ``--max-mins-locked`` is still
    accepted, but its help text marks it as deprecated. The basic args and
    the ``'connect'``-role runner opts are added, and the parser is passed
    through ``_alphabetize_actions()`` before being returned.

    :return: the configured :py:class:`ArgumentParser`
    """
    parser = ArgumentParser(
        usage='%(prog)s terminate-idle-clusters [options]',
        description=('Terminate idle EMR clusters that meet the criteria'
                     ' passed in on the command line (or, by default,'
                     ' clusters that have been idle for one hour).'),
    )

    max_mins_idle_help = (
        'Max number of minutes a cluster can go without bootstrapping,'
        ' running a step, or having a new step created. This will fire'
        ' even if there are pending steps which EMR has failed to'
        ' start. Make sure you set this higher than the amount of time'
        ' your jobs can take to start instances and bootstrap.')
    parser.add_argument(
        '--max-mins-idle',
        dest='max_mins_idle',
        default=None,
        type=float,
        help=max_mins_idle_help,
    )

    parser.add_argument(
        '--max-mins-locked',
        dest='max_mins_locked',
        type=float,
        help='Deprecated, does nothing',
    )

    parser.add_argument(
        '--unpooled-only',
        dest='unpooled_only',
        action='store_true',
        default=False,
        help='Only terminate un-pooled clusters',
    )
    parser.add_argument(
        '--pooled-only',
        dest='pooled_only',
        action='store_true',
        default=False,
        help='Only terminate pooled clusters',
    )
    parser.add_argument(
        '--pool-name',
        dest='pool_name',
        default=None,
        help='Only terminate clusters in the given named pool.',
    )
    parser.add_argument(
        '--dry-run',
        dest='dry_run',
        default=False,
        action='store_true',
        help="Don't actually kill idle jobs; just log that we would",
    )

    _add_basic_args(parser)
    connect_opts = _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect')
    _add_runner_args(parser, connect_opts)

    _alphabetize_actions(parser)

    return parser
Пример #18
0
def _make_arg_parser():
    """Build the command-line parser for the EMR usage report tool.

    Defines ``--max-days-ago`` (a float with no default), then adds the
    basic args and the runner opts that ``_filter_by_role()`` selects for
    the ``'connect'`` role.

    :return: the configured :py:class:`ArgumentParser`
    """
    parser = ArgumentParser(
        usage='%(prog)s [options]',
        description='Print a giant report on EMR usage.')

    max_days_ago_help = (
        'Max number of days ago to look at jobs. By default, we go back'
        ' as far as EMR supports (currently about 2 months)')
    parser.add_argument(
        '--max-days-ago', dest='max_days_ago', type=float, default=None,
        help=max_days_ago_help)

    _add_basic_args(parser)

    connect_opts = _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect')
    _add_runner_args(parser, connect_opts)

    return parser
Пример #19
0
def _make_arg_parser():
    """Build the command-line parser for the ``audit-emr-usage`` tool.

    Defines ``--max-days-ago`` (a float with no default), adds the basic
    args and the ``'connect'``-role runner opts, then passes the parser
    through ``_alphabetize_actions()``.

    :return: the configured :py:class:`ArgumentParser`
    """
    parser = ArgumentParser(
        usage='%(prog)s audit-emr-usage [options]',
        description='Print a giant report on EMR usage.',
    )

    parser.add_argument(
        '--max-days-ago',
        dest='max_days_ago',
        type=float,
        default=None,
        help=('Max number of days ago to look at jobs. By default, we go back'
              ' as far as EMR supports (currently about 2 months)'),
    )

    _add_basic_args(parser)

    connect_opts = _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect')
    _add_runner_args(parser, connect_opts)

    _alphabetize_actions(parser)

    return parser
Пример #20
0
def _make_arg_parser():
    """Build the command-line parser for the ``diagnose`` tool.

    The basic args and the ``'connect'``-role runner opts are added first,
    followed by the ``cluster_id`` positional and the ``--step-id`` switch.
    The parser is passed through ``_alphabetize_actions()`` before being
    returned.

    :return: the configured :py:class:`ArgumentParser`
    """
    parser = ArgumentParser(
        usage='%(prog)s diagnose [opts] [--step-id STEP_ID] CLUSTER_ID',
        description=('Get probable cause of failure for step on CLUSTER_ID.'
                     ' By default we look at the last failed step'),
    )

    _add_basic_args(parser)

    connect_opts = _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect')
    _add_runner_args(parser, connect_opts)

    parser.add_argument(
        dest='cluster_id', help='ID of cluster with failed step')
    parser.add_argument(
        '--step-id', dest='step_id',
        help='ID of a particular failed step to diagnose')

    _alphabetize_actions(parser)

    return parser
Пример #21
0
def _make_arg_parser():
    """Build the command-line parser for terminating an EMR cluster.

    Takes a ``-t``/``--test`` flag and a ``cluster_id`` positional, then
    adds the basic args and the runner opts that ``_filter_by_role()``
    selects for the ``'connect'`` role.

    :return: the configured :py:class:`ArgumentParser`
    """
    parser = ArgumentParser(description='Terminate an existing EMR cluster.')

    # NOTE(review): this help text talks about deleting files although the
    # tool terminates a cluster; it looks copied from another tool --
    # confirm and reword
    parser.add_argument(
        '-t', '--test',
        dest='test',
        default=False,
        action='store_true',
        help="Don't actually delete any files; just log that we would",
    )

    parser.add_argument(dest='cluster_id', help='ID of cluster to terminate')

    _add_basic_args(parser)

    connect_opts = _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect')
    _add_runner_args(parser, connect_opts)

    return parser
Пример #22
0
def main(cl_args=None):
    """Parse the command line and run a command on every node of a cluster.

    Builds the argument parser, sets up logging from the ``quiet`` and
    ``verbose`` options, then opens an :py:class:`EMRJobRunner` for the
    given cluster and hands it to ``_run_on_all_nodes()`` along with the
    output directory and the split command.

    :param cl_args: list of command-line arguments to parse; when ``None``,
                    argparse reads them from ``sys.argv``
    """
    # argparse adds the "usage: " prefix itself, so the usage string must
    # not repeat it (otherwise help shows "usage: usage: ...")
    usage = '%(prog)s CLUSTER_ID [options] "command string"'
    description = ('Run a command on the master and all worker nodes of an EMR'
                   ' cluster. Store stdout/stderr for results in OUTPUT_DIR.')

    arg_parser = ArgumentParser(usage=usage, description=description)
    arg_parser.add_argument('-o', '--output-dir', dest='output_dir',
                            default=None,
                            help="Specify an output directory (default:"
                            " CLUSTER_ID)")

    arg_parser.add_argument(dest='cluster_id',
                            help='ID of cluster to run command on')
    arg_parser.add_argument(dest='cmd_string',
                            help='command to run, as a single string')

    _add_basic_args(arg_parser)
    _add_runner_args(
        arg_parser,
        {'ec2_key_pair_file', 'ssh_bin'} | _filter_by_role(
            EMRJobRunner.OPT_NAMES, 'connect')
    )

    _alphabetize_actions(arg_parser)

    options = arg_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # every parsed option becomes a runner keyword argument, except the
    # ones this tool handles itself
    runner_kwargs = dict(vars(options))
    for unused_arg in ('cluster_id', 'cmd_string', 'output_dir',
                       'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    cmd_args = shlex_split(options.cmd_string)

    # the output directory falls back to one named after the cluster ID
    output_dir = os.path.abspath(options.output_dir or options.cluster_id)

    with EMRJobRunner(
            cluster_id=options.cluster_id, **runner_kwargs) as runner:
        _run_on_all_nodes(runner, output_dir, cmd_args)
Пример #23
0
def _make_arg_parser():
    """Create the argument parser for ``diagnose``.

    Order of setup: the basic args, the runner opts that
    ``_filter_by_role()`` returns for the ``'connect'`` role, the
    ``cluster_id`` positional, and the ``--step-id`` switch. The parser is
    handed to ``_alphabetize_actions()`` before it is returned.

    :return: the configured :py:class:`ArgumentParser`
    """
    parser_kwargs = dict(
        usage='%(prog)s diagnose [opts] [--step-id STEP_ID] CLUSTER_ID',
        description=(
            'Get probable cause of failure for step on CLUSTER_ID.'
            ' By default we look at the last failed step'),
    )
    diagnose_parser = ArgumentParser(**parser_kwargs)

    _add_basic_args(diagnose_parser)
    _add_runner_args(
        diagnose_parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect'),
    )

    diagnose_parser.add_argument(
        dest='cluster_id',
        help='ID of cluster with failed step',
    )
    diagnose_parser.add_argument(
        '--step-id',
        dest='step_id',
        help='ID of a particular failed step to diagnose',
    )

    _alphabetize_actions(diagnose_parser)

    return diagnose_parser
Пример #24
0
def _make_arg_parser():
    """Create the argument parser for the cluster termination tool.

    Accepts a ``-t``/``--test`` flag and a ``cluster_id`` positional, adds
    the basic args and the ``'connect'``-role runner opts, and hands the
    parser to ``_alphabetize_actions()`` before returning it.

    :return: the configured :py:class:`ArgumentParser`
    """
    terminate_parser = ArgumentParser(
        description='Terminate an existing EMR cluster.')

    # NOTE(review): this help text mentions deleting files although the
    # tool terminates a cluster; it looks copied from another tool --
    # confirm and reword
    test_kwargs = dict(
        dest='test',
        default=False,
        action='store_true',
        help="Don't actually delete any files; just log that we would",
    )
    terminate_parser.add_argument('-t', '--test', **test_kwargs)

    terminate_parser.add_argument(
        dest='cluster_id', help='ID of cluster to terminate')

    _add_basic_args(terminate_parser)
    _add_runner_args(
        terminate_parser,
        _filter_by_role(EMRJobRunner.OPT_NAMES, 'connect'),
    )

    _alphabetize_actions(terminate_parser)

    return terminate_parser