def _make_option_parser():
    """Build the option parser for the file-expiration tool.

    The tool deletes files under the given URIs that have gone untouched
    for longer than the given time threshold.
    """
    usage = '%prog [options] <time-untouched> <URIs>'
    description = (
        'Delete all files in a given URI that are older than a specified'
        ' time.\n\nThe time parameter defines the threshold for removing'
        ' files. If the file has not been accessed for *time*, the file is'
        ' removed. The time argument is a number with an optional'
        ' single-character suffix specifying the units: m for minutes, h for'
        ' hours, d for days. If no suffix is specified, time is in hours.')

    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        '-t', '--test', dest='test', default=False, action='store_true',
        help="Don't actually delete any files; just log that we would")

    _add_basic_opts(parser)

    # borrow the S3-related options from MRJob's own option groups
    scrape_options_into_new_groups(
        MRJob().all_option_groups(),
        {parser: ('aws_region', 's3_endpoint')})

    _alphabetize_options(parser)
    return parser
def _make_option_parser():
    """Build the option parser for the create-cluster tool."""
    usage = '%prog [options]'
    description = (
        'Create a persistent EMR cluster to run jobs in, and print its ID to'
        ' stdout. WARNING: Do not run'
        ' this without mrjob terminate-idle-clusters in your'
        ' crontab; clusters left idle can quickly become expensive!')

    parser = OptionParser(usage=usage, description=description)

    _add_basic_opts(parser)

    # these aren't nicely broken down, just scrape specific options
    scrape_options_into_new_groups(
        MRJob().all_option_groups(),
        {parser: ('bootstrap_mrjob', 'label', 'owner')})

    _add_emr_connect_opts(parser)
    _add_emr_launch_opts(parser)
    _add_dataproc_emr_opts(parser)

    _alphabetize_options(parser)
    return parser
def _make_option_parser():
    """Build the option parser for the terminate-idle-clusters tool."""
    usage = '%prog [options]'
    description = ('Terminate idle EMR clusters that meet the criteria'
                   ' passed in on the command line (or, by default,'
                   ' clusters that have been idle for one hour).')
    parser = OptionParser(usage=usage, description=description)

    # idleness thresholds
    parser.add_option(
        '--max-hours-idle', dest='max_hours_idle',
        default=None, type='float',
        help=('Max number of hours a cluster can go without bootstrapping,'
              ' running a step, or having a new step created. This will fire'
              ' even if there are pending steps which EMR has failed to'
              ' start. Make sure you set this higher than the amount of time'
              ' your jobs can take to start instances and bootstrap.'))
    parser.add_option(
        '--max-mins-locked', dest='max_mins_locked',
        default=_DEFAULT_MAX_MINUTES_LOCKED, type='float',
        help='Max number of minutes a cluster can be locked while idle.')
    parser.add_option(
        '--mins-to-end-of-hour', dest='mins_to_end_of_hour',
        default=None, type='float',
        help=('Terminate clusters that are within this many minutes of'
              ' the end of a full hour since the job started running'
              ' AND have no pending steps.'))

    # pool filtering
    parser.add_option(
        '--unpooled-only', dest='unpooled_only',
        action='store_true', default=False,
        help='Only terminate un-pooled clusters')
    parser.add_option(
        '--pooled-only', dest='pooled_only',
        action='store_true', default=False,
        help='Only terminate pooled clusters')
    parser.add_option(
        '--pool-name', dest='pool_name', default=None,
        help='Only terminate clusters in the given named pool.')

    parser.add_option(
        '--dry-run', dest='dry_run', default=False, action='store_true',
        help="Don't actually kill idle jobs; just log that we would")

    _add_basic_opts(parser)
    _add_dataproc_emr_connect_opts(parser)
    _add_emr_connect_opts(parser)

    _alphabetize_options(parser)
    return parser
def __init__(self, script_path=None, args=None, from_cl=False):
    """Set up the job's option parser, parse arguments, and bind I/O streams.

    :param script_path: Path to script unless it's the first item of *args*
    :param args: Command line arguments
    :param from_cl: If not using sys.argv but still coming from the
                    command line (as opposed to a script, e.g. from
                    mrjob.cmd), don't override the option parser error
                    function (exit instead of throwing ValueError).
    """
    if script_path is not None:
        script_path = os.path.abspath(script_path)
    self._script_path = script_path

    # make sure we respect the $TZ (time zone) environment variable
    if hasattr(time, 'tzset'):
        time.tzset()

    # options registered by configure_options() that should be forwarded
    # to subprocesses / recorded as file options
    self._passthrough_options = []
    self._file_options = []

    # add_help_option=False: help handling is managed by the job itself
    self.option_parser = OptionParser(usage=self._usage(),
                                      option_class=self.OPTION_CLASS,
                                      add_help_option=False)
    self.configure_options()

    # sort options within each group for stable, readable --help output
    for opt_group in self.all_option_groups():
        _alphabetize_options(opt_group)

    # don't pass None to parse_args unless we're actually running
    # the MRJob script
    if args is _READ_ARGS_FROM_SYS_ARGV:
        self._cl_args = sys.argv[1:]
    else:
        # don't pass sys.argv to self.option_parser, and have it
        # raise an exception on error rather than printing to stderr
        # and exiting.
        self._cl_args = args or []

        def error(msg):
            raise ValueError(msg)

        if not from_cl:
            # monkey-patch the parser so programmatic callers get a
            # ValueError instead of a SystemExit on bad options
            self.option_parser.error = error

    self.load_options(self._cl_args)

    # Make it possible to redirect stdin, stdout, and stderr, for testing
    # See sandbox(), below.
    #
    # These should always read/write bytes, not unicode
    if PY2:
        self.stdin = sys.stdin
        self.stdout = sys.stdout
        self.stderr = sys.stderr
    else:
        # on Python 3, the underlying binary buffers are the byte streams
        self.stdin = sys.stdin.buffer
        self.stdout = sys.stdout.buffer
        self.stderr = sys.stderr.buffer
def _make_option_parser():
    """Build the option parser for the terminate-idle-clusters tool.

    :return: an :py:class:`OptionParser` with idleness thresholds, pool
             filters, and EMR connection options.
    """
    usage = '%prog [options]'
    description = ('Terminate idle EMR clusters that meet the criteria'
                   ' passed in on the command line (or, by default,'
                   ' clusters that have been idle for one hour).')
    option_parser = OptionParser(usage=usage, description=description)

    # idleness thresholds
    option_parser.add_option(
        '--max-hours-idle', dest='max_hours_idle',
        default=None, type='float',
        help=('Max number of hours a cluster can go without bootstrapping,'
              ' running a step, or having a new step created. This will fire'
              ' even if there are pending steps which EMR has failed to'
              ' start. Make sure you set this higher than the amount of time'
              ' your jobs can take to start instances and bootstrap.'))
    option_parser.add_option(
        '--max-mins-locked', dest='max_mins_locked',
        default=DEFAULT_MAX_MINUTES_LOCKED, type='float',
        help='Max number of minutes a cluster can be locked while idle.')
    option_parser.add_option(
        '--mins-to-end-of-hour', dest='mins_to_end_of_hour',
        default=None, type='float',
        help=('Terminate clusters that are within this many minutes of'
              ' the end of a full hour since the job started running'
              ' AND have no pending steps.'))

    # pool filtering
    option_parser.add_option(
        '--unpooled-only', dest='unpooled_only',
        action='store_true', default=False,
        help='Only terminate un-pooled clusters')
    option_parser.add_option(
        '--pooled-only', dest='pooled_only',
        action='store_true', default=False,
        help='Only terminate pooled clusters')
    option_parser.add_option(
        '--pool-name', dest='pool_name', default=None,
        help='Only terminate clusters in the given named pool.')

    option_parser.add_option(
        '--dry-run', dest='dry_run', default=False, action='store_true',
        help="Don't actually kill idle jobs; just log that we would")
    # fix copy-pasted help text: this tool terminates clusters, it never
    # deletes files (the old text was lifted from the S3 cleanup tool)
    option_parser.add_option(
        '-t', '--test', dest='test', default=False, action='store_true',
        help="Don't actually terminate any clusters; just log that we would")

    _add_basic_opts(option_parser)
    _add_emr_connect_opts(option_parser)

    _alphabetize_options(option_parser)
    return option_parser
def _make_option_parser():
    """Build the option parser for the create-cluster tool."""
    usage = '%prog [options]'
    description = (
        'Create a persistent EMR cluster to run jobs in, and print its ID to'
        ' stdout. WARNING: Do not run'
        ' this without mrjob terminate-idle-clusters in your'
        ' crontab; clusters left idle can quickly become expensive!')

    parser = OptionParser(usage=usage, description=description)

    _add_basic_options(parser)

    # connection + launch options of the EMR runner
    emr_opt_names = (_pick_runner_opts('emr', 'connect') |
                     _pick_runner_opts('emr', 'launch'))
    _add_runner_options(parser, emr_opt_names)

    _alphabetize_options(parser)
    return parser
def _make_option_parser():
    """Build the option parser for the terminate-cluster tool.

    :return: an :py:class:`OptionParser` taking a cluster ID plus basic
             and EMR connection options.
    """
    usage = '%prog [options] cluster-id'
    description = 'Terminate an existing EMR cluster.'
    option_parser = OptionParser(usage=usage, description=description)
    # fix copy-pasted help text: this tool terminates a cluster, it never
    # deletes files (the old text was lifted from the S3 cleanup tool)
    option_parser.add_option(
        '-t', '--test', dest='test', default=False, action='store_true',
        help="Don't actually terminate the cluster; just log that we would")
    _add_basic_opts(option_parser)
    _add_emr_connect_opts(option_parser)
    _alphabetize_options(option_parser)
    return option_parser
def _make_option_parser():
    """Build the option parser for the EMR usage-audit tool."""
    parser = OptionParser(
        usage='%prog [options]',
        description='Print a giant report on EMR usage.')

    parser.add_option(
        '--max-days-ago', dest='max_days_ago',
        type='float', default=None,
        help=('Max number of days ago to look at jobs. By default, we go back'
              ' as far as EMR supports (currently about 2 months)'))

    _add_basic_opts(parser)
    _add_emr_connect_opts(parser)

    _alphabetize_options(parser)
    return parser
def _make_option_parser():
    """Build the option parser for the create-cluster tool."""
    description = (
        'Create a persistent EMR cluster to run jobs in, and print its ID to'
        ' stdout. WARNING: Do not run'
        ' this without mrjob terminate-idle-clusters in your'
        ' crontab; clusters left idle can quickly become expensive!')

    parser = OptionParser(usage='%prog [options]', description=description)

    _add_basic_options(parser)
    # expose both the connect and launch option sets of the EMR runner
    _add_runner_options(
        parser,
        _pick_runner_opts('emr', 'connect') |
        _pick_runner_opts('emr', 'launch'))

    _alphabetize_options(parser)
    return parser
def _make_option_parser():
    """Build the option parser for the long-running-jobs report tool."""
    usage = '%prog [options]'
    description = ('Report jobs running for more than a certain number of'
                   ' hours (by default, %.1f). This can help catch buggy jobs'
                   ' and Hadoop/EMR operational issues.' % DEFAULT_MIN_HOURS)
    parser = OptionParser(usage=usage, description=description)

    parser.add_option(
        '--min-hours', dest='min_hours',
        type='float', default=DEFAULT_MIN_HOURS,
        help=('Minimum number of hours a job can run before we report it.'
              ' Default: %default'))

    _add_basic_opts(parser)
    _add_emr_connect_opts(parser)

    _alphabetize_options(parser)
    return parser
def main(cl_args=None):
    """Run a shell command on every node of an EMR cluster.

    Parses *cl_args* (or ``sys.argv`` when ``None``), then connects to the
    cluster and stores each node's stdout/stderr under the output directory.
    """
    usage = 'usage: %prog CLUSTER_ID [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR'
                   ' cluster. Store stdout/stderr for results in OUTPUT_DIR.')

    parser = OptionParser(usage=usage, description=description)
    parser.add_option('-o', '--output-dir', dest='output_dir',
                      default=None,
                      help="Specify an output directory (default:"
                      " CLUSTER_ID)")

    _add_basic_opts(parser)
    _add_dataproc_emr_connect_opts(parser)
    _add_emr_connect_opts(parser)

    # borrow SSH-related options from MRJob's option groups
    scrape_options_into_new_groups(
        MRJob().all_option_groups(),
        {parser: ('ec2_key_pair_file', 'ssh_bin')})

    _alphabetize_options(parser)
    options, args = parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # everything except our own tool options goes to the runner
    tool_only_opts = ('output_dir', 'quiet', 'verbose')
    runner_kwargs = {name: value
                     for name, value in options.__dict__.items()
                     if name not in tool_only_opts}

    if len(args) < 2:
        parser.print_help()
        sys.exit(1)

    cluster_id, cmd_string = args[:2]
    cmd_args = shlex_split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or cluster_id)

    with EMRJobRunner(cluster_id=cluster_id, **runner_kwargs) as runner:
        _run_on_all_nodes(runner, output_dir, cmd_args)
def main(cl_args=None):
    """Run a shell command on every node of an EMR cluster.

    Parses *cl_args* (or ``sys.argv`` when ``None``), then connects to the
    cluster and stores each node's stdout/stderr under the output directory.
    """
    usage = 'usage: %prog CLUSTER_ID [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR'
                   ' cluster. Store stdout/stderr for results in OUTPUT_DIR.')

    parser = OptionParser(usage=usage, description=description)
    parser.add_option('-o', '--output-dir', dest='output_dir',
                      default=None,
                      help="Specify an output directory (default:"
                      " CLUSTER_ID)")

    _add_basic_options(parser)
    # EMR connection options plus the SSH options this tool needs
    opt_names = (_pick_runner_opts('emr', 'connect') |
                 set(['ssh_bin', 'ec2_key_pair_file']))
    _add_runner_options(parser, opt_names)

    _alphabetize_options(parser)
    options, args = parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # everything except our own tool options goes to the runner
    tool_only_opts = ('output_dir', 'quiet', 'verbose')
    runner_kwargs = {name: value
                     for name, value in options.__dict__.items()
                     if name not in tool_only_opts}

    if len(args) < 2:
        parser.print_help()
        sys.exit(1)

    cluster_id, cmd_string = args[:2]
    cmd_args = shlex_split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or cluster_id)

    with EMRJobRunner(cluster_id=cluster_id, **runner_kwargs) as runner:
        _run_on_all_nodes(runner, output_dir, cmd_args)
def _make_option_parser():
    """Build the option parser for the long-running-jobs report tool."""
    description = (
        "Report jobs running for more than a certain number of"
        " hours (by default, %.1f). This can help catch buggy jobs"
        " and Hadoop/EMR operational issues." % DEFAULT_MIN_HOURS
    )
    option_parser = OptionParser(usage="%prog [options]",
                                 description=description)

    option_parser.add_option(
        "--min-hours",
        dest="min_hours",
        type="float",
        default=DEFAULT_MIN_HOURS,
        help=("Minimum number of hours a job can run before we report it."
              " Default: %default"),
    )

    _add_basic_opts(option_parser)
    _add_emr_connect_opts(option_parser)
    _alphabetize_options(option_parser)

    return option_parser