def make_option_parser():
    """Build the option parser for creating a persistent EMR job flow.

    Options are not defined here; they are scraped out of MRJob's own
    option groups (via ``scrape_options_into_new_groups``) and re-filed
    into groups by which runner they apply to.

    :return: a populated :py:class:`optparse.OptionParser`
    """
    usage = '%prog [options]'
    description = (
        'Create a persistent EMR job flow to run jobs in. WARNING: do not run'
        # BUG FIX: the idle-flow terminator lives at
        # mrjob.tools.emr.terminate_idle_job_flows (underscore, not a dot);
        # the old text pointed users at a nonexistent module path.
        ' this without mrjob.tools.emr.terminate_idle_job_flows in your'
        ' crontab; job flows left idle can quickly become expensive!')
    option_parser = OptionParser(usage=usage, description=description)

    def make_option_group(halp):
        # Attach a titled OptionGroup to the parser and hand it back.
        g = OptionGroup(option_parser, halp)
        option_parser.add_option_group(g)
        return g

    runner_group = make_option_group('Running the entire job')
    hadoop_emr_opt_group = make_option_group(
        'Running on Hadoop or EMR (these apply when you set -r hadoop or -r'
        ' emr)')
    emr_opt_group = make_option_group(
        'Running on Amazon Elastic MapReduce (these apply when you set -r'
        ' emr)')

    # Map each destination group to the option dests it should receive.
    assignments = {
        runner_group: ('bootstrap_mrjob', 'conf_path', 'quiet', 'verbose'),
        hadoop_emr_opt_group: (
            'label',
            'owner',
        ),
        emr_opt_group: (
            'additional_emr_info',
            'aws_availability_zone',
            'aws_region',
            'bootstrap_actions',
            'bootstrap_cmds',
            'bootstrap_files',
            'bootstrap_python_packages',
            'ec2_instance_type',
            'ec2_key_pair',
            'ec2_master_instance_type',
            'ec2_slave_instance_type',
            'emr_endpoint',
            'emr_job_flow_pool_name',
            'enable_emr_debugging',
            'hadoop_version',
            'num_ec2_instances',
            'pool_emr_job_flows',
            's3_endpoint',
            's3_log_uri',
            's3_scratch_uri',
            's3_sync_wait_time',
        ),
    }

    # Scrape options from MRJob and index them by dest
    mr_job = MRJob()
    job_option_groups = mr_job.all_option_groups()
    scrape_options_into_new_groups(job_option_groups, assignments)
    return option_parser
def main():
    """Run one shell command on every node of an EMR job flow.

    Command line: ``JOB_FLOW_ID OUTPUT_DIR [options] "command string"``.
    Per-node stdout/stderr is collected under the output directory
    (defaulting to a directory named after the job flow id).
    """
    usage = 'usage: %prog JOB_FLOW_ID OUTPUT_DIR [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR job'
                   ' flow. Store stdout and stderr for results in OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)
    assignments = {
        option_parser: ('conf_path', 'quiet', 'verbose', 'ec2_key_pair_file')
    }
    option_parser.add_option('-o', '--output-dir', dest='output_dir',
                             default=None,
                             help="Specify an output directory (default:"
                             " JOB_FLOW_ID)")

    # Pull the relevant runner options off of MRJob's option groups.
    scrape_options_into_new_groups(MRJob().all_option_groups(), assignments)

    options, args = option_parser.parse_args()

    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    # Forward everything to the runner except the flags this tool consumes.
    local_only = ('output_dir', 'quiet', 'verbose')
    runner_kwargs = dict(
        (name, value) for name, value in options.__dict__.items()
        if name not in local_only)

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    job_flow_id, command = args[:2]
    tokenized_command = shlex.split(command)
    destination = os.path.abspath(options.output_dir or job_flow_id)

    with EMRJobRunner(emr_job_flow_id=job_flow_id, **runner_kwargs) as runner:
        runner._enable_slave_ssh_access()
        run_on_all_nodes(runner, destination, tokenized_command)
def make_option_parser():
    """Build the option parser for creating a persistent EMR job flow.

    The actual option definitions come from MRJob's own option groups;
    ``scrape_options_into_new_groups`` copies them into the groups built
    here, keyed by destination name.

    :return: a populated :py:class:`optparse.OptionParser`
    """
    description = (
        'Create a persistent EMR job flow to run jobs in. WARNING: do not run'
        ' this without mrjob.tools.emr.terminate_idle_job_flows in your'
        ' crontab; job flows left idle can quickly become expensive!')
    option_parser = OptionParser(usage='%prog [options]',
                                 description=description)

    def attach_group(title):
        # Create a titled OptionGroup, register it, and return it.
        group = OptionGroup(option_parser, title)
        option_parser.add_option_group(group)
        return group

    runner_group = attach_group('Running the entire job')
    hadoop_emr_opt_group = attach_group(
        'Running on Hadoop or EMR (these apply when you set -r hadoop or -r'
        ' emr)')
    emr_opt_group = attach_group(
        'Running on Amazon Elastic MapReduce (these apply when you set -r'
        ' emr)')

    # Option dests routed to each group, kept sorted alphabetically.
    runner_dests = (
        'bootstrap_mrjob',
        'conf_path',
        'quiet',
        'verbose',
    )
    hadoop_emr_dests = (
        'label',
        'owner',
    )
    emr_dests = (
        'additional_emr_info',
        'aws_availability_zone',
        'aws_region',
        'bootstrap_actions',
        'bootstrap_cmds',
        'bootstrap_files',
        'bootstrap_python_packages',
        'ec2_core_instance_bid_price',
        'ec2_core_instance_type',
        'ec2_instance_type',
        'ec2_key_pair',
        'ec2_master_instance_bid_price',
        'ec2_master_instance_type',
        'ec2_slave_instance_type',
        'ec2_task_instance_bid_price',
        'ec2_task_instance_type',
        'emr_endpoint',
        'emr_job_flow_pool_name',
        'enable_emr_debugging',
        'hadoop_version',
        'num_ec2_core_instances',
        'num_ec2_instances',
        'num_ec2_task_instances',
        'pool_emr_job_flows',
        's3_endpoint',
        's3_log_uri',
        's3_scratch_uri',
        's3_sync_wait_time',
    )

    assignments = {
        runner_group: runner_dests,
        hadoop_emr_opt_group: hadoop_emr_dests,
        emr_opt_group: emr_dests,
    }

    # Scrape options from MRJob and index them by dest
    scrape_options_into_new_groups(MRJob().all_option_groups(), assignments)
    return option_parser
def make_option_parser():
    """Build the option parser for the job-flow pool inspection tool.

    Adds this tool's own flags (``-a``, ``-f``, ``-t``) and then scrapes
    the relevant runner options out of MRJob's option groups into groups
    organized by what they configure.

    :return: a populated :py:class:`optparse.OptionParser`
    """
    description = (
        'Inspect available job flow pools or identify job flows suitable for'
        ' running a job with the specified options.')
    option_parser = OptionParser(usage='%prog [options]',
                                 description=description)

    def attach_group(title):
        # Create a titled OptionGroup, register it, and return it.
        group = OptionGroup(option_parser, title)
        option_parser.add_option_group(group)
        return group

    instance_group = attach_group('EC2 instance configuration')
    hadoop_group = attach_group('Hadoop configuration')
    flow_group = attach_group('Job flow configuration')

    # Route each option dest to the group (or the top-level parser)
    # where it should appear.
    assignments = {
        option_parser: (
            'conf_paths',
            'emr_job_flow_pool_name',
            'quiet',
            'verbose',
        ),
        instance_group: (
            'aws_availability_zone',
            'ec2_instance_type',
            'ec2_key_pair',
            'ec2_key_pair_file',
            'ec2_master_instance_type',
            'ec2_core_instance_type',
            'emr_endpoint',
            'num_ec2_instances',
        ),
        hadoop_group: (
            'hadoop_version',
            'label',
            'owner',
        ),
        flow_group: (
            'bootstrap_actions',
            'bootstrap_cmds',
            'bootstrap_files',
            'bootstrap_mrjob',
            'bootstrap_python_packages',
        ),
    }

    option_parser.add_option(
        '-a', '--all', action='store_true', default=False, dest='list_all',
        help=('List all available job flows without'
              ' filtering by configuration'))
    option_parser.add_option(
        '-f', '--find', action='store_true', default=False, dest='find',
        help=('Find a job flow matching the pool name,'
              ' bootstrap configuration, and instance'
              ' number/type as specified on the command'
              ' line and in the configuration files'))
    option_parser.add_option(
        '-t', '--terminate', action='store', default=None, dest='terminate',
        metavar='JOB_FLOW_ID',
        help=('Terminate all job flows in the given pool'
              ' (defaults to pool "default")'))

    # Scrape options from MRJob and index them by dest
    scrape_options_into_new_groups(MRJob().all_option_groups(), assignments)
    return option_parser
def main():
    """Entry point for the job-flow pool tool.

    Builds the option parser inline, scrapes runner options from MRJob,
    then dispatches on the tool's own flags: ``-a`` lists pools, ``-f``
    finds a usable job flow, ``-t`` terminates flows in a pool.
    """
    usage = '%prog [options]'
    description = (
        'Inspect available job flow pools or identify job flows suitable for'
        ' running a job with the specified options.')
    option_parser = OptionParser(usage=usage, description=description)

    import boto.emr.connection
    # NOTE(review): monkey-patches boto's JobFlow response object so it
    # parses the HadoopVersion field -- presumably the pinned boto version
    # predates that field; confirm against the boto release in use.
    boto.emr.connection.JobFlow.Fields.add('HadoopVersion')

    def make_option_group(halp):
        # Attach a titled OptionGroup to the parser and hand it back.
        g = OptionGroup(option_parser, halp)
        option_parser.add_option_group(g)
        return g

    ec2_opt_group = make_option_group('EC2 instance configuration')
    hadoop_opt_group = make_option_group('Hadoop configuration')
    job_opt_group = make_option_group('Job flow configuration')

    # Map each destination group (or the top-level parser) to the option
    # dests it should receive from MRJob's option groups.
    assignments = {
        option_parser: (
            'conf_path',
            'emr_job_flow_pool_name',
            'quiet',
            'verbose',
        ),
        ec2_opt_group: (
            'aws_availability_zone',
            'ec2_instance_type',
            'ec2_key_pair',
            'ec2_key_pair_file',
            'ec2_master_instance_type',
            'ec2_slave_instance_type',
            'emr_endpoint',
            'num_ec2_instances',
        ),
        hadoop_opt_group: (
            'hadoop_version',
            'label',
            'owner',
        ),
        job_opt_group: (
            'bootstrap_actions',
            'bootstrap_cmds',
            'bootstrap_files',
            'bootstrap_mrjob',
            'bootstrap_python_packages',
        ),
    }

    option_parser.add_option('-a', '--all', action='store_true',
                             default=False, dest='list_all',
                             help=('List all available job flows without'
                                   ' filtering by configuration'))
    option_parser.add_option('-f', '--find', action='store_true',
                             default=False, dest='find',
                             help=('Find a job flow matching the pool name,'
                                   ' bootstrap configuration, and instance'
                                   ' number/type as specified on the command'
                                   ' line and in the configuration files'))
    option_parser.add_option('-t', '--terminate', action='store',
                             default=None, dest='terminate',
                             metavar='JOB_FLOW_ID',
                             help=('Terminate all job flows in the given pool'
                                   ' (defaults to pool "default")'))

    # Scrape options from MRJob and index them by dest
    mr_job = MRJob()
    scrape_options_into_new_groups(mr_job.all_option_groups(), assignments)

    options, args = option_parser.parse_args()

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # Everything left in the namespace except this tool's own flags is
    # forwarded to EMRJobRunner as a keyword argument.
    runner_kwargs = options.__dict__.copy()
    for non_runner_kwarg in ('quiet', 'verbose', 'list_all', 'find',
                             'terminate'):
        del runner_kwargs[non_runner_kwarg]

    runner = EMRJobRunner(**runner_kwargs)

    if options.list_all:
        pprint_pools(runner)
    if options.find:
        # usable_job_flows() is sorted; the last entry is the best match.
        sorted_job_flows = runner.usable_job_flows()
        if sorted_job_flows:
            jf = sorted_job_flows[-1]
            print 'You should use this one:'
            pprint_job_flow(jf)
        else:
            print 'No idle job flows match criteria'
    if options.terminate:
        terminate(runner, options.terminate)
def main(): usage = '%prog [options]' description = ( 'Inspect available job flow pools or identify job flows suitable for' ' running a job with the specified options.') option_parser = OptionParser(usage=usage, description=description) import boto.emr.connection boto.emr.connection.JobFlow.Fields.add('HadoopVersion') def make_option_group(halp): g = OptionGroup(option_parser, halp) option_parser.add_option_group(g) return g ec2_opt_group = make_option_group('EC2 instance configuration') hadoop_opt_group = make_option_group('Hadoop configuration') job_opt_group = make_option_group('Job flow configuration') assignments = { option_parser: ( 'conf_path', 'emr_job_flow_pool_name', 'quiet', 'verbose', ), ec2_opt_group: ( 'aws_availability_zone', 'ec2_instance_type', 'ec2_key_pair', 'ec2_key_pair_file', 'ec2_master_instance_type', 'ec2_slave_instance_type', 'emr_endpoint', 'num_ec2_instances', ), hadoop_opt_group: ( 'hadoop_version', 'label', 'owner', ), job_opt_group: ( 'bootstrap_actions', 'bootstrap_cmds', 'bootstrap_files', 'bootstrap_mrjob', 'bootstrap_python_packages', ), } option_parser.add_option('-a', '--all', action='store_true', default=False, dest='list_all', help=('List all available job flows without' ' filtering by configuration')) option_parser.add_option('-f', '--find', action='store_true', default=False, dest='find', help=('Find a job flow matching the pool name,' ' bootstrap configuration, and instance' ' number/type as specified on the command' ' line and in the configuration files')) option_parser.add_option('-t', '--terminate', action='store', default=None, dest='terminate', metavar='JOB_FLOW_ID', help=('Terminate all job flows in the given pool' ' (defaults to pool "default")')) # Scrape options from MRJob and index them by dest mr_job = MRJob() scrape_options_into_new_groups(mr_job.all_option_groups(), assignments) options, args = option_parser.parse_args() log_to_stream(name='mrjob', debug=options.verbose) runner_kwargs = options.__dict__.copy() 
for non_runner_kwarg in ('quiet', 'verbose', 'list_all', 'find', 'terminate'): del runner_kwargs[non_runner_kwarg] runner = EMRJobRunner(**runner_kwargs) if options.list_all: pprint_pools(runner) if options.find: sorted_job_flows = runner.usable_job_flows() if sorted_job_flows: jf = sorted_job_flows[-1] print 'You should use this one:' pprint_job_flow(jf) else: print 'No idle job flows match criteria' if options.terminate: terminate(runner, options.terminate)