def test_deprecated_runner_help(self):
    MRJob(['--help', '-r', 'emr', '--deprecated'])

    self.exit.assert_called_once_with(0)

    output = self.stdout.getvalue()

    # EMR runner option
    self.assertIn('--s3-endpoint', output)

    # not runner options
    self.assertNotIn('--conf', output)
    self.assertNotIn('--step-num', output)

    # a runner option, but not for EMR
    self.assertNotIn('--gcp-project', output)

def test_libjars_environment_variables(self):
    job_dir = os.path.dirname(MRJob.mr_job_script())

    with patch.dict('os.environ', A='/path/to/a', B='b'):
        with patch.object(MRJob, 'LIBJARS',
                          ['$A/cookie.jar', '$B/honey.jar']):
            job = MRJob()

            # libjars() peeks into envvars to figure out if the path
            # is relative or absolute
            self.assertEqual(
                job._runner_kwargs()['libjars'],
                ['$A/cookie.jar', os.path.join(job_dir, '$B/honey.jar')])

def test_spark_with_step_num(self):
    job = MRJob(['--step-num=1', '--spark', 'input_dir', 'output_dir'])

    mapper = MagicMock()
    spark = MagicMock()

    job.steps = Mock(
        return_value=[MRStep(mapper=mapper), SparkStep(spark)])

    job.execute()

    spark.assert_called_once_with('input_dir', 'output_dir')
    self.assertFalse(mapper.called)

def make_option_parser():
    usage = 'usage: %prog [options] JOB_FLOW_ID'
    description = (
        'List, display, and parse Hadoop logs associated with EMR job flows.'
        ' Useful for debugging failed jobs for which mrjob did not display a'
        ' useful error message or for inspecting jobs whose output has been'
        ' lost.')

    option_parser = OptionParser(usage=usage, description=description)

    option_parser.add_option('-f', '--find-failure', dest='find_failure',
                             action='store_true', default=False,
                             help=('Search the logs for information about why'
                                   ' the job failed'))
    option_parser.add_option('-l', '--list', dest='list_relevant',
                             action='store_true', default=False,
                             help='List log files MRJob finds relevant')
    option_parser.add_option('-L', '--list-all', dest='list_all',
                             action='store_true', default=False,
                             help='List all log files')
    option_parser.add_option('-a', '--cat', dest='cat_relevant',
                             action='store_true', default=False,
                             help='Cat log files MRJob finds relevant')
    option_parser.add_option('-A', '--cat-all', dest='cat_all',
                             action='store_true', default=False,
                             help='Cat all log files to JOB_FLOW_ID/')
    option_parser.add_option('-s', '--step-num', dest='step_num',
                             action='store', type='int', default=None,
                             help=('Limit results to a single step. To be'
                                   ' used with --list and --cat.'))
    option_parser.add_option('--counters', dest='get_counters',
                             action='store_true', default=False,
                             help='Show counters from the job flow')

    assignments = {
        option_parser: ('conf_paths', 'quiet', 'verbose',
                        'ec2_key_pair_file', 's3_sync_wait_time')
    }

    mr_job = MRJob()
    job_option_groups = (mr_job.option_parser, mr_job.mux_opt_group,
                         mr_job.proto_opt_group, mr_job.runner_opt_group,
                         mr_job.hadoop_emr_opt_group, mr_job.emr_opt_group,
                         mr_job.hadoop_opts_opt_group)
    scrape_options_into_new_groups(job_option_groups, assignments)

    return option_parser

def test_bad_option_types(self):
    mr_job = MRJob()
    assert_raises(
        OptionError, mr_job.add_passthrough_option,
        '--stop-words', dest='stop_words', type='set', default=None)
    assert_raises(
        OptionError, mr_job.add_passthrough_option,
        '--leave-a-msg', dest='leave_a_msg', action='callback', default=None)

def test_spark_and_spark_args_methods(self):
    j = MRJob(['--no-conf'])
    j.spark = MagicMock()
    j.spark_args = MagicMock(return_value=['argh', 'ARRRRGH!'])

    self.assertEqual(
        j.steps(),
        [SparkStep(j.spark, spark_args=['argh', 'ARRRRGH!'])]
    )

    self.assertEqual(
        j._steps_desc(),
        [dict(type='spark', jobconf={}, spark_args=['argh', 'ARRRRGH!'])]
    )

def test_commas_in_counters(self):
    # commas should be replaced with semicolons
    mr_job = MRJob().sandbox()

    mr_job.increment_counter('Bad items', 'a, b, c')
    mr_job.increment_counter('girl, interrupted', 'movie')

    self.assertEqual(mr_job.parse_counters(),
                     {'Bad items': {'a; b; c': 1},
                      'girl; interrupted': {'movie': 1}})

def test_cmd_line_options(self):
    mr_job = MRJob([
        '--jobconf', 'mapred.foo=bar',
        '--jobconf', 'mapred.foo=baz',
        '--jobconf', 'mapred.qux=quux',
    ])

    self.assertEqual(
        mr_job.job_runner_kwargs()['jobconf'],
        {'mapred.foo': 'baz',  # second option takes priority
         'mapred.qux': 'quux'})

def test_commas_in_counters(self):
    # commas should be replaced with semicolons
    mr_job = MRJob().sandbox()

    mr_job.increment_counter('Bad items', 'a, b, c')
    mr_job.increment_counter('girl, interrupted', 'movie')

    parsed_stderr = parse_mr_job_stderr(mr_job.stderr.getvalue())

    self.assertEqual(parsed_stderr['counters'],
                     {'Bad items': {'a; b; c': 1},
                      'girl; interrupted': {'movie': 1}})

def test_counters_and_status(self):
    mr_job = MRJob().sandbox()

    mr_job.increment_counter('Foo', 'Bar')
    mr_job.set_status('Initializing qux gradients...')
    mr_job.increment_counter('Foo', 'Bar')
    mr_job.increment_counter('Foo', 'Baz', 20)
    mr_job.set_status('Sorting metasyntactic variables...')

    parsed_stderr = parse_mr_job_stderr(mr_job.stderr.getvalue())

    self.assertEqual(parsed_stderr,
                     {'counters': {'Foo': {'Bar': 2, 'Baz': 20}},
                      'statuses': ['Initializing qux gradients...',
                                   'Sorting metasyntactic variables...'],
                      'other': []})

def test_basic_help_deprecated(self):
    MRJob(['--help', '--deprecated'])

    self.exit.assert_called_once_with(0)

    output = self.stdout.getvalue()

    # basic option
    self.assertIn('--conf', output)

    # not basic options
    self.assertNotIn('--step-num', output)
    self.assertNotIn('--s3-endpoint', output)

    # deprecated options
    self.assertIn('--partitioner', output)
    self.assertNotIn('add --deprecated', output)
    self.assertIn('--deprecated=DEPRECATED', output)

def test_negative_and_zero_counters(self):
    mr_job = MRJob().sandbox()

    mr_job.increment_counter('Foo', 'Bar', -1)
    mr_job.increment_counter('Foo', 'Baz')
    mr_job.increment_counter('Foo', 'Baz', -1)
    mr_job.increment_counter('Qux', 'Quux', 0)

    self.assertEqual(mr_job.parse_counters(),
                     {'Foo': {'Bar': -1, 'Baz': 0},
                      'Qux': {'Quux': 0}})

def test_negative_and_zero_counters(self):
    mr_job = MRJob().sandbox()

    mr_job.increment_counter('Foo', 'Bar', -1)
    mr_job.increment_counter('Foo', 'Baz')
    mr_job.increment_counter('Foo', 'Baz', -1)
    mr_job.increment_counter('Qux', 'Quux', 0)

    parsed_stderr = parse_mr_job_stderr(mr_job.stderr.getvalue())

    self.assertEqual(parsed_stderr['counters'],
                     {'Foo': {'Bar': -1, 'Baz': 0},
                      'Qux': {'Quux': 0}})

def test_basic_help_deprecated(self):
    MRJob(['--help', '--deprecated'])

    self.exit.assert_called_once_with(0)

    output = self.stdout.getvalue()

    # basic option
    self.assertIn('--conf', output)

    # not basic options
    self.assertNotIn('--step-num', output)
    self.assertNotIn('--s3-endpoint', output)

    # deprecated options
    # currently there are no deprecated options to test against
    # self.assertIn('--partitioner', output)

    self.assertNotIn('add --deprecated', output)
    self.assertIn('--deprecated', output)

def main():
    usage = 'usage: %prog JOB_FLOW_ID OUTPUT_DIR [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR job'
                   ' flow. Store stdout and stderr for results in'
                   ' OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)

    assignments = {
        option_parser: ('conf_path', 'quiet', 'verbose', 'ec2_key_pair_file')
    }

    option_parser.add_option('-o', '--output-dir', dest='output_dir',
                             default=None,
                             help='Specify an output directory (default:'
                                  ' JOB_FLOW_ID)')

    mr_job = MRJob()
    scrape_options_into_new_groups(mr_job.all_option_groups(), assignments)

    options, args = option_parser.parse_args()

    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('output_dir', 'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    job_flow_id, cmd_string = args[:2]
    cmd_args = shlex.split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or job_flow_id)

    with EMRJobRunner(emr_job_flow_id=job_flow_id, **runner_kwargs) as runner:
        runner._enable_slave_ssh_access()
        run_on_all_nodes(runner, output_dir, cmd_args)

def test_show_steps(self):
    mr_job = MRJob(['--steps'])
    mr_job.sandbox()
    mr_job.show_steps()
    assert_equal(mr_job.stdout.getvalue(), 'M\n')

    mr_boring_job = MRBoringJob(['--steps'])
    mr_boring_job.sandbox()
    mr_boring_job.show_steps()
    assert_equal(mr_boring_job.stdout.getvalue(), 'MR\n')

    # final mappers don't show up in the step description
    mr_final_boring_job = MRFinalBoringJob(['--steps'])
    mr_final_boring_job.sandbox()
    mr_final_boring_job.show_steps()
    assert_equal(mr_final_boring_job.stdout.getvalue(), 'MR\n')

    mr_two_step_job = MRTwoStepJob(['--steps'])
    mr_two_step_job.sandbox()
    mr_two_step_job.show_steps()
    assert_equal(mr_two_step_job.stdout.getvalue(), 'MR M\n')

def main(cl_args=None):
    usage = 'usage: %prog CLUSTER_ID OUTPUT_DIR [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR'
                   ' cluster. Store stdout/stderr for results in OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)

    option_parser.add_option('-o', '--output-dir', dest='output_dir',
                             default=None,
                             help='Specify an output directory (default:'
                                  ' CLUSTER_ID)')

    add_basic_opts(option_parser)
    add_emr_connect_opts(option_parser)

    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: ('ec2_key_pair_file', 'ssh_bin'),
    })

    alphabetize_options(option_parser)

    options, args = option_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('output_dir', 'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    cluster_id, cmd_string = args[:2]
    cmd_args = shlex_split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or cluster_id)

    with EMRJobRunner(cluster_id=cluster_id, **runner_kwargs) as runner:
        runner._enable_slave_ssh_access()
        run_on_all_nodes(runner, output_dir, cmd_args)

def make_option_parser():
    usage = '%prog [options]'
    description = (
        'Create a persistent EMR job flow to run jobs in, and print its ID'
        ' to stdout. WARNING: Do not run this without'
        ' mrjob.tools.emr.terminate_idle_job_flows in your crontab; job'
        ' flows left idle can quickly become expensive!')

    option_parser = OptionParser(usage=usage, description=description)

    add_basic_opts(option_parser)

    # these aren't nicely broken down, just scrape specific options
    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: (
            'bootstrap_mrjob',
            'label',
            'owner',
        ),
    })

    add_emr_connect_opts(option_parser)
    add_emr_launch_opts(option_parser)

    alphabetize_options(option_parser)

    return option_parser

def make_option_parser():
    usage = '%prog [options]'
    description = (
        'Create a persistent EMR job flow to run jobs in. WARNING: do not'
        ' run this without mrjob.tools.emr.terminate_idle_job_flows in your'
        ' crontab; job flows left idle can quickly become expensive!')

    option_parser = OptionParser(usage=usage, description=description)

    def make_option_group(halp):
        g = OptionGroup(option_parser, halp)
        option_parser.add_option_group(g)
        return g

    runner_group = make_option_group('Running the entire job')
    hadoop_emr_opt_group = make_option_group(
        'Running on Hadoop or EMR (these apply when you set -r hadoop or -r'
        ' emr)')
    emr_opt_group = make_option_group(
        'Running on Amazon Elastic MapReduce (these apply when you set -r'
        ' emr)')

    assignments = {
        runner_group: (
            'bootstrap_mrjob',
            'conf_path',
            'quiet',
            'verbose',
        ),
        hadoop_emr_opt_group: (
            'label',
            'owner',
        ),
        emr_opt_group: (
            'additional_emr_info',
            'aws_availability_zone',
            'aws_region',
            'bootstrap_actions',
            'bootstrap_cmds',
            'bootstrap_files',
            'bootstrap_python_packages',
            'ec2_core_instance_bid_price',
            'ec2_core_instance_type',
            'ec2_instance_type',
            'ec2_key_pair',
            'ec2_master_instance_bid_price',
            'ec2_master_instance_type',
            'ec2_slave_instance_type',
            'ec2_task_instance_bid_price',
            'ec2_task_instance_type',
            'emr_endpoint',
            'emr_job_flow_pool_name',
            'enable_emr_debugging',
            'hadoop_version',
            'num_ec2_core_instances',
            'num_ec2_instances',
            'num_ec2_task_instances',
            'pool_emr_job_flows',
            's3_endpoint',
            's3_log_uri',
            's3_scratch_uri',
            's3_sync_wait_time',
        ),
    }

    # Scrape options from MRJob and index them by dest
    mr_job = MRJob()
    job_option_groups = mr_job.all_option_groups()
    scrape_options_into_new_groups(job_option_groups, assignments)

    return option_parser

def test_empty(self):
    mr_job = MRJob()
    self.assertEqual(mr_job._runner_kwargs()['partitioner'], None)

def test_override_libjars(self):
    with patch.object(MRJob, 'libjars', return_value=['honey.jar']):
        job = MRJob(['--libjar', 'cookie.jar'])

        # ignore switch, don't resolve relative path
        self.assertEqual(job._runner_kwargs()['libjars'], ['honey.jar'])

def test_libjars_attr_plus_option(self):
    with patch.object(MRJob, 'LIBJARS', ['/left/dora.jar']):
        job = MRJob(['--libjar', 'honey.jar'])

        self.assertEqual(job._runner_kwargs()['libjars'],
                         ['/left/dora.jar', 'honey.jar'])

def test_libjars_attr(self):
    with patch.object(MRJob, 'LIBJARS', ['/left/dora.jar']):
        job = MRJob()

        self.assertEqual(job._runner_kwargs()['libjars'], ['/left/dora.jar'])

def test_libjar_option(self):
    job = MRJob(['--libjar', 'honey.jar'])

    self.assertEqual(job._runner_kwargs()['libjars'], ['honey.jar'])

def test_default(self):
    job = MRJob()

    self.assertEqual(job._runner_kwargs()['libjars'], [])

def test_empty(self):
    mr_job = MRJob()
    self.assertEqual(mr_job._runner_kwargs()['jobconf'], {})

def test_unicode_counter(self):
    mr_job = MRJob().sandbox()

    # shouldn't raise an exception
    mr_job.increment_counter(u'💩', 'x', 1)

def test_unicode_set_status(self):
    mr_job = MRJob().sandbox()

    # shouldn't raise an exception
    mr_job.set_status(u'💩')

def test_init_does_not_require_tzset(self):
    # just constructing the job should not raise
    MRJob()

def test_default_protocol(self):
    job = MRJob()

    self.assertEqual(job.parse_output_line(b'1\t2\n'), (1, 2))