def _mapper_arg_chain(self, step_dict, step_num, input_path):
    procs_args = []

    procs_args.append(self._cat_args(input_path))

    if 'mapper' in step_dict:
        procs_args.append(
            shlex_split(self._substep_cmd_line(step_num, 'mapper')))

    if 'combiner' in step_dict:
        procs_args.append(['sort'])
        # _substep_args may return more than one process
        procs_args.append(
            shlex_split(self._substep_cmd_line(step_num, 'combiner')))

    return procs_args
def _substep_args(self, step_num, mrc):
    step = self._get_step(step_num)

    if step[mrc]['type'] == 'command':
        cmd = step[mrc]['command']

        # never wrap custom hadoop streaming commands in bash
        if isinstance(cmd, string_types):
            return shlex_split(cmd)
        else:
            return cmd

    elif step[mrc]['type'] == 'script':
        script_args = self._script_args_for_step(
            step_num, mrc, input_manifest=step.get('input_manifest'))

        if 'pre_filter' in step[mrc]:
            return self._sh_wrap(
                '%s | %s' % (step[mrc]['pre_filter'],
                             cmd_line(script_args)))
        else:
            return script_args
    else:
        raise ValueError("Invalid %s step %d: %r" % (
            mrc, step_num, step[mrc]))
def _substep_args(self, step_dict, step_num, mrc, input_path=None):
    if step_dict['type'] != 'streaming':
        raise Exception("LocalMRJobRunner cannot run %s steps." %
                        step_dict['type'])

    if step_dict[mrc]['type'] == 'command':
        if input_path is None:
            return [shlex_split(step_dict[mrc]['command'])]
        else:
            return [['cat', input_path],
                    shlex_split(step_dict[mrc]['command'])]

    if step_dict[mrc]['type'] == 'script':
        args = self._script_args_for_step(step_num, mrc)

        if input_path is None:
            return [args]
        else:
            return [args + [input_path]]
def get_mock_hadoop_cmd_args():
    """Get the arguments for each invocation of hadoop, as a list of lists
    (one inner list per invocation, not including the hadoop binary's
    path)."""
    cmd_log = os.path.join(get_mock_dir(), 'cmd.log')

    if not os.path.exists(cmd_log):
        return []

    with open(cmd_log) as f:
        return [shlex_split(cmd) for cmd in f]
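# Hedged usage sketch (not part of the original module): how a test might
# inspect the invocations recorded by the mock hadoop binary. The helper name
# and the assertion are illustrative assumptions, not code from the real
# test suite; the 'jar' filtering mirrors the end-to-end tests below.
def _example_assert_jar_step_count(expected):
    invocations = get_mock_hadoop_cmd_args()

    # each entry is one hadoop invocation, e.g. ['fs', '-put', src, dst]
    jar_invocations = [args for args in invocations if args[:1] == ['jar']]

    assert len(jar_invocations) == expected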
def _reducer_arg_chain(self, step_dict, step_num, input_path):
    if 'reducer' not in step_dict:
        return []

    procs_args = []

    procs_args.append(self._cat_args(input_path))
    procs_args.append(
        shlex_split(self._substep_cmd_line(step_num, 'reducer')))

    return procs_args
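# Hedged sketch (not mrjob's actual plumbing): one way the argument chains
# returned by _mapper_arg_chain()/_reducer_arg_chain() could be wired into a
# shell-style pipeline, with each process reading the previous one's stdout.
# The helper name and the output-file handling are assumptions for
# illustration only.
def _example_run_arg_chain(procs_args, output_file):
    import subprocess

    procs = []
    prev_stdout = None

    for i, args in enumerate(procs_args):
        is_last = (i == len(procs_args) - 1)
        proc = subprocess.Popen(
            args,
            stdin=prev_stdout,
            stdout=(output_file if is_last else subprocess.PIPE))
        procs.append(proc)
        prev_stdout = proc.stdout

    for proc in procs:
        proc.wait()

    return procs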
def combine_cmds(*cmds):
    """Take zero or more commands to run on the command line, and return
    the last one that is not ``None``. Each command should either be a list
    containing the command plus switches, or a string, which will be parsed
    with :py:func:`shlex.split`. The string must either be a byte string or a
    unicode string containing no non-ASCII characters.

    Returns either ``None`` or a list containing the command plus arguments.
    """
    cmd = combine_values(*cmds)

    if cmd is None:
        return None
    elif isinstance(cmd, basestring):
        return shlex_split(cmd)
    else:
        return list(cmd)
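# Hedged examples of combine_cmds() behavior, inferred from its docstring and
# body; the command values below are made up for illustration.
def _example_combine_cmds_behavior():
    assert combine_cmds('python', None) == ['python']           # last non-None wins
    assert combine_cmds('python', 'python3 -u') == ['python3', '-u']  # strings are shlex-split
    assert combine_cmds('python', ['python3', '-u']) == ['python3', '-u']  # lists are copied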
def main(cmd_line_args=None):
    if cmd_line_args is None:
        cmd_line_args = sys.argv[1:]

    parser = _make_arg_parser()
    args = parser.parse_args(cmd_line_args)

    # get job_class
    job_module_name, job_class_name = args.job_class.rsplit('.', 1)
    job_module = import_module(job_module_name)
    job_class = getattr(job_module, job_class_name)

    if args.job_args:
        job_args = shlex_split(args.job_args)
    else:
        job_args = []

    def make_job(*args):
        j = job_class(job_args + list(args))
        j.sandbox()  # so Spark doesn't try to serialize stdin
        return j

    # get job steps. don't pass --steps, which is deprecated
    steps = make_job().steps()

    # pick steps
    start = args.first_step_num
    end = None if args.last_step_num is None else args.last_step_num + 1
    steps_to_run = list(enumerate(steps))[start:end]

    # load initial data
    from pyspark import SparkContext
    sc = SparkContext()
    rdd = sc.textFile(args.input_path, use_unicode=False)

    # run steps
    for step_num, step in steps_to_run:
        rdd = _run_step(step, step_num, rdd, make_job)

    # write the results
    rdd.saveAsTextFile(
        args.output_path, compressionCodecClass=args.compression_codec)
def main():
    usage = 'usage: %prog JOB_FLOW_ID OUTPUT_DIR [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR job'
                   ' flow. Store stdout and stderr for results in'
                   ' OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)

    assignments = {
        option_parser: ('conf_paths', 'quiet', 'verbose', 'ec2_key_pair_file')
    }

    option_parser.add_option('-o', '--output-dir', dest='output_dir',
                             default=None,
                             help="Specify an output directory (default:"
                             " JOB_FLOW_ID)")

    mr_job = MRJob()
    scrape_options_into_new_groups(mr_job.all_option_groups(), assignments)

    options, args = option_parser.parse_args()

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('output_dir', 'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    job_flow_id, cmd_string = args[:2]
    cmd_args = shlex_split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or job_flow_id)

    with EMRJobRunner(emr_job_flow_id=job_flow_id, **runner_kwargs) as runner:
        runner._enable_slave_ssh_access()
        run_on_all_nodes(runner, output_dir, cmd_args)
def main(cl_args=None):
    usage = 'usage: %(prog)s CLUSTER_ID [options] "command string"'
    description = ('Run a command on the master and all worker nodes of an'
                   ' EMR cluster. Store stdout/stderr for results in'
                   ' OUTPUT_DIR.')

    arg_parser = ArgumentParser(usage=usage, description=description)

    arg_parser.add_argument('-o', '--output-dir', dest='output_dir',
                            default=None,
                            help="Specify an output directory (default:"
                            " CLUSTER_ID)")

    arg_parser.add_argument(dest='cluster_id',
                            help='ID of cluster to run command on')
    arg_parser.add_argument(dest='cmd_string',
                            help='command to run, as a single string')

    _add_basic_args(arg_parser)
    _add_runner_args(
        arg_parser,
        {'ec2_key_pair_file', 'ssh_bin'} | _filter_by_role(
            EMRJobRunner.OPT_NAMES, 'connect')
    )

    _alphabetize_actions(arg_parser)

    options = arg_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('cluster_id', 'cmd_string',
                       'output_dir', 'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    cmd_args = shlex_split(options.cmd_string)

    output_dir = os.path.abspath(options.output_dir or options.cluster_id)

    with EMRJobRunner(
            cluster_id=options.cluster_id, **runner_kwargs) as runner:
        _run_on_all_nodes(runner, output_dir, cmd_args)
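# Hedged usage sketch: the main() above is the entry point of a
# "run this command on every node" tool. Invoked from a shell it would look
# roughly like the line below; the module path, cluster ID, and command are
# illustrative assumptions, while the -o flag and positional arguments come
# from the parser defined above.
#
#   python -m mrjob.tools.emr.mrboss j-1234567890ABC -o ./uptime_logs "uptime"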
def main(cmd_line_args=None):
    if cmd_line_args is None:
        cmd_line_args = sys.argv[1:]

    parser = _make_arg_parser()
    args = parser.parse_args(cmd_line_args)

    # get job_class
    job_module_name, job_class_name = args.job_class.rsplit('.', 1)
    job_module = import_module(job_module_name)
    job_class = getattr(job_module, job_class_name)

    if args.job_args:
        job_args = shlex_split(args.job_args)
    else:
        job_args = []

    def make_job(*args):
        j = job_class(job_args + list(args))
        j.sandbox()  # so Spark doesn't try to serialize stdin
        return j

    # load initial data
    from pyspark import SparkContext
    sc = SparkContext()
    rdd = sc.textFile(args.input_path, use_unicode=False)

    # get job steps. don't pass --steps, which is deprecated
    steps = make_job().steps()

    # process steps
    steps_to_run = list(enumerate(steps))[args.start_step:args.end_step]

    for step_num, step in steps_to_run:
        rdd = _run_step(step, step_num, rdd, make_job)

    # write the results
    rdd.saveAsTextFile(
        args.output_path, compressionCodecClass=args.compression_codec)
def main(cl_args=None):
    usage = 'usage: %prog CLUSTER_ID OUTPUT_DIR [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR'
                   ' cluster. Store stdout/stderr for results in OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)

    option_parser.add_option('-o', '--output-dir', dest='output_dir',
                             default=None,
                             help="Specify an output directory (default:"
                             " CLUSTER_ID)")

    add_basic_opts(option_parser)
    add_emr_connect_opts(option_parser)

    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: ('ec2_key_pair_file', 'ssh_bin'),
    })

    alphabetize_options(option_parser)
    options, args = option_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('output_dir', 'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    cluster_id, cmd_string = args[:2]
    cmd_args = shlex_split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or cluster_id)

    with EMRJobRunner(cluster_id=cluster_id, **runner_kwargs) as runner:
        runner._enable_slave_ssh_access()
        run_on_all_nodes(runner, output_dir, cmd_args)
def main(cl_args=None):
    usage = 'usage: %prog CLUSTER_ID [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR'
                   ' cluster. Store stdout/stderr for results in OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)

    option_parser.add_option('-o', '--output-dir', dest='output_dir',
                             default=None,
                             help="Specify an output directory (default:"
                             " CLUSTER_ID)")

    _add_basic_options(option_parser)
    _add_runner_options(
        option_parser,
        _pick_runner_opts('emr', 'connect') | set(
            ['ssh_bin', 'ec2_key_pair_file'])
    )

    _alphabetize_options(option_parser)
    options, args = option_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('output_dir', 'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    cluster_id, cmd_string = args[:2]
    cmd_args = shlex_split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or cluster_id)

    with EMRJobRunner(cluster_id=cluster_id, **runner_kwargs) as runner:
        _run_on_all_nodes(runner, output_dir, cmd_args)
def main(cmd_line_args=None):
    if cmd_line_args is None:
        cmd_line_args = sys.argv[1:]

    parser = _make_arg_parser()
    args = parser.parse_args(cmd_line_args)

    if args.num_reducers is not None and args.num_reducers <= 0:
        raise ValueError(
            'You can only configure num_reducers to a positive number.')

    # get job_class
    job_module_name, job_class_name = args.job_class.rsplit('.', 1)
    job_module = import_module(job_module_name)
    job_class = getattr(job_module, job_class_name)

    # load initial data
    from pyspark import SparkContext

    if args.job_args:
        job_args = shlex_split(args.job_args)
    else:
        job_args = []

    # get job steps. don't pass --steps, which is deprecated
    job = job_class(job_args)
    steps = job.steps()

    # pick steps
    start = args.first_step_num or 0
    end = None if args.last_step_num is None else args.last_step_num + 1
    steps_to_run = list(enumerate(steps))[start:end]

    sc = SparkContext()

    # keep track of one set of counters per job step
    counter_accumulators = [
        sc.accumulator(defaultdict(dict), CounterAccumulator())
        for _ in steps_to_run
    ]

    def make_increment_counter(step_num):
        counter_accumulator = counter_accumulators[step_num - start]

        def increment_counter(group, name, amount=1):
            counter_accumulator.add({group: {name: amount}})

        return increment_counter

    def make_job(mrc, step_num):
        j = job_class(job_args + ['--%s' % mrc, '--step-num=%d' % step_num])
        j.sandbox()  # so Spark doesn't try to serialize stdin

        # patch increment_counter() to update the accumulator for this step
        j.increment_counter = make_increment_counter(step_num)

        return j

    try:
        if job.hadoop_input_format() is not None:
            rdd = sc.hadoopFile(
                args.input_path,
                inputFormatClass=job.hadoop_input_format(),
                keyClass='org.apache.hadoop.io.Text',
                valueClass='org.apache.hadoop.io.Text')

            # hadoopFile loads each line as a key-value pair in which the
            # contents of the line are the key and the value is an empty
            # string. Convert to an rdd of just lines, encoded as bytes.
            rdd = rdd.map(lambda kv: kv[0].encode('utf-8'))
        else:
            rdd = sc.textFile(args.input_path, use_unicode=False)

        # run steps
        for step_num, step in steps_to_run:
            rdd = _run_step(step, step_num, rdd, make_job, args.num_reducers)

        # write the results
        if job.hadoop_output_format() is not None:
            # saveAsHadoopFile takes an rdd of key-value pairs, so convert
            # to that format
            rdd = rdd.map(lambda line: tuple(
                x.decode('utf-8') for x in line.split(b'\t', 1)))

            rdd.saveAsHadoopFile(
                args.output_path,
                outputFormatClass=job.hadoop_output_format(),
                compressionCodecClass=args.compression_codec)
        else:
            rdd.saveAsTextFile(
                args.output_path,
                compressionCodecClass=args.compression_codec)
    finally:
        if args.counter_output_dir is not None:
            counters = [ca.value for ca in counter_accumulators]

            sc.parallelize(
                [json.dumps(counters)],
                numSlices=1).saveAsTextFile(args.counter_output_dir)
def _test_end_to_end(self, args=()):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\n')
    remote_input_path = 'hdfs:///data/foo'
    check_call([self.hadoop_bin,
                'fs', '-put', input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([''])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(
        ['-r', 'hadoop', '-v', '--no-conf',
         '--hadoop-arg', '-libjar', '--hadoop-arg', 'containsJars.jar'] +
        list(args) +
        ['-', local_input_path, remote_input_path] +
        ['--jobconf', 'x=y'])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ['MOCK_HDFS_ROOT']
        self.assertEqual(sorted(os.listdir(hdfs_root)), ['data', 'user'])
        home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
        self.assertEqual(os.listdir(home_dir), ['tmp'])
        self.assertEqual(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])
        self.assertEqual(runner._opts['hadoop_extra_args'],
                         ['-libjar', 'containsJars.jar'])

        # make sure mrjob.tar.gz was uploaded
        self.assertTrue(os.path.exists(runner._mrjob_tar_gz_path))
        self.assertIn(runner._mrjob_tar_gz_path,
                      runner._upload_mgr.path_to_uri())

        # make sure setup script exists, and mrjob.tar.gz is added
        # to PYTHONPATH in it
        self.assertTrue(os.path.exists(runner._setup_wrapper_script_path))
        self.assertIn(runner._setup_wrapper_script_path,
                      runner._upload_mgr.path_to_uri())
        mrjob_tar_gz_name = runner._working_dir_mgr.name(
            'archive', runner._mrjob_tar_gz_path)
        with open(runner._setup_wrapper_script_path) as wrapper:
            self.assertTrue(any(
                ('export PYTHONPATH' in line and mrjob_tar_gz_name in line)
                for line in wrapper))

    self.assertEqual(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure we called hadoop the way we expected
    with open(os.environ['MOCK_HADOOP_LOG']) as mock_log:
        hadoop_cmd_args = [shlex_split(cmd) for cmd in mock_log]

    jar_cmd_args = [cmd_args for cmd_args in hadoop_cmd_args
                    if cmd_args[:1] == ['jar']]
    self.assertEqual(len(jar_cmd_args), 2)
    step_0_args, step_1_args = jar_cmd_args

    # check input/output format
    self.assertIn('-inputformat', step_0_args)
    self.assertNotIn('-outputformat', step_0_args)
    self.assertNotIn('-inputformat', step_1_args)
    self.assertIn('-outputformat', step_1_args)

    # make sure -libjar extra arg comes before -mapper
    for args in (step_0_args, step_1_args):
        self.assertIn('-libjar', args)
        self.assertIn('-mapper', args)
        self.assertLess(args.index('-libjar'), args.index('-mapper'))

    # make sure -jobconf made it through
    self.assertIn('-D', step_0_args)
    self.assertIn('x=y', step_0_args)
    self.assertIn('-D', step_1_args)
    # job overrides jobconf in step 1
    self.assertIn('x=z', step_1_args)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))
def _filter_if_any(self, substep_dict):
    if substep_dict['type'] == 'script':
        if 'pre_filter' in substep_dict:
            return shlex_split(substep_dict['pre_filter'])

    return None
def _test_end_to_end(self, args=()):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\n')
    remote_input_path = 'hdfs:///data/foo'
    check_call([
        self.hadoop_bin, 'fs', '-put', input_to_upload, remote_input_path
    ])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([''])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(
        ['-r', 'hadoop', '-v', '--no-conf',
         '--hadoop-arg', '-libjar', '--hadoop-arg', 'containsJars.jar'] +
        list(args) +
        ['-', local_input_path, remote_input_path] +
        ['--jobconf', 'x=y'])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ['MOCK_HDFS_ROOT']
        self.assertEqual(sorted(os.listdir(hdfs_root)), ['data', 'user'])
        home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
        self.assertEqual(os.listdir(home_dir), ['tmp'])
        self.assertEqual(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])
        self.assertEqual(runner._opts['hadoop_extra_args'],
                         ['-libjar', 'containsJars.jar'])

        # make sure mrjob.tar.gz was uploaded and added to PYTHONPATH
        self.assertIsNotNone(runner._mrjob_tar_gz_path)
        self.assertIn(runner._mrjob_tar_gz_path,
                      runner._upload_mgr.path_to_uri())

        name = runner._working_dir_mgr.name(
            'archive', runner._mrjob_tar_gz_path)
        pythonpath = runner._get_cmdenv()['PYTHONPATH']
        self.assertIn(name, pythonpath.split(':'))

    self.assertEqual(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure we called hadoop the way we expected
    with open(os.environ['MOCK_HADOOP_LOG']) as mock_log:
        hadoop_cmd_args = [shlex_split(line) for line in mock_log]

    jar_cmd_args = [
        args for args in hadoop_cmd_args if args[:1] == ['jar']
    ]
    self.assertEqual(len(jar_cmd_args), 2)
    step_0_args, step_1_args = jar_cmd_args

    # check input/output format
    self.assertIn('-inputformat', step_0_args)
    self.assertNotIn('-outputformat', step_0_args)
    self.assertNotIn('-inputformat', step_1_args)
    self.assertIn('-outputformat', step_1_args)

    # make sure -libjar extra arg comes before -mapper
    for args in (step_0_args, step_1_args):
        self.assertIn('-libjar', args)
        self.assertIn('-mapper', args)
        self.assertLess(args.index('-libjar'), args.index('-mapper'))

    # make sure -jobconf made it through
    self.assertIn('-D', step_0_args)
    self.assertIn('x=y', step_0_args)
    self.assertIn('-D', step_1_args)
    # job overrides jobconf in step 1
    self.assertIn('x=z', step_1_args)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))
def main(cmd_line_args=None):
    if cmd_line_args is None:
        cmd_line_args = sys.argv[1:]

    parser = _make_arg_parser()
    args = parser.parse_args(cmd_line_args)

    if args.num_reducers is not None and args.num_reducers <= 0:
        raise ValueError(
            'You can only configure num_reducers to a positive number.')

    # get job_class
    job_module_name, job_class_name = args.job_class.rsplit('.', 1)
    job_module = import_module(job_module_name)
    job_class = getattr(job_module, job_class_name)

    # load initial data
    from pyspark import SparkContext

    if args.job_args:
        job_args = shlex_split(args.job_args)
    else:
        job_args = []

    # determine hadoop_*_format, steps
    # try to avoid instantiating a job in the driver; see #2044
    job = None

    if args.hadoop_input_format is None:
        job = job or job_class(job_args)
        hadoop_input_format = job.hadoop_input_format()
    else:
        hadoop_input_format = args.hadoop_input_format or None

    if args.hadoop_output_format is None:
        job = job or job_class(job_args)
        hadoop_output_format = job.hadoop_output_format()
    else:
        hadoop_output_format = args.hadoop_output_format or None

    if args.sort_values is None:
        job = job or job_class(job_args)
        sort_values = job.sort_values()
    else:
        sort_values = args.sort_values

    if args.steps_desc is None:
        job = job or job_class(job_args)
        steps = [step.description(step_num)
                 for step_num, step in enumerate(job.steps())]
    else:
        steps = json.loads(args.steps_desc)

    # pick steps
    start = args.first_step_num or 0
    end = None if args.last_step_num is None else args.last_step_num + 1
    steps_to_run = list(enumerate(steps))[start:end]

    sc = SparkContext()

    # keep track of one set of counters per job step
    counter_accumulators = [
        sc.accumulator(defaultdict(dict), CounterAccumulator())
        for _ in steps_to_run
    ]

    def make_increment_counter(step_num):
        counter_accumulator = counter_accumulators[step_num - start]

        def increment_counter(group, name, amount=1):
            counter_accumulator.add({group: {name: amount}})

        return increment_counter

    def make_mrc_job(mrc, step_num):
        j = job_class(job_args + [
            '--%s' % mrc, '--step-num=%d' % step_num
        ])

        # patch increment_counter() to update the accumulator for this step
        j.increment_counter = make_increment_counter(step_num)

        return j

    try:
        if hadoop_input_format:
            rdd = sc.hadoopFile(
                args.input_path,
                inputFormatClass=hadoop_input_format,
                keyClass='org.apache.hadoop.io.Text',
                valueClass='org.apache.hadoop.io.Text')

            # hadoopFile loads each line as a key-value pair in which the
            # contents of the line are the key and the value is an empty
            # string. Convert to an rdd of just lines, encoded as bytes.
            rdd = rdd.map(lambda kv: kv[0].encode('utf-8'))
        else:
            rdd = sc.textFile(args.input_path, use_unicode=False)

        # run steps
        for step_num, step in steps_to_run:
            rdd = _run_step(step, step_num, rdd, make_mrc_job,
                            args.num_reducers, sort_values)

        # max_output_files: limit number of partitions
        if args.max_output_files:
            rdd = rdd.coalesce(args.max_output_files)

        # write the results
        if hadoop_output_format:
            # saveAsHadoopFile takes an rdd of key-value pairs, so convert
            # to that format
            rdd = rdd.map(lambda line: tuple(
                x.decode('utf-8') for x in line.split(b'\t', 1)))

            rdd.saveAsHadoopFile(
                args.output_path,
                outputFormatClass=hadoop_output_format,
                compressionCodecClass=args.compression_codec)
        else:
            rdd.saveAsTextFile(
                args.output_path,
                compressionCodecClass=args.compression_codec)
    finally:
        if args.counter_output_dir is not None:
            counters = [ca.value for ca in counter_accumulators]

            sc.parallelize(
                [json.dumps(counters)],
                numSlices=1).saveAsTextFile(args.counter_output_dir)
def main(cmd_line_args=None):
    if cmd_line_args is None:
        cmd_line_args = sys.argv[1:]

    parser = _make_arg_parser()
    args = parser.parse_args(cmd_line_args)

    if args.num_reducers is not None and args.num_reducers <= 0:
        raise ValueError(
            'You can only configure num_reducers to a positive number.')

    # get job_class
    job_module_name, job_class_name = args.job_class.rsplit('.', 1)
    job_module = import_module(job_module_name)
    job_class = getattr(job_module, job_class_name)

    # load initial data
    from pyspark import SparkContext

    if args.job_args:
        job_args = shlex_split(args.job_args)
    else:
        job_args = []

    # determine hadoop_*_format, steps
    # try to avoid instantiating a job in the driver; see #2044
    job = None

    if args.hadoop_input_format is None:
        job = job or job_class(job_args)
        hadoop_input_format = job.hadoop_input_format()
    else:
        hadoop_input_format = args.hadoop_input_format or None

    if args.hadoop_output_format is None:
        job = job or job_class(job_args)
        hadoop_output_format = job.hadoop_output_format()
    else:
        hadoop_output_format = args.hadoop_output_format or None

    if args.sort_values is None:
        job = job or job_class(job_args)
        sort_values = job.sort_values()
    else:
        sort_values = args.sort_values

    if args.steps_desc is None:
        job = job or job_class(job_args)
        steps = [
            step.description(step_num)
            for step_num, step in enumerate(job.steps())
        ]
    else:
        steps = json.loads(args.steps_desc)

    # pick steps
    start = args.first_step_num or 0
    end = None if args.last_step_num is None else args.last_step_num + 1
    steps_to_run = list(enumerate(steps))[start:end]

    sc = SparkContext()

    # keep track of one set of counters per job step
    counter_accumulators = [
        sc.accumulator(defaultdict(dict), CounterAccumulator())
        for _ in steps_to_run
    ]

    def make_increment_counter(step_num):
        counter_accumulator = counter_accumulators[step_num - start]

        def increment_counter(group, counter, amount=1):
            counter_accumulator.add({group: {counter: amount}})

        return increment_counter

    def make_mrc_job(mrc, step_num):
        j = job_class(job_args + ['--%s' % mrc, '--step-num=%d' % step_num])

        # patch increment_counter() to update the accumulator for this step
        j.increment_counter = make_increment_counter(step_num)

        # if skip_internal_protocol is true, patch internal_protocol() to
        # return an object whose *read* and *write* attributes are ``None``
        if args.skip_internal_protocol:
            j.internal_protocol = lambda: _NO_INTERNAL_PROTOCOL

        return j

    # --emulate-map-input-file doesn't work with hadoop_input_format
    emulate_map_input_file = (
        args.emulate_map_input_file and not hadoop_input_format)

    try:
        if emulate_map_input_file:
            # load an rdd with pairs of (input_path, line). *path* here
            # has to be a single path, not a comma-separated list
            rdd = sc.union([
                _text_file_with_path(sc, path)
                for path in args.input_path.split(',')
            ])
        elif hadoop_input_format:
            rdd = sc.hadoopFile(
                args.input_path,
                inputFormatClass=hadoop_input_format,
                keyClass='org.apache.hadoop.io.Text',
                valueClass='org.apache.hadoop.io.Text')

            # hadoopFile loads each line as a key-value pair in which the
            # contents of the line are the key and the value is an empty
            # string. Convert to an rdd of just lines, encoded as bytes.
            rdd = rdd.map(lambda kv: kv[0].encode('utf-8'))
        else:
            rdd = sc.textFile(args.input_path, use_unicode=False)

        # run steps
        for step_num, step in steps_to_run:
            rdd = _run_step(step, step_num, rdd, make_mrc_job,
                            args.num_reducers, sort_values,
                            emulate_map_input_file,
                            args.skip_internal_protocol)

        # max_output_files: limit number of partitions
        if args.max_output_files:
            rdd = rdd.coalesce(args.max_output_files)

        # write the results
        if hadoop_output_format:
            # saveAsHadoopFile takes an rdd of key-value pairs, so convert
            # to that format
            rdd = rdd.map(lambda line: tuple(
                x.decode('utf-8') for x in line.split(b'\t', 1)))

            rdd.saveAsHadoopFile(
                args.output_path,
                outputFormatClass=hadoop_output_format,
                compressionCodecClass=args.compression_codec)
        else:
            rdd.saveAsTextFile(
                args.output_path,
                compressionCodecClass=args.compression_codec)
    finally:
        if args.counter_output_dir is not None:
            counters = [ca.value for ca in counter_accumulators]

            # if the output dir is a URI, write the counters through Spark;
            # otherwise write them directly to the local directory
            if is_uri(args.counter_output_dir):
                sc.parallelize(
                    [json.dumps(counters)],
                    numSlices=1).saveAsTextFile(args.counter_output_dir)
            else:
                # use the regular built-in file writer, since Spark won't
                # create the part-* file for a plain local path
                path = os.path.join(args.counter_output_dir, 'part-00000')

                if not os.path.exists(args.counter_output_dir):
                    os.mkdir(args.counter_output_dir)

                with open(path, 'w') as wb:
                    wb.write(str(json.dumps(counters)))