def main():
    """Command-line entry point: parse arguments, configure mrjob logging,
    then build a ZipNum Cluster from the given CDX inputs and write the
    summary/loc files into the output directory.
    """
    parser = ArgumentParser()
    parser.add_argument('output',
                        help='ZipNum Cluster Output directory')
    parser.add_argument('inputs', nargs='+',
                        help='CDX Input glob eg: /cdx/*.cdx.gz')
    parser.add_argument('-s', '--shards', default=10, type=int,
                        help='Number of ZipNum Cluster shards to create')
    parser.add_argument('-l', '--numlines', default=3000, type=int,
                        help='Number of lines per gzip block (default 3000)')
    # Fix: help text typo 'parllel' -> 'parallel'
    parser.add_argument('-p', '--parallel', action='store_true',
                        help='Run in parallel (multiple maps/reducer processes)')

    r = parser.parse_args()

    # Route mrjob logging to stderr at INFO, but silence the chatty
    # mrjob.compat logger.
    MRJobLauncher.set_up_logging(quiet=False, verbose=False,
                                 stream=sys.stderr)
    log.setLevel(logging.INFO)
    compat_log = logging.getLogger('mrjob.compat')
    compat_log.setLevel(logging.ERROR)

    run_job(r.inputs, r.output, r.shards, r.parallel, r.numlines)
    build_summary_and_loc(r.output)
def test_hadoop_runner(self):
    """-r hadoop should produce a HadoopJobRunner even without a real
    Hadoop install (HADOOP_HOME is patched into the environment)."""
    hadoop_args = [
        "--no-conf", "-r", "hadoop", "",
        "--hadoop-streaming-jar", "HUNNY",
    ]
    launcher = MRJobLauncher(args=hadoop_args)
    fake_env = {"HADOOP_HOME": "100-Acre Wood"}
    with no_handlers_for_logger("mrjob.runner"):
        with patch.dict(os.environ, fake_env):
            with launcher.make_runner() as runner:
                self.assertIsInstance(runner, HadoopJobRunner)
def test_hadoop_runner(self):
    """Selecting the hadoop runner yields a HadoopJobRunner; Hadoop itself
    need not be installed since HADOOP_HOME is faked."""
    launcher = MRJobLauncher(
        args=['--no-conf', '-r', 'hadoop', '',
              '--hadoop-streaming-jar', 'HUNNY'])
    env_patch = patch.dict(os.environ, {'HADOOP_HOME': '100-Acre Wood'})
    with no_handlers_for_logger('mrjob.runner'):
        with env_patch:
            with launcher.make_runner() as runner:
                self.assertIsInstance(runner, HadoopJobRunner)
def test_hadoop_runner(self):
    """A launcher built with -r hadoop hands back a HadoopJobRunner.

    No Hadoop installation is required: the environment is patched.
    """
    cli = ['--no-conf', '-r', 'hadoop', '',
           '--hadoop-streaming-jar', 'HUNNY']
    launcher = MRJobLauncher(args=cli)
    with no_handlers_for_logger('mrjob.runner'):
        with patch.dict(os.environ, {'HADOOP_HOME': '100-Acre Wood'}):
            with launcher.make_runner() as runner:
                self.assertIsInstance(runner, HadoopJobRunner)
def test_no_output(self):
    """--no-output must keep runner output off the launcher's streams."""
    launcher = MRJobLauncher(args=['--no-conf', '--no-output', ''])
    launcher.sandbox()
    with patch.object(launcher, 'make_runner') as m_make_runner:
        stub = RunnerStub()
        _mock_context_mgr(m_make_runner, stub)
        stub.stream_output.return_value = ['a line']
        launcher.run_job()
        # nothing should have been echoed to stdout or stderr
        self.assertEqual(launcher.stdout.getvalue(), '')
        self.assertEqual(launcher.stderr.getvalue(), '')
def test_no_output(self):
    """With --no-output set, run_job() should emit nothing at all."""
    launcher = MRJobLauncher(args=["--no-conf", "--no-output", ""])
    launcher.sandbox()
    with patch.object(launcher, "make_runner") as m_make_runner:
        fake_runner = Mock()
        _mock_context_mgr(m_make_runner, fake_runner)
        fake_runner.stream_output.return_value = ["a line"]
        launcher.run_job()
        # both captured streams stay empty despite available output
        self.assertEqual(launcher.stdout.getvalue(), "")
        self.assertEqual(launcher.stderr.getvalue(), "")
def test_no_file_args_required(self):
    """Plain positional file args should be passed straight to the job;
    the word-count script must see both files (7 words total)."""
    first = self.makefile('words1', b'kit and caboodle\n')
    second = self.makefile('words2', b'baubles\nbangles and beads\n')
    launcher = MRJobLauncher(
        args=['-r', 'local', tests.sr_wc.__file__, first, second])
    launcher.sandbox()
    with launcher.make_runner() as runner:
        runner.run()
        output_lines = list(to_lines(runner.cat_output()))
        # a single line of output holding the total word count
        self.assertEqual(len(output_lines), 1)
        self.assertEqual(int(output_lines[0]), 7)
def test_custom_key_value_option_parsing(self):
    """--cmdenv KEY=VALUE options accumulate into a dict."""
    # single occurrence
    job = MRJobLauncher(args=['--cmdenv', 'FOO=bar', ''])
    self.assertEqual(job.options.cmdenv, {'FOO': 'bar'})

    # repeated key: last value wins; values may themselves contain '='
    job = MRJobLauncher(args=[
        '', '--cmdenv', 'FOO=bar',
        '--cmdenv', 'FOO=baz',
        '--cmdenv', 'BAZ=qux=quux'])
    self.assertEqual(job.options.cmdenv,
                     {'FOO': 'baz', 'BAZ': 'qux=quux'})

    # a bare key without '=' is rejected
    self.assertRaises(ValueError, MRJobLauncher,
                      args=['--cmdenv', 'FOO', ''])
def test_bad_option_types(self):
    """Unsupported optparse types/actions must raise OptionError."""
    launcher = MRJobLauncher(args=[''])
    # optparse has no 'set' type
    self.assertRaises(OptionError, launcher.add_passthrough_option,
                      '--stop-words', dest='stop_words',
                      type='set', default=None)
    # 'callback' actions aren't usable as passthrough options
    self.assertRaises(OptionError, launcher.add_passthrough_option,
                      '--leave-a-msg', dest='leave_a_msg',
                      action='callback', default=None)
def test_normal_python(self):
    """Under normal Python the launcher binds to the process streams:
    the raw streams on Python 2, their .buffer (bytes) layer on Python 3."""
    launcher = MRJobLauncher(args=['/path/to/script'])
    if PY2:
        expected = (sys.stdin, sys.stdout, sys.stderr)
    else:
        expected = (sys.stdin.buffer, sys.stdout.buffer, sys.stderr.buffer)
    self.assertEqual(launcher.stdin, expected[0])
    self.assertEqual(launcher.stdout, expected[1])
    self.assertEqual(launcher.stderr, expected[2])
def _test_job_runner_kwargs(self, runner_class, conf_only_options=()):
    """Check that ``<alias>_job_runner_kwargs()`` exposes exactly the
    runner's allowed option keys, minus conf-only options.

    :param runner_class: runner class whose kwargs method to exercise
    :param conf_only_options: extra option names to treat as conf-only,
        in addition to ``self.CONF_ONLY_OPTIONS``
    """
    launcher = MRJobLauncher(args=['/path/to/script'])
    method_name = '%s_job_runner_kwargs' % runner_class.alias
    kwargs = getattr(launcher, method_name)()

    option_names = set(kwargs) - self.NON_OPTION_KWARGS

    # Fix: previously the conf_only_options parameter was accepted but
    # silently ignored (only self.CONF_ONLY_OPTIONS was used). Default ()
    # keeps the old behavior byte-for-byte.
    conf_only = self.CONF_ONLY_OPTIONS | set(conf_only_options)
    self.assertEqual(
        option_names,
        runner_class.OPTION_STORE_CLASS.ALLOWED_KEYS - conf_only)
def test_no_output(self):
    """--no-output should leave both byte streams empty after run_job()."""
    launcher = MRJobLauncher(args=['--no-conf', '--no-output', ''])
    launcher.sandbox()
    with patch.object(launcher, 'make_runner') as m_make_runner:
        fake_runner = Mock()
        _mock_context_mgr(m_make_runner, fake_runner)
        fake_runner.stream_output.return_value = ['a line']
        launcher.run_job()
        # sandboxed streams are BytesIO, so compare against empty bytes
        self.assertEqual(launcher.stdout.getvalue(), b'')
        self.assertEqual(launcher.stderr.getvalue(), b'')
def _test_options_appear_in_single_opt_group(self):
    """No option dest may be registered in more than one *_opt_group."""
    launcher = MRJobLauncher(args=['/path/to/script'])

    # map each option dest to the set of opt-group attributes holding it
    dest_to_groups = defaultdict(set)
    for attr_name, group in launcher.__dict__.items():
        if not attr_name.endswith('_opt_group'):
            continue
        for opt in group.option_list:
            dest_to_groups[opt.dest].add(attr_name)

    duplicated = {dest: groups
                  for dest, groups in dest_to_groups.items()
                  if len(groups) > 1}
    self.assertEqual(duplicated, {})
def test_python3_jupyter_notebook(self):
    """Regression test for #1441: when stdout/stderr lack a .buffer
    attribute (as inside Jupyter), fall back to the raw stream.

    Runs on any Python platform, since the streams are mocks.
    """
    fake_stdin = Mock()
    fake_stdin.buffer = Mock()
    fake_stdout = Mock()
    del fake_stdout.buffer  # simulate a stream with no .buffer
    fake_stderr = Mock()
    del fake_stderr.buffer

    with patch.multiple(sys, stdin=fake_stdin, stdout=fake_stdout,
                        stderr=fake_stderr):
        launcher = MRJobLauncher(args=['/path/to/script'])
        # stdin has a buffer, so the buffer wins; the others fall back
        self.assertEqual(launcher.stdin, fake_stdin.buffer)
        self.assertEqual(launcher.stdout, fake_stdout)
        self.assertEqual(launcher.stderr, fake_stderr)
def _make_launcher(self, *args):
    """Make a launcher, add a mock runner (``launcher.mock_runner``), and
    set it up so that ``launcher.make_runner().__enter__()`` returns
    ``launcher.mock_runner()``.
    """
    launcher = MRJobLauncher(args=['--no-conf', ''] + list(args))
    launcher.sandbox()

    mock_runner = Mock()
    mock_runner.stream_output.return_value = [b'a line\n']
    launcher.mock_runner = mock_runner

    # MagicMock so __enter__ is available for the context manager protocol
    launcher.make_runner = MagicMock()
    launcher.make_runner.return_value.__enter__.return_value = mock_runner

    return launcher
def _run(args):
    """Build an MRJobLauncher from command-line-style args and run the job."""
    # imported lazily to avoid paying the mrjob import cost at module load
    from mrjob.launch import MRJobLauncher
    launcher = MRJobLauncher(args=args, from_cl=True)
    launcher.run_job()
def test_emr_runner(self):
    """-r emr should hand back an EMRJobRunner (S3 access is mocked)."""
    launcher = MRJobLauncher(args=["--no-conf", "-r", "emr", ""])
    with no_handlers_for_logger("mrjob"):
        with patch_fs_s3():
            with launcher.make_runner() as runner:
                self.assertIsInstance(runner, EMRJobRunner)
def test_local_runner(self):
    """Selecting -r local yields a LocalMRJobRunner."""
    cli = ['--no-conf', '-r', 'local', '']
    launcher = MRJobLauncher(args=cli)
    with no_handlers_for_logger('mrjob.runner'):
        with launcher.make_runner() as runner:
            self.assertIsInstance(runner, LocalMRJobRunner)
def test_emr_runner(self):
    """A launcher configured for emr builds an EMRJobRunner; the S3
    filesystem is patched so no real AWS access occurs."""
    cli = ['--no-conf', '-r', 'emr', '']
    launcher = MRJobLauncher(args=cli)
    with no_handlers_for_logger('mrjob'):
        with patch_fs_s3():
            with launcher.make_runner() as runner:
                self.assertIsInstance(runner, EMRJobRunner)
def test_emr_runner(self):
    """make_runner() on an emr-configured launcher returns an EMRJobRunner."""
    launcher = MRJobLauncher(args=['--no-conf', '-r', 'emr', ''])
    with no_handlers_for_logger('mrjob.runner'):
        with launcher.make_runner() as runner:
            self.assertIsInstance(runner, EMRJobRunner)
def test_local_runner(self):
    """make_runner() honors -r local by producing a LocalMRJobRunner."""
    local_args = ["--no-conf", "-r", "local", ""]
    launcher = MRJobLauncher(args=local_args)
    with no_handlers_for_logger("mrjob.runner"):
        with launcher.make_runner() as runner:
            self.assertIsInstance(runner, LocalMRJobRunner)