def main():
    """Command-line entry point: build a ZipNum cluster from CDX inputs.

    Parses the output directory, the input CDX paths, the shard count, the
    number of lines per gzip block and the parallel flag, sets up logging,
    then calls ``run_job`` followed by ``build_summary_and_loc`` on the
    output directory.
    """
    parser = ArgumentParser()
    parser.add_argument('output', help='ZipNum Cluster Output directory')
    parser.add_argument('inputs', nargs='+', help='CDX Input glob eg: /cdx/*.cdx.gz')
    parser.add_argument('-s', '--shards', default=10, type=int,
                        help='Number of ZipNum Cluster shards to create '
                             '(default 10)')

    parser.add_argument('-l', '--numlines', default=3000, type=int,
                        help='Number of lines per gzip block (default 3000)')

    parser.add_argument('-p', '--parallel', action='store_true',
                        help='Run in parallel (multiple maps/reducer processes)')

    r = parser.parse_args()

    MRJobLauncher.set_up_logging(quiet=False,
                                 verbose=False,
                                 stream=sys.stderr)

    # the module-level ``log`` logger reports at INFO, while the
    # 'mrjob.compat' logger is restricted to ERROR and above
    log.setLevel(logging.INFO)
    compat_log = logging.getLogger('mrjob.compat')
    compat_log.setLevel(logging.ERROR)

    run_job(r.inputs, r.output, r.shards, r.parallel, r.numlines)
    build_summary_and_loc(r.output)
Пример #2
0
 def test_hadoop_runner(self):
     """``-r hadoop`` should make the launcher build a HadoopJobRunner."""
     cli_args = ["--no-conf", "-r", "hadoop", "", "--hadoop-streaming-jar", "HUNNY"]
     launcher = MRJobLauncher(args=cli_args)
     # a HadoopJobRunner can't be instantiated without Hadoop installed,
     # so HADOOP_HOME is patched into the environment for this test
     fake_env = {"HADOOP_HOME": "100-Acre Wood"}
     with no_handlers_for_logger("mrjob.runner"):
         with patch.dict(os.environ, fake_env):
             with launcher.make_runner() as hadoop_runner:
                 self.assertIsInstance(hadoop_runner, HadoopJobRunner)
Пример #3
0
 def test_hadoop_runner(self):
     """Check that the ``hadoop`` runner alias produces a HadoopJobRunner."""
     launcher = MRJobLauncher(
         args=['--no-conf', '-r', 'hadoop', '',
               '--hadoop-streaming-jar', 'HUNNY'])
     # you can't instantiate a HadoopJobRunner without Hadoop installed,
     # hence the placeholder HADOOP_HOME
     with no_handlers_for_logger('mrjob.runner'), \
             patch.dict(os.environ, {'HADOOP_HOME': '100-Acre Wood'}), \
             launcher.make_runner() as runner:
         self.assertIsInstance(runner, HadoopJobRunner)
Пример #4
0
 def test_hadoop_runner(self):
     """The runner yielded by make_runner() for ``-r hadoop`` is a
     HadoopJobRunner.
     """
     runner_args = ['--no-conf', '-r', 'hadoop', '']
     jar_args = ['--hadoop-streaming-jar', 'HUNNY']
     launcher = MRJobLauncher(args=runner_args + jar_args)

     # you can't instantiate a HadoopJobRunner without Hadoop installed,
     # so the environment gets a stand-in HADOOP_HOME
     hadoop_env = dict(HADOOP_HOME='100-Acre Wood')
     with no_handlers_for_logger('mrjob.runner'):
         with patch.dict(os.environ, hadoop_env):
             with launcher.make_runner() as created_runner:
                 self.assertIsInstance(created_runner, HadoopJobRunner)
Пример #5
0
 def test_no_output(self):
     """With ``--no-output``, run_job() writes nothing to stdout/stderr."""
     launcher = MRJobLauncher(args=['--no-conf', '--no-output', ''])
     launcher.sandbox()
     with patch.object(launcher, 'make_runner') as mock_make_runner:
         stub_runner = RunnerStub()
         _mock_context_mgr(mock_make_runner, stub_runner)
         # the runner has a line of output, but none of it should be printed
         stub_runner.stream_output.return_value = ['a line']
         launcher.run_job()
         for stream in (launcher.stdout, launcher.stderr):
             self.assertEqual(stream.getvalue(), '')
Пример #6
0
 def test_no_output(self):
     """``--no-output`` must leave the launcher's stdout and stderr empty."""
     launcher = MRJobLauncher(args=["--no-conf", "--no-output", ""])
     launcher.sandbox()
     fake_runner = Mock()
     with patch.object(launcher, "make_runner") as make_runner_mock:
         _mock_context_mgr(make_runner_mock, fake_runner)
         # give the runner something to stream so silence is meaningful
         fake_runner.stream_output.return_value = ["a line"]
         launcher.run_job()
         self.assertEqual(launcher.stdout.getvalue(), "")
         self.assertEqual(launcher.stderr.getvalue(), "")
Пример #7
0
    def test_no_file_args_required(self):
        """Run ``tests.sr_wc`` on two input files with the local runner and
        check that it emits a single line whose integer value is 7.
        """
        first_input = self.makefile('words1', b'kit and caboodle\n')
        second_input = self.makefile('words2', b'baubles\nbangles and beads\n')

        cli_args = ['-r', 'local', tests.sr_wc.__file__,
                    first_input, second_input]
        launcher = MRJobLauncher(args=cli_args)
        launcher.sandbox()

        with launcher.make_runner() as runner:
            runner.run()

            output_lines = list(to_lines(runner.cat_output()))
            self.assertEqual(len(output_lines), 1)
            self.assertEqual(int(output_lines[0]), 7)
Пример #8
0
    def test_no_file_args_required(self):
        """The ``tests.sr_wc`` script, launched with ``-r local`` on two
        files, should output exactly one line that parses to the integer 7.
        """
        input_paths = [
            self.makefile('words1', b'kit and caboodle\n'),
            self.makefile('words2', b'baubles\nbangles and beads\n'),
        ]

        job = MRJobLauncher(
            args=['-r', 'local', tests.sr_wc.__file__] + input_paths)
        job.sandbox()

        with job.make_runner() as local_runner:
            local_runner.run()

            raw_output = local_runner.cat_output()
            lines = list(to_lines(raw_output))
            self.assertEqual(len(lines), 1)
            self.assertEqual(int(lines[0]), 7)
Пример #9
0
    def test_custom_key_value_option_parsing(self):
        """``--cmdenv KEY=VALUE`` switches end up in ``options.cmdenv``."""
        # simple example
        launcher = MRJobLauncher(args=['--cmdenv', 'FOO=bar', ''])
        self.assertEqual(launcher.options.cmdenv, {'FOO': 'bar'})

        # trickier example: a repeated key keeps its last value, and a
        # value may itself contain '='
        repeated_args = ['']
        for pair in ('FOO=bar', 'FOO=baz', 'BAZ=qux=quux'):
            repeated_args.extend(['--cmdenv', pair])
        launcher = MRJobLauncher(args=repeated_args)
        expected_env = {'FOO': 'baz', 'BAZ': 'qux=quux'}
        self.assertEqual(launcher.options.cmdenv, expected_env)

        # must have KEY=VALUE
        with self.assertRaises(ValueError):
            MRJobLauncher(args=['--cmdenv', 'FOO', ''])
Пример #10
0
 def test_bad_option_types(self):
     """add_passthrough_option() raises OptionError for ``type='set'``
     and for ``action='callback'``.
     """
     launcher = MRJobLauncher(args=[''])

     with self.assertRaises(OptionError):
         launcher.add_passthrough_option(
             '--stop-words', dest='stop_words', type='set', default=None)

     with self.assertRaises(OptionError):
         launcher.add_passthrough_option(
             '--leave-a-msg', dest='leave_a_msg', action='callback',
             default=None)
Пример #11
0
    def test_normal_python(self):
        """The launcher's stdin/stdout/stderr default to the ``sys`` streams
        when ``PY2`` is true, and to their ``.buffer`` attributes otherwise.
        """
        launcher = MRJobLauncher(args=['/path/to/script'])

        for stream_name in ('stdin', 'stdout', 'stderr'):
            actual = getattr(launcher, stream_name)
            sys_stream = getattr(sys, stream_name)
            expected = sys_stream if PY2 else sys_stream.buffer
            self.assertEqual(actual, expected)
Пример #12
0
    def _test_job_runner_kwargs(self, runner_class, conf_only_options=()):
        """Check that the launcher's ``<alias>_job_runner_kwargs()`` method
        returns exactly the option names that *runner_class* allows.

        Keyword names listed in ``self.NON_OPTION_KWARGS`` are ignored, and
        the keys in ``self.CONF_ONLY_OPTIONS`` are not expected to appear.

        NOTE(review): *conf_only_options* is accepted but never read here;
        ``self.CONF_ONLY_OPTIONS`` is used instead -- confirm this is
        intended.
        """
        launcher = MRJobLauncher(args=['/path/to/script'])

        kwargs_method = getattr(
            launcher, '%s_job_runner_kwargs' % runner_class.alias)
        runner_kwargs = kwargs_method()

        actual_names = set(runner_kwargs) - self.NON_OPTION_KWARGS
        allowed_keys = runner_class.OPTION_STORE_CLASS.ALLOWED_KEYS
        expected_names = allowed_keys - self.CONF_ONLY_OPTIONS

        self.assertEqual(actual_names, expected_names)
Пример #13
0
 def test_no_output(self):
     """Running with ``--no-output`` puts no bytes on stdout or stderr."""
     launcher = MRJobLauncher(args=['--no-conf', '--no-output', ''])
     launcher.sandbox()
     with patch.object(launcher, 'make_runner') as patched_make_runner:
         mock_runner = Mock()
         _mock_context_mgr(patched_make_runner, mock_runner)
         # the runner does have output; it just must not be echoed
         mock_runner.stream_output.return_value = ['a line']
         launcher.run_job()
         written_out = launcher.stdout.getvalue()
         self.assertEqual(written_out, b'')
         written_err = launcher.stderr.getvalue()
         self.assertEqual(written_err, b'')
Пример #14
0
    def _test_options_appear_in_single_opt_group(self):
        """Check that no option ``dest`` is listed in more than one of the
        launcher's ``*_opt_group`` attributes.
        """
        launcher = MRJobLauncher(args=['/path/to/script'])

        # option dest -> names of the *_opt_group attributes containing it
        groups_by_dest = defaultdict(set)

        for attr_name, opt_group in launcher.__dict__.items():
            if attr_name.endswith('_opt_group'):
                for opt in opt_group.option_list:
                    groups_by_dest[opt.dest].add(attr_name)

        # keep only the dests that show up in two or more groups
        shared_dests = {}
        for dest, group_names in groups_by_dest.items():
            if len(group_names) > 1:
                shared_dests[dest] = group_names

        self.assertEqual(shared_dests, {})
Пример #15
0
    def test_python3_jupyter_notebook(self):
        """Regression test for #1441.

        stdin keeps a ``buffer`` attribute while stdout and stderr have
        theirs deleted; the launcher is expected to use ``stdin.buffer``
        but the stdout/stderr objects themselves.
        """
        # this actually works on any Python platform, since we use mocks
        fake_stdin = Mock()
        fake_stdin.buffer = Mock()

        fake_stdout, fake_stderr = Mock(), Mock()
        for stream in (fake_stdout, fake_stderr):
            # deleting the attribute makes later access raise AttributeError
            del stream.buffer

        replacements = dict(
            stdin=fake_stdin, stdout=fake_stdout, stderr=fake_stderr)
        with patch.multiple(sys, **replacements):
            launcher = MRJobLauncher(args=['/path/to/script'])

        self.assertEqual(launcher.stdin, fake_stdin.buffer)
        self.assertEqual(launcher.stdout, fake_stdout)
        self.assertEqual(launcher.stderr, fake_stderr)
Пример #16
0
    def _make_launcher(self, *args):
        """Build a sandboxed launcher wired to a mock runner.

        The launcher is created with ``--no-conf``, an empty script path and
        any extra *args*. A Mock runner is stored as
        ``launcher.mock_runner``, and ``launcher.make_runner`` is replaced by
        a MagicMock so that ``launcher.make_runner().__enter__()`` returns
        ``launcher.mock_runner``.
        """
        launcher_args = ['--no-conf', '']
        launcher_args.extend(args)
        launcher = MRJobLauncher(args=launcher_args)
        launcher.sandbox()

        runner = Mock()
        runner.stream_output.return_value = [b'a line\n']
        launcher.mock_runner = runner

        runner_factory = MagicMock()  # include __enter__
        runner_factory.return_value.__enter__.return_value = runner
        launcher.make_runner = runner_factory

        return launcher
Пример #17
0
def _run(args):
    """Create an ``MRJobLauncher`` for *args* (with ``from_cl=True``) and
    run its job.
    """
    # imported here rather than at module level, as in the original
    from mrjob.launch import MRJobLauncher
    launcher = MRJobLauncher(args=args, from_cl=True)
    launcher.run_job()
Пример #18
0
 def test_emr_runner(self):
     """``-r emr`` should make the launcher build an EMRJobRunner."""
     emr_args = ["--no-conf", "-r", "emr", ""]
     launcher = MRJobLauncher(args=emr_args)
     with no_handlers_for_logger("mrjob"), \
             patch_fs_s3(), \
             launcher.make_runner() as emr_runner:
         self.assertIsInstance(emr_runner, EMRJobRunner)
Пример #19
0
 def test_local_runner(self):
     """``-r local`` should make the launcher build a LocalMRJobRunner."""
     local_args = ['--no-conf', '-r', 'local', '']
     launcher = MRJobLauncher(args=local_args)
     with no_handlers_for_logger('mrjob.runner'):
         with launcher.make_runner() as local_runner:
             self.assertIsInstance(local_runner, LocalMRJobRunner)
Пример #20
0
 def test_emr_runner(self):
     """The runner yielded by make_runner() for ``-r emr`` is an
     EMRJobRunner.
     """
     cli_args = ['--no-conf']
     cli_args += ['-r', 'emr', '']
     launcher = MRJobLauncher(args=cli_args)
     with no_handlers_for_logger('mrjob'):
         with patch_fs_s3():
             with launcher.make_runner() as created_runner:
                 self.assertIsInstance(created_runner, EMRJobRunner)
Пример #21
0
 def test_emr_runner(self):
     """Check that the ``emr`` runner alias produces an EMRJobRunner."""
     launcher = MRJobLauncher(args=['--no-conf', '-r', 'emr', ''])
     with no_handlers_for_logger('mrjob.runner'), \
             launcher.make_runner() as emr_runner:
         self.assertIsInstance(emr_runner, EMRJobRunner)
Пример #22
0
 def test_local_runner(self):
     """Check that the ``local`` runner alias produces a LocalMRJobRunner."""
     launcher = MRJobLauncher(args=["--no-conf", "-r", "local", ""])
     with no_handlers_for_logger("mrjob.runner"), \
             launcher.make_runner() as runner:
         self.assertIsInstance(runner, LocalMRJobRunner)