Пример #1
0
    def test_multi_step_counters(self):
        """Run MRCountingJob on the local runner with two lines of stdin
        and check that the runner reports three identical counter dicts,
        each with ``counter_name`` equal to 2."""
        job = MRCountingJob(['-r', 'local', '-'])
        job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

        # one entry per reported step; each one saw both input lines
        expected = [{'group': {'counter_name': 2}} for _ in range(3)]

        with job.make_runner() as runner:
            runner.run()
            self.assertEqual(runner.counters(), expected)
Пример #2
0
    def test_multi_step_counters(self):
        """Feed two lines through stdin, using the config file at
        ``self.mrjob_conf_path``, and verify the counters the runner
        reports after the job finishes."""
        job_args = ['-c', self.mrjob_conf_path, '-']
        mr_job = MRCountingJob(job_args)
        mr_job.sandbox(stdin=StringIO('foo\nbar\n'))

        # what every entry in the counters list is expected to look like
        per_step = {'group': {'counter_name': 2}}

        with mr_job.make_runner() as runner:
            runner.run()

            actual = runner.counters()
            self.assertEqual(actual, [per_step, per_step, per_step])
Пример #3
0
    def test_output_dir_and_step_output_dir(self):
        """Check step input/output URIs on the hadoop runner when both
        ``--output-dir`` and ``--step-output-dir`` are given.

        Expected wiring, as asserted below:

        * step 0 reads the two uploaded input files
        * steps 0 and 1 write to zero-padded subdirectories (``0000``,
          ``0001``) of the step output dir
        * each later step reads the previous step's output
        * the last step writes to the output dir itself
        """
        input1_path = self.makefile('input1')
        input2_path = self.makefile('input2')

        output_dir = 'hdfs:///tmp/output'
        # three slashes (empty authority), matching output_dir; with only
        # two, "tmp" would be parsed as the host rather than as a directory
        step_output_dir = 'hdfs:///tmp/step-output'

        # this has three steps, which lets us test step numbering
        job = MRCountingJob([
            '-r', 'hadoop',
            '--hadoop-bin', 'false',  # shouldn't run; just in case
            '--output-dir', output_dir,
            '--step-output-dir', step_output_dir,
            input1_path, input2_path])
        job.sandbox()

        with job.make_runner() as runner:
            self.assertEqual(runner._num_steps(), 3)

            self.add_files_for_upload(runner)

            input_uris_0 = runner._step_input_uris(0)
            self.assertEqual([os.path.basename(uri) for uri in input_uris_0],
                             ['input1', 'input2'])

            output_uri_0 = runner._step_output_uri(0)
            self.assertEqual(output_uri_0, step_output_dir + '/0000')

            input_uris_1 = runner._step_input_uris(1)
            self.assertEqual(input_uris_1, [output_uri_0])

            output_uri_1 = runner._step_output_uri(1)
            self.assertEqual(output_uri_1, step_output_dir + '/0001')

            input_uris_2 = runner._step_input_uris(2)
            self.assertEqual(input_uris_2, [output_uri_1])

            # the final step writes straight to --output-dir
            output_uri_2 = runner._step_output_uri(2)
            self.assertEqual(output_uri_2, output_dir)
Пример #4
0
    def test_local_output_dir_and_step_output_dir(self):
        """Check step input/output URIs on the local runner when both
        ``--output-dir`` and ``--step-output-dir`` point at local
        directories.

        Inputs of the first step must be ``file:///`` URIs; intermediate
        steps write to numbered subdirectories of the step output dir and
        the last step writes to the output dir.
        """
        source_paths = [self.makefile('input1'), self.makefile('input2')]

        final_dir = self.makedirs('output')
        intermediate_dir = self.makedirs('step_output')

        # this has three steps, which lets us test step numbering
        job_args = ['-r', 'local',
                    '--output-dir', final_dir,
                    '--step-output-dir', intermediate_dir]
        job_args.extend(source_paths)

        job = MRCountingJob(job_args)
        job.sandbox()

        with job.make_runner() as runner:
            self.assertEqual(runner._num_steps(), 3)

            # step 0: reads the original input files, as file:/// URIs
            first_inputs = runner._step_input_uris(0)
            basenames = [os.path.basename(uri) for uri in first_inputs]
            schemes = [uri[:8] for uri in first_inputs]
            self.assertEqual(basenames, ['input1', 'input2'])
            self.assertEqual(schemes, ['file:///', 'file:///'])

            first_output = runner._step_output_uri(0)
            expected_first = to_uri(os.path.join(intermediate_dir, '0000'))
            self.assertEqual(first_output, expected_first)

            # step 1: reads step 0's output
            second_inputs = runner._step_input_uris(1)
            self.assertEqual(second_inputs, [first_output])

            second_output = runner._step_output_uri(1)
            expected_second = to_uri(os.path.join(intermediate_dir, '0001'))
            self.assertEqual(second_output, expected_second)

            # step 2: reads step 1's output, writes to the final output dir
            third_inputs = runner._step_input_uris(2)
            self.assertEqual(third_inputs, [second_output])

            third_output = runner._step_output_uri(2)
            self.assertEqual(third_output, to_uri(final_dir))
Пример #5
0
    def test_multi_step_counters(self):
        """Run MRCountingJob on two lines of stdin and check the counters
        recorded on the runner: three entries, each with ``counter_name``
        equal to 2."""
        # the only input is two lines read from STDIN
        stdin = StringIO('foo\nbar\n')

        mr_job = MRCountingJob(['-c', self.mrjob_conf_path, '-'])
        mr_job.sandbox(stdin=stdin)

        with mr_job.make_runner() as runner:
            runner.run()

            # Consume and parse the output so a broken output stream still
            # fails the test; the parsed values themselves are not checked.
            for line in runner.stream_output():
                mr_job.parse_output_line(line)

            expected_step = {'group': {'counter_name': 2}}
            assert_equal(runner._counters,
                         [expected_step, expected_step, expected_step])
Пример #6
0
    def test_gz_split_regression(self):
        """Regression test: two gzipped inputs and one plain input must be
        assigned three distinct ``task_num`` values by the local runner's
        ``_get_file_splits()``."""
        gz_path_1 = os.path.join(self.tmp_dir, '1.gz')
        gz_path_2 = os.path.join(self.tmp_dir, '2.gz')
        path_3 = os.path.join(self.tmp_dir, '3')

        # context managers close each gzip stream even if the write raises
        with gzip.GzipFile(gz_path_1, 'wb') as input_gz_1:
            input_gz_1.write(b'x\n')

        with gzip.GzipFile(gz_path_2, 'wb') as input_gz_2:
            input_gz_2.write(b'y\n')

        with open(path_3, 'wb') as f:
            f.write(b'z')

        input_paths = [gz_path_1, gz_path_2, path_3]

        mr_job = MRCountingJob(['--no-conf', '-r', 'local'] + input_paths)
        with mr_job.make_runner() as r:
            # NOTE(review): the second argument is presumably the requested
            # number of splits -- confirm against _get_file_splits()
            splits = r._get_file_splits(input_paths, 1)

            task_nums = {s['task_num'] for s in splits.values()}
            self.assertEqual(len(task_nums), 3)