예제 #1
0
    def test_uniq_combiner(self):
        # a gzipped input file forces a single map task
        gz_path = join(self.tmp_dir, 'data.gz')
        with gzip.open(gz_path, 'wb') as gz_file:
            gz_file.write(b'x\nx\nx\nx\nx\nx\n')

        # only the combiner is a command; the mapper stays a script step
        expected_steps = [{
            'type': 'streaming',
            'mapper': {'type': 'script'},
            'combiner': {'type': 'command', 'command': 'uniq'},
        }]

        job = MRCmdJob(['--combiner-cmd=uniq', '--runner=local', gz_path])
        job.sandbox()

        with job.make_runner() as runner:
            self.assertEqual(runner._get_steps(), expected_steps)

            runner.run()

            # one map task means one combiner, so the six identical
            # input lines should come out as a single value
            output = b''.join(runner.cat_output())
            self.assertEqual(output, b'x\n')
예제 #2
0
    def test_multiple(self):
        # mapper, combiner and reducer are all shell commands here
        mapper_cmd = 'cat -e'
        reducer_cmd = _bash_wrap('wc -l | tr -Cd "[:digit:]"')

        args = [
            '--runner', 'local',
            '--mapper-cmd', mapper_cmd,
            '--combiner-cmd', 'uniq',
            '--reducer-cmd', reducer_cmd,
        ]
        expected_steps = [{
            'type': 'streaming',
            'mapper': {'type': 'command', 'command': mapper_cmd},
            'combiner': {'type': 'command', 'command': 'uniq'},
            'reducer': {'type': 'command', 'command': reducer_cmd},
        }]

        job = MRCmdJob(args)
        # input arrives on stdin rather than from a file
        job.sandbox(stdin=BytesIO(b'x\nx\nx\nx\nx\nx\n'))

        with job.make_runner() as runner:
            self.assertEqual(runner._get_steps(), expected_steps)

            runner.run()

            # the reducer's line count is the only thing in the output
            streamed = list(runner.stream_output())
            self.assertEqual(streamed, [b'2'])
예제 #3
0
파일: test_local.py 프로젝트: Affirm/mrjob
    def test_multiple(self):
        # a gzipped input file forces a single map task
        gz_path = join(self.tmp_dir, 'data.gz')
        with gzip.open(gz_path, 'wb') as gz_file:
            gz_file.write(b'x\nx\nx\nx\nx\nx\n')

        mapper_cmd = 'cat -e'
        combiner_cmd = 'uniq'
        reducer_cmd = '/bin/sh -c \'wc -l | tr -Cd "[:digit:]"\''

        # every phase of the single step is expected to be a command
        expected_steps = [{
            'type': 'streaming',
            'mapper': {'type': 'command', 'command': mapper_cmd},
            'combiner': {'type': 'command', 'command': combiner_cmd},
            'reducer': {'type': 'command', 'command': reducer_cmd},
        }]

        args = ['--runner', 'local']
        args += ['--mapper-cmd', mapper_cmd]
        args += ['--combiner-cmd', combiner_cmd]
        args += ['--reducer-cmd', reducer_cmd]
        args.append(gz_path)

        job = MRCmdJob(args)
        job.sandbox()

        with job.make_runner() as runner:
            self.assertEqual(runner._get_steps(), expected_steps)

            runner.run()

            # add up the integer values parsed from the job's output
            total = sum(
                int(value)
                for _, value in job.parse_output(runner.cat_output()))
            self.assertEqual(total, 1)
예제 #4
0
파일: test_local.py 프로젝트: Affirm/mrjob
    def test_uniq_combiner(self):
        # a gzipped input file forces a single map task
        gz_path = join(self.tmp_dir, 'data.gz')
        with gzip.open(gz_path, 'wb') as gz_file:
            gz_file.write(b'x\nx\nx\nx\nx\nx\n')

        job = MRCmdJob(['--combiner-cmd=uniq', '--runner=local', gz_path])
        job.sandbox()

        # the mapper remains a script step; only the combiner is a command
        script_mapper = {'type': 'script'}
        uniq_combiner = {'type': 'command', 'command': 'uniq'}
        expected_steps = [{
            'type': 'streaming',
            'mapper': script_mapper,
            'combiner': uniq_combiner,
        }]

        with job.make_runner() as runner:
            self.assertEqual(runner._get_steps(), expected_steps)

            runner.run()

            # a single map task implies a single combiner, which should
            # leave exactly one value in the output
            combined = b''.join(runner.cat_output())
            self.assertEqual(combined, b'x\n')
예제 #5
0
 def test_multiple_2(self):
     # run with --no-conf, a `cat` mapper and `wc -l` passed through
     # the job's --reducer-cmd-2 switch
     args = ['--mapper-cmd=cat', '--reducer-cmd-2', 'wc -l',
             '--runner=local', '--no-conf']
     job = MRCmdJob(args)
     job.sandbox(stdin=BytesIO(b'x\ny\nz\n'))

     with job.make_runner() as runner:
         runner.run()

         # each output line is parsed as an int; together they should
         # add up to the three input lines
         counts = [int(line) for line in to_lines(runner.cat_output())]
         self.assertEqual(sum(counts), 3)
예제 #6
0
파일: test_local.py 프로젝트: Affirm/mrjob
 def test_multiple_2(self):
     # three lines of input, fed to the job on stdin
     stdin = BytesIO(b'x\ny\nz\n')

     job = MRCmdJob([
         '--mapper-cmd=cat',
         '--reducer-cmd-2', 'wc -l',
         '--runner=local',
         '--no-conf',
     ])
     job.sandbox(stdin=stdin)

     with job.make_runner() as runner:
         runner.run()

         # the integers on the output lines should sum to the number
         # of input lines
         total = 0
         for line in to_lines(runner.cat_output()):
             total += int(line)
         self.assertEqual(total, 3)
예제 #7
0
    def test_cat_mapper(self):
        data = b'x\ny\nz\n'

        # a lone command mapper, with no combiner or reducer
        expected_steps = [{
            'type': 'streaming',
            'mapper': {'type': 'command', 'command': 'cat'},
        }]

        job = MRCmdJob(['--mapper-cmd=cat', '--runner=local'])
        job.sandbox(stdin=BytesIO(data))

        with job.make_runner() as runner:
            self.assertEqual(runner._get_steps(), expected_steps)

            runner.run()

            # compare sorted so the check does not depend on output order
            stripped = sorted(
                line.strip() for line in runner.stream_output())
            self.assertEqual(stripped, sorted(data.split()))
예제 #8
0
파일: test_local.py 프로젝트: Affirm/mrjob
    def test_cat_mapper(self):
        data = b'x\ny\nz\n'

        job = MRCmdJob(['--mapper-cmd=cat', '--runner=local'])
        job.sandbox(stdin=BytesIO(data))

        # the step should consist of nothing but the `cat` command mapper
        cat_mapper = {'type': 'command', 'command': 'cat'}
        expected_steps = [{'type': 'streaming', 'mapper': cat_mapper}]

        with job.make_runner() as runner:
            self.assertEqual(runner._get_steps(), expected_steps)

            runner.run()

            # sort both sides: only the set of lines matters, not order
            seen = []
            for line in to_lines(runner.cat_output()):
                seen.append(line.strip())
            self.assertEqual(sorted(seen), sorted(data.split()))
예제 #9
0
    def test_command_streaming_step_without_mr_job_script(self):
        # you don't need a script to run commands: the runner is built
        # straight from the step descriptions, with no MRJob script
        steps = MRCmdJob(['--mapper-cmd', 'cat'])._steps_desc()

        runner = LocalMRJobRunner(steps=steps, stdin=BytesIO(b'dog\n'))

        # clean up even when run() raises, so that a failing run does not
        # leave behind whatever cleanup() is responsible for releasing
        try:
            runner.run()
        finally:
            runner.cleanup()
예제 #10
0
    def test_cat_reducer(self):
        # script mapper followed by a `cat -e` command reducer
        expected_steps = [{
            'type': 'streaming',
            'mapper': {'type': 'script'},
            'reducer': {'type': 'command', 'command': 'cat -e'},
        }]

        job = MRCmdJob(['--reducer-cmd', 'cat -e', '--runner=local'])
        job.sandbox(stdin=BytesIO(b'x\ny\nz\n'))

        with job.make_runner() as runner:
            self.assertEqual(runner._get_steps(), expected_steps)

            runner.run()

            # each input line is expected back with a `$` before its
            # newline; sort so output order does not matter
            self.assertEqual(
                sorted(runner.stream_output()),
                [b'x$\n', b'y$\n', b'z$\n'])
예제 #11
0
파일: test_local.py 프로젝트: Affirm/mrjob
    def test_cat_reducer(self):
        stdin = BytesIO(b'x\ny\nz\n')

        job = MRCmdJob(['--reducer-cmd', 'cat -e', '--runner=local'])
        job.sandbox(stdin=stdin)

        # the mapper stays a script step; the reducer is the command
        script_mapper = {'type': 'script'}
        cat_reducer = {'type': 'command', 'command': 'cat -e'}
        expected_steps = [{
            'type': 'streaming',
            'mapper': script_mapper,
            'reducer': cat_reducer,
        }]

        with job.make_runner() as runner:
            self.assertEqual(runner._get_steps(), expected_steps)

            runner.run()

            # sort the output lines so the comparison ignores their order
            out_lines = sorted(to_lines(runner.cat_output()))
            self.assertEqual(out_lines, [b'x$\n', b'y$\n', b'z$\n'])
예제 #12
0
    def test_multiple(self):
        # a gzipped input file forces a single map task
        gz_path = join(self.tmp_dir, 'data.gz')
        with gzip.open(gz_path, 'wb') as gz_file:
            gz_file.write(b'x\nx\nx\nx\nx\nx\n')

        mapper_cmd = 'cat -e'
        combiner_cmd = 'uniq'
        reducer_cmd = _bash_wrap('wc -l | tr -Cd "[:digit:]"')

        # mapper, combiner and reducer should all show up as commands
        expected_steps = [{
            'type': 'streaming',
            'mapper': {'type': 'command', 'command': mapper_cmd},
            'combiner': {'type': 'command', 'command': combiner_cmd},
            'reducer': {'type': 'command', 'command': reducer_cmd},
        }]

        job = MRCmdJob([
            '--runner', 'local',
            '--mapper-cmd', mapper_cmd,
            '--combiner-cmd', combiner_cmd,
            '--reducer-cmd', reducer_cmd,
            gz_path,
        ])
        job.sandbox()

        with job.make_runner() as runner:
            self.assertEqual(runner._get_steps(), expected_steps)

            runner.run()

            # add up the integer values parsed from the job's output
            parsed = job.parse_output(runner.cat_output())
            total = sum(int(value) for _, value in parsed)
            self.assertEqual(total, 1)
예제 #13
0
    def test_uniq_combiner(self):
        # script mapper plus a `uniq` command combiner
        expected_steps = [{
            'type': 'streaming',
            'mapper': {'type': 'script'},
            'combiner': {'type': 'command', 'command': 'uniq'},
        }]

        job = MRCmdJob(['--combiner-cmd=uniq', '--runner=local'])
        job.sandbox(stdin=BytesIO(b'x\nx\nx\nx\nx\nx\n'))

        with job.make_runner() as runner:
            self.assertEqual(runner._get_steps(), expected_steps)

            runner.run()

            # there are 2 map tasks, each with its own combiner, and all
            # rows are identical, so exactly 2 values should survive
            streamed = b''.join(runner.stream_output())
            self.assertEqual(streamed, b'x\nx\n')
예제 #14
0
    def test_passthrough_options(self):
        # constructing the job with --help should request exit status 0
        # (self.exit is presumably a patched exit hook -- confirm in setUp)
        MRCmdJob(['--help'])
        self.exit.assert_called_once_with(0)

        # the captured help text should list the --reducer-cmd-2 switch
        self.assertIn('--reducer-cmd-2', self.stdout.getvalue())
예제 #15
0
파일: test_inline.py 프로젝트: qui/mrjob
    def test_no_command_steps(self):
        # building an inline runner for a job with a command mapper
        # is expected to fail with NotImplementedError
        cmd_job = MRCmdJob(['-r', 'inline', '--mapper-cmd', 'cat'])
        cmd_job.sandbox()

        with self.assertRaises(NotImplementedError):
            cmd_job.make_runner()
예제 #16
0
파일: test_inline.py 프로젝트: Yelp/mrjob
    def test_no_command_steps(self):
        # a command step combined with `-r inline` must be refused
        # when the runner is created
        args = ['-r', 'inline', '--mapper-cmd', 'cat']
        job = MRCmdJob(args)
        job.sandbox()

        with self.assertRaises(NotImplementedError):
            job.make_runner()