Пример #1
0
    def test_no_mapper(self):
        """Run MRNoMapper with ``-r dataproc`` on three kinds of input.

        The job is given STDIN (``-``), a local file and a ``gs://`` path.
        Output parts are registered through ``put_job_output_parts`` (fake
        output, not produced by a real job) and then streamed back and
        parsed with ``parse_output_line``.
        """
        # local file input
        local_input_path = os.path.join(self.tmp_dir, 'input')
        with open(local_input_path, 'wb') as handle:
            handle.write(b'one fish two fish\nred fish blue fish\n')

        # remote file input
        remote_input_path = 'gs://walrus/data/foo'
        self.put_gcs_multi({remote_input_path: b'foo\n'})

        # '-' asks the job to read from STDIN as well
        job_args = [
            '-r', 'dataproc', '-v', '-', local_input_path, remote_input_path]
        mr_job = MRNoMapper(job_args)
        mr_job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

        with mr_job.make_runner() as runner:
            runner.run()

            # setup fake output
            fake_parts = [
                b'1\t["blue", "one", "red", "two"]\n',
                b'4\t["fish"]\n',
            ]
            self.put_job_output_parts(runner, fake_parts)

            # unpack each parsed line into exactly (key, value)
            results = [
                (key, value)
                for key, value in map(mr_job.parse_output_line,
                                      runner.stream_output())
            ]

        expected = [(1, ['blue', 'one', 'red', 'two']), (4, ['fish'])]
        self.assertEqual(sorted(results), expected)
Пример #2
0
    def test_no_mapper(self):
        """Check MRNoMapper output when run with the ``dataproc`` runner.

        Inputs are STDIN, a file under ``self.tmp_dir`` and a ``gs://``
        object; the output read back is the fake output installed with
        ``put_job_output_parts``.
        """
        expected = [(1, ['blue', 'one', 'red', 'two']),
                    (4, ['fish'])]

        # read from STDIN, a local file, and a remote file
        stdin = BytesIO(b'foo\nbar\n')

        local_input_path = os.path.join(self.tmp_dir, 'input')
        with open(local_input_path, 'wb') as out:
            out.write(b'one fish two fish\nred fish blue fish\n')

        remote_input_path = 'gs://walrus/data/foo'
        gcs_objects = {remote_input_path: b'foo\n'}
        self.put_gcs_multi(gcs_objects)

        argv = ['-r', 'dataproc', '-v']
        argv += ['-', local_input_path, remote_input_path]

        mr_job = MRNoMapper(argv)
        mr_job.sandbox(stdin=stdin)

        with mr_job.make_runner() as runner:
            runner.run()

            # setup fake output
            self.put_job_output_parts(
                runner,
                [b'1\t["blue", "one", "red", "two"]\n', b'4\t["fish"]\n'])

            parsed = map(mr_job.parse_output_line, runner.stream_output())
            results = [(key, value) for key, value in parsed]

        self.assertEqual(sorted(results), expected)
Пример #3
0
    def test_no_mapper(self):
        """Exercise MRNoMapper on dataproc with STDIN, local and remote input.

        The output parts are faked via ``put_job_output_parts`` after the
        runner has run, then streamed and parsed into (key, value) pairs.
        """
        # local input file
        local_input_path = os.path.join(self.tmp_dir, "input")
        with open(local_input_path, "wb") as fobj:
            fobj.write(b"one fish two fish\nred fish blue fish\n")

        # remote input file
        remote_input_path = "gs://walrus/data/foo"
        self.put_gcs_multi({remote_input_path: b"foo\n"})

        # "-" stands for STDIN, which is supplied through sandbox()
        inputs = ["-", local_input_path, remote_input_path]
        mr_job = MRNoMapper(["-r", "dataproc", "-v"] + inputs)
        mr_job.sandbox(stdin=BytesIO(b"foo\nbar\n"))

        with mr_job.make_runner() as runner:
            runner.run()

            # setup fake output
            output_parts = [b'1\t["blue", "one", "red", "two"]\n', b'4\t["fish"]\n']
            self.put_job_output_parts(runner, output_parts)

            results = [
                (key, value)
                for key, value in map(mr_job.parse_output_line, runner.stream_output())
            ]

        expected = [(1, ["blue", "one", "red", "two"]), (4, ["fish"])]
        self.assertEqual(sorted(results), expected)
Пример #4
0
    def test_step_with_no_mapper(self):
        """Run MRNoMapper on the runner named by ``self.RUNNER``.

        Two lines of text are fed through STDIN and the parsed, sorted
        output is compared with the expected (key, value) pairs.
        """
        expected = [(1, ['blue', 'one', 'red', 'two']),
                    (4, ['fish'])]

        stdin = BytesIO(b'one fish two fish\nred fish blue fish\n')

        mr_job = MRNoMapper(['-r', self.RUNNER])
        mr_job.sandbox(stdin=stdin)

        with mr_job.make_runner() as runner:
            runner.run()

            results = []
            for line in runner.stream_output():
                results.append(mr_job.parse_output_line(line))
            results.sort()

            # the assertion stays inside the runner context, as in the original
            self.assertEqual(results, expected)