示例#1
0
    def test_doesnt_actually_create_archive(self):
        """Asking for a dir's archive path must not create the archive."""
        source_dir = self.makedirs('archive')
        inline_runner = InlineMRJobRunner()

        path = inline_runner._dir_archive_path(source_dir)

        # only the path was requested; nothing should exist there yet
        self.assertFalse(os.path.exists(path))
示例#2
0
    def test_auto_owner(self):
        """With no owner given, the job name should pick up $USER."""
        # USER has to be "mcp" for the second assertion below to hold; a
        # masked placeholder value could never match it.
        # NOTE(review): os.environ is modified in place -- presumably the
        # surrounding fixture restores it; confirm.
        os.environ["USER"] = "mcp"
        runner = InlineMRJobRunner(conf_paths=[])
        match = JOB_NAME_RE.match(runner.get_job_name())

        self.assertEqual(match.group(1), "no_script")
        self.assertEqual(match.group(2), "mcp")
示例#3
0
    def test_empty_no_user(self):
        """Job name falls back to "no_user" when getuser is flagged to fail."""
        # flag read elsewhere in the fixture (not visible in this snippet)
        self.getuser_should_fail = True

        job_name = InlineMRJobRunner(conf_paths=[]).get_job_name()
        parsed = JOB_NAME_RE.match(job_name)

        self.assertEqual(parsed.group(1), "no_script")
        self.assertEqual(parsed.group(2), "no_user")
示例#4
0
    def test_owner_and_label_kwargs(self):
        """Explicit owner/label kwargs show up in the generated job name."""
        kwargs = dict(conf_path=False, owner='ads', label='ads_chain')
        inline_runner = InlineMRJobRunner(**kwargs)

        parsed = JOB_NAME_RE.match(inline_runner.get_job_name())

        # group 1 carries the label, group 2 the owner
        self.assertEqual(parsed.group(1), 'ads_chain')
        self.assertEqual(parsed.group(2), 'ads')
示例#5
0
文件: test_conf.py 项目: icio/mrjob
 def test_getattr_forward(self):
     """get_default_opts() on the runner matches the option store's defaults."""
     with no_handlers_for_logger():
         inline_runner = InlineMRJobRunner(conf_path=False)

     option_store = inline_runner._opts
     self.assertIsInstance(option_store, InlineRunnerOptionStore)

     # call through the runner first, then compare with the store directly
     from_runner = inline_runner.get_default_opts()
     self.assertEqual(from_runner, option_store.default_options())
示例#6
0
    def test_auto_owner(self):
        """With no owner given, the job name should pick up $USER."""
        # USER has to be 'mcp' for the second assertion below to hold; a
        # masked placeholder value could never match it.
        # NOTE(review): os.environ is modified in place -- presumably the
        # surrounding fixture restores it; confirm.
        os.environ['USER'] = 'mcp'
        runner = InlineMRJobRunner(conf_path=False)
        match = JOB_NAME_RE.match(runner.get_job_name())

        self.assertEqual(match.group(1), 'no_script')
        self.assertEqual(match.group(2), 'mcp')
示例#7
0
    def test_trailing_slash(self):
        """A trailing path separator doesn't change the archive's name."""
        dir_with_sep = self.makedirs('archive') + os.sep
        inline_runner = InlineMRJobRunner()

        path = inline_runner._dir_archive_path(dir_with_sep)
        archive_name = os.path.basename(path)

        self.assertEqual(archive_name, 'archive.tar.gz')
示例#8
0
    def test_auto_owner(self):
        """With no owner given, the job key should pick up $USER."""
        # USER has to be 'mcp' for the second assertion below to hold; a
        # masked placeholder value could never match it.
        # NOTE(review): os.environ is modified in place -- presumably the
        # surrounding fixture restores it; confirm.
        os.environ['USER'] = 'mcp'
        runner = InlineMRJobRunner(conf_paths=[])
        match = _JOB_KEY_RE.match(runner.get_job_key())

        self.assertEqual(match.group(1), 'no_script')
        self.assertEqual(match.group(2), 'mcp')
示例#9
0
    def test_owner_and_label_kwargs(self):
        """Explicit owner/label kwargs show up in the generated job key."""
        kwargs = dict(conf_paths=[], owner='ads', label='ads_chain')
        inline_runner = InlineMRJobRunner(**kwargs)

        parsed = _JOB_KEY_RE.match(inline_runner.get_job_key())

        # group 1 carries the label, group 2 the owner
        self.assertEqual(parsed.group(1), 'ads_chain')
        self.assertEqual(parsed.group(2), 'ads')
示例#10
0
    def test_stream_output(self):
        """stream_output() yields the part files but not _logs/_SUCCESS."""
        for subdir in ("a", "b", "_logs"):
            os.mkdir(os.path.join(self.tmp_dir, subdir))

        # (path components under tmp_dir, file contents), in creation order
        fixtures = [
            (("a", "part-00000"), "A"),
            (("b", "part-00001"), "B"),
            (("part-00002",), "C"),
            (("_logs", "log.xml"), "<XML XML XML/>"),
            (("_SUCCESS",), "I win"),
        ]
        for components, text in fixtures:
            with open(os.path.join(self.tmp_dir, *components), "w") as f:
                f.write(text)

        inline_runner = InlineMRJobRunner(
            conf_paths=[], output_dir=self.tmp_dir)
        streamed = sorted(inline_runner.stream_output())

        self.assertEqual(streamed, ["A", "B", "C"])
示例#11
0
    def test_empty_no_user(self):
        """Job key falls back to 'no_user' when getuser is flagged to fail."""
        # flag read elsewhere in the fixture (not visible in this snippet)
        self.getuser_should_fail = True

        job_key = InlineMRJobRunner(conf_paths=[]).get_job_key()
        parsed = _JOB_KEY_RE.match(job_key)

        self.assertEqual(parsed.group(1), 'no_script')
        self.assertEqual(parsed.group(2), 'no_user')
示例#12
0
    def test_stream_output(self):
        """stream_output() yields the part files but not _logs/_SUCCESS."""
        for subdir in ('a', 'b', '_logs'):
            os.mkdir(os.path.join(self.tmp_dir, subdir))

        # (path components under tmp_dir, file contents), in creation order
        fixtures = [
            (('a', 'part-00000'), 'A'),
            (('b', 'part-00001'), 'B'),
            (('part-00002',), 'C'),
            (('_logs', 'log.xml'), '<XML XML XML/>'),
            (('_SUCCESS',), 'I win'),
        ]
        for components, text in fixtures:
            with open(os.path.join(self.tmp_dir, *components), 'w') as f:
                f.write(text)

        inline_runner = InlineMRJobRunner(
            conf_paths=[], output_dir=self.tmp_dir)
        streamed = sorted(inline_runner.stream_output())

        # files were written as text, but output comes back as bytes
        self.assertEqual(streamed, [b'A', b'B', b'C'])
示例#13
0
    def test_same_dir_twice(self):
        """Asking twice for the same dir returns the same archive path."""
        source_dir = self.makedirs('archive')
        inline_runner = InlineMRJobRunner()

        first = inline_runner._dir_archive_path(source_dir)
        second = inline_runner._dir_archive_path(source_dir)

        self.assertEqual(os.path.basename(first), 'archive.tar.gz')
        self.assertEqual(first, second)
示例#14
0
class UpdateJobConfForHadoopVersionTestCase(TestCase):
    """Tests for the runner's _update_jobconf_for_hadoop_version()."""

    # deliberately mixes Hadoop 1 and Hadoop 2 variable names
    JOBCONF = {
        'foo.bar': 'baz',                   # unknown jobconf
        'mapred.jar': 'a.jar',              # Hadoop 1 jobconf
        'mapreduce.job.user.name': 'dave',  # Hadoop 2 jobconf
    }

    def setUp(self):
        self.runner = InlineMRJobRunner(conf_paths=[])

    def updated_and_warnings(self, jobconf, hadoop_version):
        """Run the update on a copy of *jobconf*.

        Returns a pair: the (possibly modified) copy, and whatever was
        written to the stream attached to the 'mrjob.runner' logger.
        """
        working_copy = jobconf.copy()
        captured = StringIO()

        with no_handlers_for_logger('mrjob.runner'):
            log_to_stream('mrjob.runner', captured)
            self.runner._update_jobconf_for_hadoop_version(
                working_copy, hadoop_version)

        return working_copy, captured.getvalue()

    def test_no_version(self):
        result, logged = self.updated_and_warnings(self.JOBCONF, None)

        # nothing changes and nothing is logged
        self.assertEqual(result, self.JOBCONF)
        self.assertEqual(logged, '')

    def test_hadoop_1(self):
        result, logged = self.updated_and_warnings(self.JOBCONF, '1.0')

        # the Hadoop 2 variable gains its Hadoop 1 equivalent
        expected = combine_dicts(self.JOBCONF, {'user.name': 'dave'})
        self.assertEqual(result, expected)
        self.assertIn('do not match hadoop version', logged)
        self.assertIn('mapreduce.job.user.name: user.name', logged)

    def test_hadoop_2(self):
        result, logged = self.updated_and_warnings(self.JOBCONF, '2.0')

        # the Hadoop 1 variable gains its Hadoop 2 equivalent
        expected = combine_dicts(self.JOBCONF,
                                 {'mapreduce.job.jar': 'a.jar'})
        self.assertEqual(result, expected)
        self.assertIn('do not match hadoop version', logged)
        self.assertIn('mapred.jar: mapreduce.job.jar', logged)

    def test_dont_overwrite(self):
        # both spellings of the same variable are already present
        both_versions = {'mapred.jar': 'a.jar', 'mapreduce.job.jar': 'b.jar'}

        result, logged = self.updated_and_warnings(both_versions, '1.0')

        self.assertEqual(result, both_versions)
        self.assertEqual(logged, '')
示例#15
0
    def test_mrjob_zip_compiles(self):
        """Every module extracted from the mrjob zip should byte-compile."""
        runner = InlineMRJobRunner()
        with no_handlers_for_logger('mrjob.runner'):
            mrjob_zip = runner._create_mrjob_zip()

        # close the archive deterministically rather than leaking the handle
        with ZipFile(mrjob_zip) as zip_file:
            zip_file.extractall(self.tmp_dir)

        self.assertTrue(
            compileall.compile_dir(os.path.join(self.tmp_dir, 'mrjob'),
                                   quiet=1))
示例#16
0
    def test_output_dir_not_considered_hidden(self):
        """Underscore-prefixed output dirs are still read as output."""
        hidden_looking_dir = os.path.join(
            self.tmp_dir, '_hidden', '_output_dir')
        part_path = os.path.join(hidden_looking_dir, 'part-00000')
        self.makefile(part_path, b'cats\n')

        inline_runner = InlineMRJobRunner(
            conf_paths=[], output_dir=hidden_looking_dir)
        lines = sorted(to_lines(inline_runner.stream_output()))

        self.assertEqual(lines, [b'cats\n'])
示例#17
0
    def test_dirs_with_same_name(self):
        """Two dirs with the same basename get distinct archive paths."""
        foo_dir = self.makedirs(os.path.join('foo', 'archive'))
        bar_dir = self.makedirs(os.path.join('bar', 'archive'))
        inline_runner = InlineMRJobRunner()

        foo_path = inline_runner._dir_archive_path(foo_dir)
        bar_path = inline_runner._dir_archive_path(bar_dir)

        # the first dir registered gets the plain name
        self.assertEqual(os.path.basename(foo_path), 'archive.tar.gz')
        self.assertNotEqual(foo_path, bar_path)
示例#18
0
    def test_empty_dir(self):
        """Archiving an empty directory produces an archive with no names."""
        inline_runner = InlineMRJobRunner()
        source_dir = self.makedirs('empty')

        archive_path = inline_runner._dir_archive_path(source_dir)
        self.assertEqual(os.path.basename(archive_path), 'empty.tar.gz')

        inline_runner._create_dir_archive(source_dir)

        with tarfile.open(archive_path, 'r:gz') as archive:
            member_names = sorted(archive.getnames())
            self.assertEqual(member_names, [])
示例#19
0
    def test_archive(self):
        """The created archive contains the expected relative paths."""
        inline_runner = InlineMRJobRunner()
        source_dir = self._to_archive

        archive_path = inline_runner._dir_archive_path(source_dir)
        self.assertEqual(os.path.basename(archive_path), 'archive.tar.gz')

        inline_runner._create_dir_archive(source_dir)

        expected_names = [os.path.join('bar', 'baz'), 'foo']
        archive = tarfile.open(archive_path, 'r:gz')
        try:
            self.assertEqual(sorted(archive.getnames()), expected_names)
        finally:
            # always release the handle, even if the assertion fails
            archive.close()
示例#20
0
    def test_deprecated_stream_output(self):
        """stream_output() yields lines per file and warns exactly once."""
        self.makefile('part-00000', contents=b'1\n2')
        self.makefile('part-00001', contents=b'3\n4\n')

        inline_runner = InlineMRJobRunner(
            conf_paths=[], output_dir=self.tmp_dir)
        mock_log = self.start(patch('mrjob.runner.log'))

        # output is split into lines, but b'2' and b'3\n' are not joined
        # across the file boundary
        lines = sorted(inline_runner.stream_output())
        self.assertEqual(lines, [b'1\n', b'2', b'3\n', b'4\n'])

        # one warning is expected (the deprecation notice)
        self.assertEqual(mock_log.warning.call_count, 1)
示例#21
0
class UpdateJobConfForHadoopVersionTestCase(TestCase):
    """Tests for the runner's _update_jobconf_for_hadoop_version()."""

    # deliberately mixes Hadoop 1 and Hadoop 2 variable names
    JOBCONF = {
        "foo.bar": "baz",  # unknown jobconf
        "mapred.jar": "a.jar",  # Hadoop 1 jobconf
        "mapreduce.job.user.name": "dave",  # Hadoop 2 jobconf
    }

    def setUp(self):
        self.runner = InlineMRJobRunner(conf_paths=[])

    def updated_and_warnings(self, jobconf, hadoop_version):
        """Run the update on a copy of *jobconf*.

        Returns a pair: the (possibly modified) copy, and whatever was
        written to the stream attached to the "mrjob.runner" logger.
        """
        working_copy = jobconf.copy()
        captured = StringIO()

        with no_handlers_for_logger("mrjob.runner"):
            log_to_stream("mrjob.runner", captured)
            self.runner._update_jobconf_for_hadoop_version(
                working_copy, hadoop_version
            )

        return working_copy, captured.getvalue()

    def test_no_version(self):
        result, logged = self.updated_and_warnings(self.JOBCONF, None)

        # nothing changes and nothing is logged
        self.assertEqual(result, self.JOBCONF)
        self.assertEqual(logged, "")

    def test_hadoop_1(self):
        result, logged = self.updated_and_warnings(self.JOBCONF, "1.0")

        # the Hadoop 2 variable gains its Hadoop 1 equivalent
        expected = combine_dicts(self.JOBCONF, {"user.name": "dave"})
        self.assertEqual(result, expected)
        self.assertIn("do not match hadoop version", logged)
        self.assertIn("mapreduce.job.user.name: user.name", logged)

    def test_hadoop_2(self):
        result, logged = self.updated_and_warnings(self.JOBCONF, "2.0")

        # the Hadoop 1 variable gains its Hadoop 2 equivalent
        expected = combine_dicts(self.JOBCONF, {"mapreduce.job.jar": "a.jar"})
        self.assertEqual(result, expected)
        self.assertIn("do not match hadoop version", logged)
        self.assertIn("mapred.jar: mapreduce.job.jar", logged)

    def test_dont_overwrite(self):
        # both spellings of the same variable are already present
        both_versions = {"mapred.jar": "a.jar", "mapreduce.job.jar": "b.jar"}

        result, logged = self.updated_and_warnings(both_versions, "1.0")

        self.assertEqual(result, both_versions)
        self.assertEqual(logged, "")
示例#22
0
    def setUp(self):
        """Create an empty job_output dir and a runner that reads from it."""
        super(TestCatOutput, self).setUp()

        job_output = os.path.join(self.tmp_dir, 'job_output')
        os.mkdir(job_output)
        self.output_dir = job_output

        self.runner = InlineMRJobRunner(conf_paths=[],
                                        output_dir=self.output_dir)
示例#23
0
    def test_no_script_and_no_steps(self):
        """A runner with neither script nor steps fails on run()."""
        inline_runner = InlineMRJobRunner()

        # nothing was supplied, so both start out empty
        self.assertEqual(inline_runner._script_path, None)
        self.assertEqual(inline_runner._steps, [])

        self.assertRaises(ValueError, inline_runner.run)

        # the failure is raised, not merely logged as a warning
        self.assertFalse(self.log.warning.called)
示例#24
0
    def _test_cleanup_after_with_statement(self, mode, should_exist):
        """Check whether the local tmp dir survives leaving a with: block.

        *mode* is passed to the runner as its ``cleanup`` option;
        *should_exist* is the expected existence of the dir afterwards.
        """
        with InlineMRJobRunner(cleanup=mode, conf_paths=[]) as runner:
            tmp_dir = runner._get_local_tmp_dir()
            # while the runner is still open, the dir must be there
            self.assertTrue(os.path.exists(tmp_dir))

        # exiting the with: block is what triggers cleanup
        still_there = os.path.exists(tmp_dir)
        self.assertEqual(still_there, should_exist)
示例#25
0
    def test_option_debug_printout(self):
        """Constructing a runner logs its options at debug level."""
        mock_log = self.start(patch('mrjob.runner.log'))

        InlineMRJobRunner(owner='dave')

        # first positional arg of every log.debug() call, one per line
        messages = [args[0] + '\n'
                    for args, _kwargs in mock_log.debug.call_args_list]
        debug_text = ''.join(messages)

        self.assertIn("'owner'", debug_text)
        self.assertIn("'dave'", debug_text)
示例#26
0
    def test_extra_kwargs_passed_in_directly_okay(self):
        """Unknown keyword args are tolerated and kept out of the opts."""
        runner = InlineMRJobRunner(
            foo='bar',
            local_tmp_dir='/var/tmp',
            conf_paths=[],
        )

        self.assertEqual(runner._opts['local_tmp_dir'], '/var/tmp')
        # membership on the opts is checked against the option *name*
        # ('foo'); testing for the value 'bar' would pass no matter what
        self.assertNotIn('foo', runner._opts)
示例#27
0
    def test_multiple_configs_via_runner_args(self):
        """jobconf from several conf_paths is merged, later paths winning."""
        left_path = self.save_conf('left.conf', self.BASE_CONFIG_LEFT)
        right_path = self.save_conf('right.conf', self.BASE_CONFIG_RIGHT)

        inline_runner = InlineMRJobRunner(conf_paths=[left_path, right_path])

        expected_jobconf = {
            'from_left': 'one',
            'from_both': 'two',
            'from_right': 'two',
        }
        self.assertEqual(inline_runner._opts['jobconf'], expected_jobconf)
示例#28
0
    def test_create_mrjob_tar_gz(self):
        """The mrjob tarball holds only mrjob/ paths, including job.py."""
        with InlineMRJobRunner(conf_paths=[]) as runner:
            mrjob_tar_gz_path = runner._create_mrjob_tar_gz()

            # read the member list inside a with: block so the archive
            # handle is closed instead of leaked
            with tarfile.open(mrjob_tar_gz_path) as mrjob_tar_gz:
                contents = mrjob_tar_gz.getnames()

            for path in contents:
                self.assertEqual(path[:6], 'mrjob/')

            self.assertIn('mrjob/job.py', contents)
示例#29
0
    def test_option_debug_printout(self):
        """Constructing a runner logs its options at debug level."""
        captured = StringIO()

        with no_handlers_for_logger():
            log_to_stream('mrjob.runner', captured, debug=True)

            InlineMRJobRunner(owner='dave')

        logged = captured.getvalue()
        self.assertIn("'owner'", logged)
        self.assertIn("'dave'", logged)
示例#30
0
    def test_empty_runner_error(self):
        """A conf with nothing for the inline runner logs a single notice."""
        # note the top-level key is 'runner', and only 'local' is configured
        conf = {'runner': {'local': {'base_tmp_dir': '/tmp'}}}
        conf_path = self.save_conf('basic', conf)

        captured = StringIO()
        with no_handlers_for_logger():
            log_to_stream('mrjob.runner', captured)
            InlineMRJobRunner(conf_paths=[conf_path])
            logged = captured.getvalue()
            self.assertEqual("No configs specified for inline runner\n",
                             logged)
示例#31
0
    def test_file_uris_only(self):
        """The runner's fs handles local paths and file:// URIs only."""
        inline_runner = InlineMRJobRunner()
        fs = inline_runner.fs

        # sanity check: one path that exists, one file:// URI that doesn't
        existing_path = self.makefile('foo')
        missing_path = join(self.tmp_dir, 'bar')
        self.assertTrue(fs.exists(existing_path))
        self.assertFalse(fs.exists('file://' + missing_path))

        # any other URI scheme is an error rather than "doesn't exist"
        self.assertRaises(IOError, fs.exists, 's3://walrus/fish')
示例#32
0
    def test_recurse(self):
        """A conf file that includes itself is reported in the log."""
        conf_path = os.path.join(self.tmp_dir, 'LOL.conf')
        self_including_conf = {'include': conf_path}
        with open(conf_path, 'w') as conf_file:
            dump_mrjob_conf(self_including_conf, conf_file)

        captured = StringIO()
        with no_handlers_for_logger():
            log_to_stream('mrjob.conf', captured)
            InlineMRJobRunner(conf_path=conf_path)
            expected = ('%s tries to recursively include %s!'
                        % (conf_path, conf_path))
            self.assertIn(expected, captured.getvalue())
示例#33
0
    def test_create_mrjob_zip(self):
        """The mrjob zip holds only mrjob/ paths, job.py, and no .pyc."""
        with no_handlers_for_logger('mrjob.runner'):
            with InlineMRJobRunner(conf_paths=[]) as runner:
                mrjob_zip_path = runner._create_mrjob_zip()

                # read the listing inside a with: block so the archive
                # handle is closed instead of leaked
                with ZipFile(mrjob_zip_path) as mrjob_zip:
                    contents = mrjob_zip.namelist()

                for path in contents:
                    self.assertEqual(path[:6], 'mrjob/')

                self.assertIn('mrjob/job.py', contents)
                for filename in contents:
                    self.assertFalse(filename.endswith('.pyc'),
                                     msg="%s ends with '.pyc'" % filename)
示例#34
0
    def test_no_uris(self):
        """The runner's fs rejects URIs; its wrapped local fs does not."""
        inline_runner = InlineMRJobRunner()
        fs = inline_runner.fs

        # sanity check: one path that exists, one that doesn't
        existing_path = self.makefile('foo')
        missing_path = os.path.join(self.tmp_dir, 'bar')
        self.assertTrue(fs.exists(existing_path))
        self.assertFalse(fs.exists(missing_path))

        # a URI is an error at the top level rather than "doesn't exist"...
        self.assertRaises(IOError, fs.exists, 's3://walrus/fish')
        # ...whereas the local fs underneath just reports False
        self.assertFalse(inline_runner.fs.local.exists('s3://walrus/fish'))
示例#35
0
    def test_only_create_archive_once(self):
        """A second _create_dir_archive() call leaves the file untouched."""
        inline_runner = InlineMRJobRunner()
        source_dir = self._to_archive
        archive_path = inline_runner._dir_archive_path(source_dir)

        inline_runner._create_dir_archive(source_dir)
        first_mtime = os.stat(archive_path).st_mtime

        # wait so that a rewrite would show up as a different mtime
        sleep(1)
        inline_runner._create_dir_archive(source_dir)
        second_mtime = os.stat(archive_path).st_mtime

        self.assertEqual(first_mtime, second_mtime)
示例#36
0
    def test_passthrough(self):
        """fs methods reached through the runner work but log deprecation."""
        inline_runner = InlineMRJobRunner()

        with no_handlers_for_logger('mrjob.runner'):
            captured = StringIO()
            log_to_stream('mrjob.runner', captured)

            self.assertEqual(inline_runner.ls, inline_runner.fs.ls)
            # no special rules for underscore methods
            self.assertEqual(inline_runner._cat_file,
                             inline_runner.fs._cat_file)

            # both lookups above should have left a message behind
            expected_messages = (
                'deprecated: call InlineMRJobRunner.fs.ls() directly',
                'deprecated: call InlineMRJobRunner.fs._cat_file() directly',
            )
            for message in expected_messages:
                self.assertIn(message, captured.getvalue())
示例#37
0
    def test_only_create_archive_once(self):
        """A second _create_dir_archive() call leaves the file untouched."""
        inline_runner = InlineMRJobRunner()
        source_dir = self._to_archive
        archive_path = inline_runner._dir_archive_path(source_dir)

        inline_runner._create_dir_archive(source_dir)
        first_mtime = os.stat(archive_path).st_mtime

        # wait so that a rewrite would show up as a different mtime
        sleep(1)
        inline_runner._create_dir_archive(source_dir)
        second_mtime = os.stat(archive_path).st_mtime

        self.assertEqual(first_mtime, second_mtime)
示例#38
0
    def test_conf_contain_only_include_file(self):
        """A conf file consisting solely of includes logs nothing,
        provided the included files themselves are not empty.
        """
        # (conf file name, local_tmp_dir value) for the two dummy includes
        include_specs = (
            ('include_file_1', "include_file1_local_tmp_dir"),
            ('include_file_2', "include_file2_local_tmp_dir"),
        )

        include_paths = []
        for conf_name, local_tmp_dir in include_specs:
            included_conf = {
                'runners': {'inline': {'local_tmp_dir': local_tmp_dir}},
            }
            include_paths.append(self.save_conf(conf_name, included_conf))

        # the configuration under test: nothing but the include list
        top_level_conf = {'include': include_paths}
        conf_path = self.save_conf('twoincludefiles', top_level_conf)

        captured = StringIO()
        with no_handlers_for_logger():
            log_to_stream('mrjob.conf', captured)
            InlineMRJobRunner(conf_paths=[conf_path])
            self.assertEqual("", captured.getvalue())
示例#39
0
文件: job.py 项目: mtai/mrjob
    def make_runner(self):
        """Build a runner from the command-line arguments, so that this
        job can be launched on EMR, on Hadoop, or locally.

        Raises a usage error if one of the step-related switches is present
        in ``sys.argv``. If the ``runner`` option is ``'inline'``, the
        inline runner is returned; otherwise the parent class decides.

        :rtype: :py:class:`mrjob.runner.MRJobRunner`
        """
        step_switches = ('--steps', '--mapper', '--reducer', '--combiner',
                         '--step-num')
        # first offending switch, in the order listed above
        offender = next((w for w in step_switches if w in sys.argv), None)
        if offender is not None:
            raise UsageError("make_runner() was called with %s. This"
                             " probably means you tried to use it from"
                             " __main__, which doesn't work." % offender)

        # support inline runner when running from the MRJob itself
        from mrjob.inline import InlineMRJobRunner

        if self.options.runner != 'inline':
            return super(MRJob, self).make_runner()

        return InlineMRJobRunner(mrjob_cls=self.__class__,
                                 **self.inline_job_runner_kwargs())
示例#40
0
    def test_conf_contain_only_include_file(self):
        """A conf file consisting solely of includes logs nothing,
        provided the included files themselves are not empty.
        """
        # (conf file name, local_tmp_dir value) for the two dummy includes
        include_specs = (
            ('include_file_1', "include_file1_local_tmp_dir"),
            ('include_file_2', "include_file2_local_tmp_dir"),
        )

        include_paths = []
        for conf_name, local_tmp_dir in include_specs:
            included_conf = {
                'runners': {'inline': {'local_tmp_dir': local_tmp_dir}},
            }
            include_paths.append(self.save_conf(conf_name, included_conf))

        # the configuration under test: nothing but the include list
        top_level_conf = {'include': include_paths}
        conf_path = self.save_conf('twoincludefiles', top_level_conf)

        InlineMRJobRunner(conf_paths=[conf_path])

        self.assertFalse(self.log.called)
示例#41
0
 def setUp(self):
     """Run the parent fixture, then create the runner the tests share."""
     super(LocalFSTestCase, self).setUp()

     inline_runner = InlineMRJobRunner()
     self.runner = inline_runner
示例#42
0
    def test_empty(self):
        """With no script or owner, the key uses 'no_script' and the user."""
        job_key = InlineMRJobRunner(conf_paths=[]).get_job_key()
        parsed = _JOB_KEY_RE.match(job_key)

        self.assertEqual(parsed.group(1), 'no_script')
        self.assertEqual(parsed.group(2), getpass.getuser())
示例#43
0
    def setUp(self):
        """Save BASE_CONF to disk and record the opts a runner loads from it."""
        super(ClearTagTestCase, self).setUp()

        conf_path = self.save_conf('base.conf', self.BASE_CONF)
        self.base_conf_path = conf_path

        base_runner = InlineMRJobRunner(conf_paths=[self.base_conf_path])
        self.base_opts = base_runner._opts
示例#44
0
    def test_file(self):
        """Archiving a regular file (not a directory) raises OSError."""
        plain_file = self.makefile('qux')
        inline_runner = InlineMRJobRunner()

        self.assertRaises(
            OSError, inline_runner._create_dir_archive, plain_file)
示例#45
0
 def opts_for_conf(self, name, conf):
     """Save *conf* under *name* and return the opts a runner loads from it."""
     saved_path = self.save_conf(name, conf)
     return InlineMRJobRunner(conf_paths=[saved_path])._opts
示例#46
0
    def test_uri(self):
        """A URI gets an archive path named after its last component."""
        inline_runner = InlineMRJobRunner()

        # URIs aren't checked for existence or for being directories
        path = inline_runner._dir_archive_path('s3://bucket/stuff')
        archive_name = os.path.basename(path)

        self.assertEqual(archive_name, 'stuff.tar.gz')
示例#47
0
    def test_uri(self):
        """A URI gets an archive path named after its last component."""
        inline_runner = InlineMRJobRunner()

        # URIs aren't checked for existence or for being directories
        path = inline_runner._dir_archive_path('s3://bucket/stuff')
        archive_name = os.path.basename(path)

        self.assertEqual(archive_name, 'stuff.tar.gz')
示例#48
0
class TestCatOutput(SandboxedTestCase):
    """Tests for reading job output back through runner.cat_output()."""

    def setUp(self):
        super(TestCatOutput, self).setUp()

        job_output = os.path.join(self.tmp_dir, 'job_output')
        os.mkdir(job_output)
        self.output_dir = job_output

        self.runner = InlineMRJobRunner(conf_paths=[],
                                        output_dir=self.output_dir)

    def test_empty(self):
        chunks = list(self.runner.cat_output())
        self.assertEqual(chunks, [])

    def test_typical_output(self):
        out = self.output_dir

        # actual output
        self.makefile(os.path.join(out, 'part-00000'), b'line0\n')
        self.makefile(os.path.join(out, 'part-00001'), b'line1\n')

        # hidden .crc file
        self.makefile(os.path.join(out, '.crc.part-00000'), b'42\n')

        # hidden _SUCCESS file (ignore)
        self.makefile(os.path.join(out, '_SUCCESS'), b'such a relief!\n')

        # hidden _logs dir
        self.makefile(os.path.join(out, '_logs', 'log.xml'),
                      b'pretty much the usual\n')

        lines = sorted(to_lines(self.runner.cat_output()))
        self.assertEqual(lines, [b'line0\n', b'line1\n'])

    def test_output_in_subdirs(self):
        # test for output being placed in subdirs, for example with nicknack
        out = self.output_dir

        self.makefile(os.path.join(out, 'a', 'part-00000'), b'line-a0\n')
        self.makefile(os.path.join(out, 'a', 'part-00001'), b'line-a1\n')

        self.makefile(os.path.join(out, 'b', 'part-00000'), b'line-b0\n')

        # hidden file inside a subdir; not expected in the output
        self.makefile(os.path.join(out, 'b', '.crc.part-00000'), b'42\n')

        lines = sorted(to_lines(self.runner.cat_output()))
        self.assertEqual(lines, [b'line-a0\n', b'line-a1\n', b'line-b0\n'])

    def test_read_all_non_hidden_files(self):
        out = self.output_dir

        # neither file is named part-*, yet both are expected in the output
        self.makefile(os.path.join(out, 'baz'), b'qux\n')
        self.makefile(os.path.join(out, 'foo', 'bar'), b'baz\n')

        lines = sorted(to_lines(self.runner.cat_output()))
        self.assertEqual(lines, [b'baz\n', b'qux\n'])

    def test_empty_string_between_files(self):
        out = self.output_dir

        self.makefile(os.path.join(out, 'part-00000'), b'A')
        self.makefile(os.path.join(out, 'part-00001'), b'\n')
        self.makefile(os.path.join(out, 'part-00002'), b'C')

        # order isn't guaranteed, but there should be 3 chunks separated
        # by two empty strings
        chunks = list(self.runner.cat_output())
        self.assertEqual(len(chunks), 5)
        for separator_index in (1, 3):
            self.assertEqual(chunks[separator_index], b'')

    def test_output_dir_not_considered_hidden(self):
        # every component of the output dir looks "hidden"
        hidden_looking_dir = os.path.join(
            self.tmp_dir, '_hidden', '_output_dir')

        self.makefile(os.path.join(hidden_looking_dir, 'part-00000'),
                      b'cats\n')

        other_runner = InlineMRJobRunner(
            conf_paths=[], output_dir=hidden_looking_dir)

        lines = sorted(to_lines(other_runner.cat_output()))
        self.assertEqual(lines, [b'cats\n'])
示例#49
0
    def test_file(self):
        """Asking for the archive path of a regular file raises OSError."""
        plain_file = self.makefile('foo')
        inline_runner = InlineMRJobRunner()

        self.assertRaises(
            OSError, inline_runner._dir_archive_path, plain_file)
示例#50
0
 def test_missing_input(self):
     """Running with a bogus input path raises an exception."""
     bogus_inputs = ['/some/bogus/file/path']
     inline_runner = InlineMRJobRunner(input_paths=bogus_inputs)

     self.assertRaises(Exception, inline_runner._run)
示例#51
0
class TestCatOutput(SandboxedTestCase):
    """Tests for reading job output back through runner.cat_output().

    The deprecated stream_output() is exercised only by
    test_deprecated_stream_output().
    """

    def setUp(self):
        super(TestCatOutput, self).setUp()

        self.output_dir = os.path.join(self.tmp_dir, 'job_output')
        os.mkdir(self.output_dir)

        self.runner = InlineMRJobRunner(
            conf_paths=[], output_dir=self.output_dir)

    def test_empty(self):
        self.assertEqual(list(self.runner.cat_output()), [])

    def test_typical_output(self):
        # actual output
        self.makefile(os.path.join(self.output_dir, 'part-00000'),
                      b'line0\n')
        self.makefile(os.path.join(self.output_dir, 'part-00001'),
                      b'line1\n')

        # hidden .crc file
        self.makefile(os.path.join(self.output_dir, '.crc.part-00000'),
                      b'42\n')

        # hidden _SUCCESS file (ignore)
        self.makefile(os.path.join(self.output_dir, '_SUCCESS'),
                      b'such a relief!\n')

        # hidden _logs dir
        self.makefile(os.path.join(self.output_dir, '_logs', 'log.xml'),
                      b'pretty much the usual\n')

        self.assertEqual(sorted(to_lines(self.runner.cat_output())),
                         [b'line0\n', b'line1\n'])

    def test_output_in_subdirs(self):
        # test for output being placed in subdirs, for example with nicknack
        self.makefile(os.path.join(self.output_dir, 'a', 'part-00000'),
                      b'line-a0\n')
        self.makefile(os.path.join(self.output_dir, 'a', 'part-00001'),
                      b'line-a1\n')

        self.makefile(os.path.join(self.output_dir, 'b', 'part-00000'),
                      b'line-b0\n')

        # hidden file inside a subdir; not expected in the output
        self.makefile(os.path.join(self.output_dir, 'b', '.crc.part-00000'),
                      b'42\n')

        self.assertEqual(sorted(to_lines(self.runner.cat_output())),
                         [b'line-a0\n', b'line-a1\n', b'line-b0\n'])

    def test_read_all_non_hidden_files(self):
        # neither file is named part-*, yet both are expected in the output
        self.makefile(os.path.join(self.output_dir, 'baz'),
                      b'qux\n')

        self.makefile(os.path.join(self.output_dir, 'foo', 'bar'),
                      b'baz\n')

        self.assertEqual(sorted(to_lines(self.runner.cat_output())),
                         [b'baz\n', b'qux\n'])

    def test_empty_string_between_files(self):
        self.makefile(os.path.join(self.output_dir, 'part-00000'), b'A')
        self.makefile(os.path.join(self.output_dir, 'part-00001'), b'\n')
        self.makefile(os.path.join(self.output_dir, 'part-00002'), b'C')

        # order isn't guaranteed, but there should be 3 chunks separated
        # by two empty strings
        chunks = list(self.runner.cat_output())
        self.assertEqual(len(chunks), 5)
        self.assertEqual(chunks[1], b'')
        self.assertEqual(chunks[3], b'')

    def test_output_dir_not_considered_hidden(self):
        # every component of the output dir looks "hidden"
        output_dir = os.path.join(self.tmp_dir, '_hidden', '_output_dir')

        self.makefile(os.path.join(output_dir, 'part-00000'),
                      b'cats\n')

        runner = InlineMRJobRunner(conf_paths=[], output_dir=output_dir)

        # go through cat_output() like the rest of this class; the
        # deprecated stream_output() has its own dedicated test below
        self.assertEqual(sorted(to_lines(runner.cat_output())),
                         [b'cats\n'])

    def test_deprecated_stream_output(self):
        self.makefile(os.path.join(self.output_dir, 'part-00000'),
                      b'1\n2')
        self.makefile(os.path.join(self.output_dir, 'part-00001'),
                      b'3\n4\n')

        log = self.start(patch('mrjob.runner.log'))

        # should group output into lines, but not join across files
        self.assertEqual(sorted(self.runner.stream_output()),
                         [b'1\n', b'2', b'3\n', b'4\n'])

        # should issue deprecation warning
        self.assertEqual(log.warning.call_count, 1)
示例#52
0
 def test_extra_kwargs_in_mrjob_conf_okay(self):
     """Unknown options in the conf file are tolerated and left out of opts."""
     with logger_disabled('mrjob.runner'):
         inline_runner = InlineMRJobRunner(conf_paths=[self.path])
         loaded_opts = inline_runner._opts

         self.assertEqual(loaded_opts['setup'], ['echo foo'])
         self.assertNotIn('qux', loaded_opts)
示例#53
0
    def test_empty(self):
        """With no script or owner, the name uses 'no_script' and the user."""
        job_name = InlineMRJobRunner(conf_paths=[]).get_job_name()
        parsed = JOB_NAME_RE.match(job_name)

        self.assertEqual(parsed.group(1), 'no_script')
        self.assertEqual(parsed.group(2), getpass.getuser())
示例#54
0
 def setUp(self):
     """Create the runner (no conf files) that the tests share."""
     inline_runner = InlineMRJobRunner(conf_paths=[])
     self.runner = inline_runner
示例#55
0
    def test_empty(self):
        """With no script or owner, the name uses "no_script" and the user."""
        job_name = InlineMRJobRunner(conf_paths=[]).get_job_name()
        parsed = JOB_NAME_RE.match(job_name)

        self.assertEqual(parsed.group(1), "no_script")
        self.assertEqual(parsed.group(2), getpass.getuser())
示例#56
0
    def test_nonexistent_dir(self):
        """Archiving a directory that doesn't exist raises OSError."""
        inline_runner = InlineMRJobRunner()
        # path under tmp_dir that is never created
        missing_dir = os.path.join(self.tmp_dir, 'nonexistent')

        self.assertRaises(
            OSError, inline_runner._create_dir_archive, missing_dir)
示例#57
0
    def test_missing_dir(self):
        """Asking for the archive path of a missing dir raises OSError."""
        # path under tmp_dir that is never created
        missing_dir = os.path.join(self.tmp_dir, 'archive')
        inline_runner = InlineMRJobRunner()

        self.assertRaises(
            OSError, inline_runner._dir_archive_path, missing_dir)
示例#58
0
    def test_owner_and_label_kwargs(self):
        """Explicit owner/label kwargs show up in the generated job name."""
        kwargs = dict(conf_paths=[], owner="ads", label="ads_chain")
        inline_runner = InlineMRJobRunner(**kwargs)

        parsed = JOB_NAME_RE.match(inline_runner.get_job_name())

        # group 1 carries the label, group 2 the owner
        self.assertEqual(parsed.group(1), "ads_chain")
        self.assertEqual(parsed.group(2), "ads")