Example #1
    def test_spark_master_yarn(self):
        runner = SparkMRJobRunner(spark_master='yarn')

        self.assertTrue(is_uri(runner._spark_tmp_dir))
        self.assertEqual(runner._spark_tmp_dir[:8], 'hdfs:///')

        self.assertIsNotNone(runner._upload_mgr)
Example #2
    def test_spark_master_mesos(self):
        runner = SparkMRJobRunner(spark_master='mesos://host:12345')

        self.assertTrue(is_uri(runner._spark_tmp_dir))
        self.assertEqual(runner._spark_tmp_dir[:8], 'hdfs:///')

        self.assertIsNotNone(runner._upload_mgr)
Example #3
    def test_default(self):
        runner = SparkMRJobRunner()

        self.assertFalse(is_uri(runner._spark_tmp_dir))
        self.assertIsNone(runner._upload_mgr)

        self.assertEqual(runner._spark_tmp_dir[-6:], '-spark')
Example #4
    def test_explicit_spark_tmp_dir_uri(self):
        runner = SparkMRJobRunner(spark_master='mesos://host:12345',
                                  spark_tmp_dir='s3://walrus/tmp')

        self.assertTrue(runner._spark_tmp_dir.startswith('s3://walrus/tmp/'))
        self.assertGreater(len(runner._spark_tmp_dir), len('s3://walrus/tmp/'))

        self.assertIsNotNone(runner._upload_mgr)
Example #5
    def test_explicit_spark_tmp_dir_path(self):
        # posixpath.join() and os.path.join() are the same on UNIX
        self.start(patch('os.path.join', lambda *paths: '/./'.join(paths)))

        runner = SparkMRJobRunner(spark_tmp_dir='/path/to/tmp')

        self.assertTrue(runner._spark_tmp_dir.startswith('/path/to/tmp/./'))
        self.assertGreater(len(runner._spark_tmp_dir), len('/path/to/tmp/./'))

        self.assertIsNone(runner._upload_mgr)
Example #6
    def test_ignore_format_and_sort_kwargs(self):
        # hadoop formats and SORT_VALUES are read directly from the job,
        # so the runner's constructor ignores the corresponding kwargs
        #
        # see #2022

        # same set up as test_sort_values(), above
        runner = SparkMRJobRunner(
            mr_job_script=MRSortAndGroup.mr_job_script(),
            mrjob_cls=MRSortAndGroup,
            stdin=BytesIO(
                b'alligator\nactuary\nbowling\nartichoke\nballoon\nbaby\n'),
            hadoop_input_format='TerribleInputFormat',
            hadoop_output_format='AwfulOutputFormat',
            sort_values=False)

        runner.run()

        self.assertEqual(
            dict(MRSortAndGroup().parse_output(runner.cat_output())),
            dict(a=['actuary', 'alligator', 'artichoke'],
                 b=['baby', 'balloon', 'bowling']))
Example #7
    def test_spark_master_local(self):
        runner = SparkMRJobRunner(spark_master='local[*]')

        self.assertFalse(is_uri(runner._spark_tmp_dir))
        self.assertIsNone(runner._upload_mgr)
Example #8
    def test_local_uri_with_non_local_runner(self):
        SparkMRJobRunner(spark_tmp_dir='/tmp',
                         spark_master='mesos://host:12345')

        self.assertTrue(self.log.warning.called)
Example #9
    def test_non_local_uri_with_local_runner(self):
        SparkMRJobRunner(spark_tmp_dir='s3://walrus/tmp')

        self.assertTrue(self.log.warning.called)
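
These methods are all excerpted from a single test case in mrjob's test suite, so they are not runnable on their own. The sketch below shows the kind of imports and scaffolding they assume; the class name, the logger patch in setUp(), and the tests.sandbox / tests.mr_sort_and_group import paths are assumptions based on how mrjob's tests are typically structured, not something taken from the examples themselves.

# Minimal scaffolding sketch for the snippets above (assumptions, not
# copied from the examples): the class name is hypothetical, and the
# SandboxedTestCase base with its start() helper and the patched module
# logger mirror mrjob's test conventions rather than the exact originals.
from io import BytesIO
from unittest.mock import patch

from mrjob.parse import is_uri
from mrjob.spark.runner import SparkMRJobRunner

from tests.mr_sort_and_group import MRSortAndGroup  # job used in Example #6
from tests.sandbox import SandboxedTestCase  # provides self.start(patcher)


class SparkTmpDirTestCase(SandboxedTestCase):  # hypothetical name

    def setUp(self):
        super().setUp()
        # Examples #8 and #9 check self.log.warning.called, so the module
        # logger is assumed to be replaced with a mock here
        self.log = self.start(patch('mrjob.spark.runner.log'))

    # ... the test_* methods shown in the examples above go here ...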