Example #1
    def test_pyspark_runner(self, spark_context):
        sc = spark_context.return_value.__enter__.return_value

        def mock_spark_submit(task):
            from luigi.contrib.pyspark_runner import PySparkRunner
            PySparkRunner(*task.app_command()[1:]).run()
            # Check py-package exists
            self.assertTrue(os.path.exists(sc.addPyFile.call_args[0][0]))
            # Check that main module containing the task exists.
            run_path = os.path.dirname(task.app_command()[1])
            self.assertTrue(
                os.path.exists(
                    os.path.join(run_path, os.path.basename(__file__))))
            # Check that the python path contains the run_path
            self.assertTrue(run_path in sys.path)
            # Check if find_class finds the class for the correct module name.
            with open(task.app_command()[1], 'rb') as fp:
                self.assertTrue(
                    pickle.Unpickler(fp).find_class('spark_test',
                                                    'TestPySparkTask'))

        with patch.object(SparkSubmitTask, 'run', mock_spark_submit):
            job = TestPySparkTask()
            with temporary_unloaded_module(b'') as task_module:
                with_config({'spark': {'py-packages': task_module}})(job.run)()

        sc.textFile.assert_called_with('input')
        sc.textFile.return_value.saveAsTextFile.assert_called_with('output')
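
All of the snippets on this page exercise with_config from luigi's test
helpers, which overlays a config dict onto the loaded luigi configuration
for the duration of a single call and then restores the original. A minimal
sketch of the pattern, assuming helpers.py from the luigi test suite is
importable; the 'demo' section and 'value' option are hypothetical:

    import luigi.configuration
    from helpers import with_config  # luigi test-suite helper (assumed importable)

    def read_option():
        # get_config() returns luigi's ConfigParser-style configuration.
        return luigi.configuration.get_config().get('demo', 'value')

    # with_config(conf) returns a decorator: the wrapped callable runs with
    # conf applied, and the previous configuration is restored afterwards.
    assert with_config({'demo': {'value': '42'}})(read_option)() == '42'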
Example #2
def run(self, result=None):
    conf = {
        'hdfs': {
            'client': 'webhdfs'
        },
        'webhdfs': {
            'port': str(self.cluster.webhdfs_port)
        },
    }
    with_config(conf)(super(WebHdfsTargetTest, self).run)(result)
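
Example #2 applies the same helper class-wide: overriding unittest's
TestCase.run means every test method in the class executes under the
temporary config. A minimal sketch of that pattern, with a hypothetical
section name:

    import unittest
    from helpers import with_config  # luigi test-suite helper (assumed importable)

    class MyTargetTest(unittest.TestCase):
        def run(self, result=None):
            # Every test method in this class now sees this configuration.
            conf = {'demo': {'value': 'x'}}
            with_config(conf)(super(MyTargetTest, self).run)(result)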
Example #3
    def test_pyspark_runner(self, spark_context):
        sc = spark_context.return_value.__enter__.return_value

        def mock_spark_submit(task):
            from luigi.contrib.pyspark_runner import PySparkRunner
            PySparkRunner(*task.app_command()[1:]).run()
            # Check py-package exists
            self.assertTrue(os.path.exists(sc.addPyFile.call_args[0][0]))

        with patch.object(SparkSubmitTask, 'run', mock_spark_submit):
            job = TestPySparkTask()
            with temporary_unloaded_module(b'') as task_module:
                with_config({'spark': {'py-packages': task_module}})(job.run)()

        sc.textFile.assert_called_with('input')
        sc.textFile.return_value.saveAsTextFile.assert_called_with('output')
Example #4
    def test_pyspark_session_runner_use_spark_session_true_spark1(self):
        pyspark = MagicMock()
        pyspark.__version__ = '1.6.3'
        pyspark_sql = MagicMock()
        with patch.dict(sys.modules, {
                'pyspark': pyspark,
                'pyspark.sql': pyspark_sql
        }):

            def mock_spark_submit(task):
                from luigi.contrib.pyspark_runner import PySparkSessionRunner
                self.assertRaises(
                    RuntimeError,
                    PySparkSessionRunner(*task.app_command()[1:]).run)

            with patch.object(SparkSubmitTask, 'run', mock_spark_submit):
                job = TestPySparkSessionTask()
                with temporary_unloaded_module(b'') as task_module:
                    with_config({'spark': {
                        'py-packages': task_module
                    }})(job.run)()
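
This example and the next fake the pyspark import itself: patch.dict on
sys.modules installs MagicMock objects under the module names, so any
"import pyspark" executed by the code under test resolves to the mock and
no Spark installation is required. A self-contained sketch of the trick:

    import sys
    from unittest.mock import MagicMock, patch

    fake_pyspark = MagicMock()
    fake_pyspark.__version__ = '2.1.0'  # version string the code under test reads

    with patch.dict(sys.modules, {'pyspark': fake_pyspark}):
        import pyspark  # resolved from sys.modules, i.e. the mock
        assert pyspark.__version__ == '2.1.0'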
Example #5
    def test_pyspark_session_runner_use_spark_session_true(self):
        pyspark = MagicMock()
        pyspark.__version__ = '2.1.0'
        pyspark_sql = MagicMock()
        with patch.dict(sys.modules, {
                'pyspark': pyspark,
                'pyspark.sql': pyspark_sql
        }):
            spark = pyspark_sql.SparkSession.builder.config.return_value.enableHiveSupport.return_value.getOrCreate.return_value
            sc = spark.sparkContext

            def mock_spark_submit(task):
                from luigi.contrib.pyspark_runner import PySparkSessionRunner
                PySparkSessionRunner(*task.app_command()[1:]).run()
                # Check py-package exists
                self.assertTrue(os.path.exists(sc.addPyFile.call_args[0][0]))
                # Check that main module containing the task exists.
                run_path = os.path.dirname(task.app_command()[1])
                self.assertTrue(
                    os.path.exists(
                        os.path.join(run_path, os.path.basename(__file__))))
                # Check that the python path contains the run_path
                self.assertTrue(run_path in sys.path)
                # Check if find_class finds the class for the correct module name.
                with open(task.app_command()[1], 'rb') as fp:
                    self.assertTrue(
                        pickle.Unpickler(fp).find_class(
                            'spark_test', 'TestPySparkSessionTask'))

            with patch.object(SparkSubmitTask, 'run', mock_spark_submit):
                job = TestPySparkSessionTask()
                with temporary_unloaded_module(b'') as task_module:
                    with_config({'spark': {
                        'py-packages': task_module
                    }})(job.run)()

            spark.sql.assert_called_with('input')
            spark.sql.return_value.write.saveAsTable.assert_called_with(
                'output')
            spark.stop.assert_called_once_with()
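
The closing assertions read the recorded call tree back off the mock:
spark.sql.return_value.write.saveAsTable.assert_called_with('output')
passes exactly when the task under test executed the chain
spark.sql('input').write.saveAsTable('output'), and
spark.stop.assert_called_once_with() verifies the runner shut the session
down when it finished.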
Example #6
def run_with_config(self, retcode_config, *args, **kwargs):
    with_config(dict(retcode=retcode_config))(self.run_and_expect)(*args, **kwargs)
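
Here dict(retcode=retcode_config) expands to {'retcode': {...}}, so each
call can override luigi's retcode section (options such as task_failed or
already_running) for a single run_and_expect invocation.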