def test_defaults(self):
    step = SparkStep(spark=spark_func)

    self.assertEqual(step.spark, spark_func)
    self.assertEqual(step.spark_args, [])

    self.assertEqual(
        step.description(0),
        dict(type='spark', jobconf={}, spark_args=[]),
    )
def test_all_args(self):
    step = SparkStep(spark=spark_func, spark_args=['argh', 'argh'])

    self.assertEqual(step.spark, spark_func)
    self.assertEqual(step.spark_args, ['argh', 'argh'])

    self.assertEqual(
        step.description(0),
        dict(type='spark', jobconf={}, spark_args=['argh', 'argh']),
    )
def test_spark_method(self):
    j = MRJob(['--no-conf'])
    j.spark = MagicMock()

    self.assertEqual(j.steps(), [SparkStep(j.spark)])

    self.assertEqual(
        j._steps_desc(),
        [dict(type='spark', jobconf={}, spark_args=[])])
def test_spark_and_spark_args_methods(self):
    j = MRJob(['--no-conf'])
    j.spark = MagicMock()
    j.spark_args = MagicMock(return_value=['argh', 'ARRRRGH!'])

    self.assertEqual(
        j.steps(),
        [SparkStep(j.spark, spark_args=['argh', 'ARRRRGH!'])])

    self.assertEqual(
        j._steps_desc(),
        [dict(type='spark', jobconf={}, spark_args=['argh', 'ARRRRGH!'])])
def test_spark_with_step_num(self):
    job = MRJob(['--step-num=1', '--spark', 'input_dir', 'output_dir'])

    mapper = MagicMock()
    spark = MagicMock()

    job.steps = Mock(
        return_value=[MRStep(mapper=mapper), SparkStep(spark)])

    job.execute()

    spark.assert_called_once_with('input_dir', 'output_dir')
    self.assertFalse(mapper.called)
def steps(self): """Re-define this to make a multi-step job. If you don't re-define this, we'll automatically create a one-step job using any of :py:meth:`mapper`, :py:meth:`mapper_init`, :py:meth:`mapper_final`, :py:meth:`reducer_init`, :py:meth:`reducer_final`, and :py:meth:`reducer` that you've re-defined. For example:: def steps(self): return [MRStep(mapper=self.transform_input, reducer=self.consolidate_1), MRStep(reducer_init=self.log_mapper_init, reducer=self.consolidate_2)] :return: a list of steps constructed with :py:class:`~mrjob.step.MRStep` or other classes in :py:mod:`mrjob.step`. """ # only include methods that have been redefined kwargs = dict( (func_name, getattr(self, func_name)) for func_name in _JOB_STEP_FUNC_PARAMS + ('spark',) if (_im_func(getattr(self, func_name)) is not _im_func(getattr(MRJob, func_name)))) # special case for spark() # TODO: support jobconf as well if 'spark' in kwargs: if sorted(kwargs) != ['spark']: raise ValueError( "Can't mix spark() and streaming functions") return [SparkStep( spark=kwargs['spark'], spark_args=self.spark_args())] # MRStep takes commands as strings, but the user defines them in the # class as functions that return strings, so call the functions. updates = {} for k, v in kwargs.items(): if k.endswith('_cmd') or k.endswith('_pre_filter'): updates[k] = v() kwargs.update(updates) if kwargs: return [MRStep(**kwargs)] else: return []
def test_positional_spark_arg(self):
    step1 = SparkStep(spark_func)
    step2 = SparkStep(spark=spark_func)

    self.assertEqual(step1, step2)
    self.assertEqual(step1.description(0), step2.description(0))
def steps(self):
    return [
        MRStep(mapper=self.mapper),
        SparkStep(self.spark),
    ]
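# A fuller sketch around the steps() override above, assuming it lives in
# an MRJob subclass: mixing a streaming MRStep with a SparkStep is only
# possible by re-defining steps() explicitly, since the auto-generated
# steps() raises ValueError when spark() is mixed with streaming
# functions. The class name and method bodies are illustrative
# assumptions; this is the same two-step shape that
# test_spark_with_step_num() above selects from with --step-num.
from mrjob.job import MRJob
from mrjob.step import MRStep, SparkStep


class MRHybridSketch(MRJob):

    def mapper(self, _, line):
        # placeholder streaming mapper: emit each line unchanged
        yield None, line

    def spark(self, input_path, output_path):
        # placeholder Spark step body
        pass

    def steps(self):
        return [
            MRStep(mapper=self.mapper),
            SparkStep(self.spark),
        ]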