Example #1
File: job.py  Project: Yelp/mrjob
    def steps(self):
        """Re-define this to make a multi-step job.

        If you don't re-define this, we'll automatically create a one-step
        job using any of :py:meth:`mapper`, :py:meth:`mapper_init`,
        :py:meth:`mapper_final`, :py:meth:`reducer_init`,
        :py:meth:`reducer_final`, and :py:meth:`reducer` that you've
        re-defined. For example::

            def steps(self):
                return [MRStep(mapper=self.transform_input,
                               reducer=self.consolidate_1),
                        MRStep(reducer_init=self.log_mapper_init,
                               reducer=self.consolidate_2)]

        :return: a list of steps constructed with
                 :py:class:`~mrjob.step.MRStep` or other classes in
                 :py:mod:`mrjob.step`.
        """
        # only include methods that have been redefined
        kwargs = dict(
            (func_name, getattr(self, func_name))
            for func_name in _JOB_STEP_FUNC_PARAMS + ('spark',)
            if (_im_func(getattr(self, func_name)) is not
                _im_func(getattr(MRJob, func_name))))

        # special case for spark()
        # TODO: support jobconf as well
        if 'spark' in kwargs:
            if sorted(kwargs) != ['spark']:
                raise ValueError(
                    "Can't mix spark() and streaming functions")
            return [SparkStep(
                spark=kwargs['spark'],
                spark_args=self.spark_args())]

        # MRStep takes commands as strings, but the user defines them in the
        # class as functions that return strings, so call the functions.
        updates = {}
        for k, v in kwargs.items():
            if k.endswith('_cmd') or k.endswith('_pre_filter'):
                updates[k] = v()

        kwargs.update(updates)

        if kwargs:
            return [MRStep(**kwargs)]
        else:
            return []
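
The sketch below is a usage illustration (not part of the mrjob source) of what the default steps() above does: the job redefines only mapper() and reducer(), so the inherited steps() returns a single MRStep wrapping those two methods. The class name MRWordFreqCount and the word-count logic are assumptions made for the example.

from mrjob.job import MRJob


class MRWordFreqCount(MRJob):
    # Only mapper() and reducer() are redefined, so the inherited steps()
    # shown above returns [MRStep(mapper=self.mapper, reducer=self.reducer)].

    def mapper(self, _, line):
        # emit (word, 1) for each whitespace-separated word
        for word in line.split():
            yield word.lower(), 1

    def reducer(self, word, counts):
        # sum the per-word counts for each word
        yield word, sum(counts)


if __name__ == '__main__':
    MRWordFreqCount.run()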
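
For the spark() special case handled above, here is a hedged sketch in the spirit of mrjob's Spark documentation: a job that redefines only spark() should get back a single SparkStep from steps(). The class name and the word-count logic inside spark() are illustrative assumptions, not taken from the original file.

from mrjob.job import MRJob


class MRSparkWordcount(MRJob):

    def spark(self, input_path, output_path):
        # pyspark is typically only importable where Spark actually runs,
        # so import it inside spark() rather than at module level
        from pyspark import SparkContext

        sc = SparkContext(appName='mrjob spark wordcount sketch')

        (sc.textFile(input_path)
           .flatMap(lambda line: line.split())
           .map(lambda word: (word, 1))
           .reduceByKey(lambda a, b: a + b)
           .saveAsTextFile(output_path))

        sc.stop()


if __name__ == '__main__':
    # Because only spark() is redefined, steps() returns [SparkStep(...)].
    MRSparkWordcount.run()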