Example #1
    def _fit_transform_data_container(
            self, data_container: DataContainer,
            context: ExecutionContext) -> ('BaseStep', DataContainer):
        """
        According to the idiom of `(1, 2, reversed(1))`, we do this, in order:

            - `1`. Fit Transform preprocessing step
            - `2`. Fit Transform postprocessing step
            - `reversed(1)`. Inverse transform preprocessing step

        :param data_container: data container to transform
        :param context: execution context
        :return: (self, data_container)
        """
        self["preprocessing_step"], data_container = self[
            "preprocessing_step"].handle_fit_transform(
                data_container, context.push(self["preprocessing_step"]))
        self["postprocessing_step"], data_container = self[
            "postprocessing_step"].handle_fit_transform(
                data_container, context.push(self["postprocessing_step"]))

        data_container = self["preprocessing_step"].handle_inverse_transform(
            data_container, context.push(self["preprocessing_step"]))

        current_ids = self.hash(data_container)
        data_container.set_current_ids(current_ids)

        return self, data_container
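
A minimal standalone sketch of the `(1, 2, reversed(1))` ordering above, using hypothetical `Scale` and `Offset` steps rather than Neuraxle's real classes: apply the preprocessing transform, then the postprocessing transform, then undo step 1 with the preprocessing step's inverse.

    # Hypothetical stand-in steps; not Neuraxle classes.
    class Scale:
        def __init__(self, factor):
            self.factor = factor

        def transform(self, x):
            return [v * self.factor for v in x]

        def inverse_transform(self, x):
            return [v / self.factor for v in x]

    class Offset:
        def __init__(self, delta):
            self.delta = delta

        def transform(self, x):
            return [v + self.delta for v in x]

    pre, post = Scale(10.0), Offset(1.0)
    data = pre.transform([1.0, 2.0, 3.0])   # 1. preprocessing
    data = post.transform(data)             # 2. postprocessing
    data = pre.inverse_transform(data)      # reversed(1): undo preprocessing
    print(data)  # [1.1, 2.1, 3.1]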
Example #2
    def _fit_transform_data_container(
            self, data_container: DataContainer,
            context: ExecutionContext) -> ('BaseStep', DataContainer):
        """
        Fit transform the given data by splitting it into a training and a
        validation set, scoring both, then predicting on the full data container.

        :param data_container: data container to fit transform on
        :type data_container: DataContainer
        :param context: execution context
        :type context: ExecutionContext
        :return: (self, data_container)
        """
        train_data_container, validation_data_container = self.split_data_container(data_container)

        # add sub data container for the validation metrics calculated in MetricsWrapper
        train_data_container.add_sub_data_container(
            name=VALIDATION_SUB_DATA_CONTAINER_NAME,
            data_container=validation_data_container
        )

        self.wrapped, results_data_container = self.wrapped.handle_fit_transform(train_data_container,
                                                                                 context.push(self.wrapped))

        self._update_scores_train(results_data_container.data_inputs, results_data_container.expected_outputs)

        results_data_container = self.wrapped.handle_predict(validation_data_container, context.push(self.wrapped))

        self._update_scores_validation(results_data_container.data_inputs, results_data_container.expected_outputs)

        self.wrapped.apply('disable_metrics')
        data_container = self.wrapped.handle_predict(data_container, context.push(self.wrapped))
        self.wrapped.apply('enable_metrics')

        return self, data_container
Example #3
    def handle_transform(self, data_container: DataContainer, context: ExecutionContext) -> DataContainer:
        """
        According to the idiom of `(1, 2, reversed(1))`, we do this, in order:

            - `1`. Transform preprocessing step
            - `2`. Transform postprocessing step
            - `reversed(1)`. Inverse transform preprocessing step

        :param data_container: data container to transform
        :type data_container: DataContainer
        :param context: execution context
        :type context: ExecutionContext
        :return: data_container
        :rtype: DataContainer
        """
        data_container = self["preprocessing_step"].handle_transform(data_container,
                                                                     context.push(self["preprocessing_step"]))
        data_container = self["postprocessing_step"].handle_transform(data_container,
                                                                      context.push(self["postprocessing_step"]))

        data_container = self["preprocessing_step"].handle_inverse_transform(data_container,
                                                                             context.push(self["preprocessing_step"]))

        current_ids = self.hash(data_container)
        data_container.set_current_ids(current_ids)

        return data_container
Example #4
    def handle_transform(self, data_container: DataContainer,
                         context: ExecutionContext):
        """
        Transform the data with the unions. It will make use of some parallel processing.

        :param data_container: data container
        :param context: execution context
        :return: the transformed data_inputs.
        """
        if self.n_jobs != 1:
            data_containers = Parallel(
                backend=self.backend,
                n_jobs=self.n_jobs)(delayed(step.handle_transform)(
                    data_container.copy(), context.push(step))
                                    for _, step in self.steps_as_tuple)
        else:
            data_containers = [
                step.handle_transform(data_container.copy(),
                                      context.push(step))
                for _, step in self.steps_as_tuple
            ]

        new_current_ids = self.hash(data_container)

        data_container = self.joiner.handle_transform(data_containers,
                                                      new_current_ids)

        return data_container
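
The fan-out above relies on joblib's `Parallel` and `delayed`; a minimal sketch of that pattern with plain functions standing in for the steps (copying the data mirrors `data_container.copy()`, so each branch works on its own view):

    from joblib import Parallel, delayed

    def square(x):
        return [v * v for v in x]

    def negate(x):
        return [-v for v in x]

    steps = [square, negate]
    data = [1, 2, 3]
    # One delayed call per step, each on its own copy of the data.
    results = Parallel(n_jobs=2)(delayed(step)(list(data)) for step in steps)
    print(results)  # [[1, 4, 9], [-1, -2, -3]]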
Example #5
    def handle_fit(self, data_container: DataContainer,
                   context: ExecutionContext):
        """
        Fit the parallel steps on the data. It will make use of some parallel processing.

        :param data_container: The input data to fit onto
        :param context: execution context
        :return: (self, data_container)
        """
        # Actually fit:
        if self.n_jobs != 1:
            fitted_steps_data_containers = Parallel(
                backend=self.backend,
                n_jobs=self.n_jobs)(delayed(step.handle_fit)(
                    data_container.copy(), context.push(step))
                                    for _, step in self.steps_as_tuple)
        else:
            fitted_steps_data_containers = [
                step.handle_fit(data_container.copy(), context.push(step))
                for _, step in self.steps_as_tuple
            ]

        # Save fitted steps
        for i, (fitted_step, _) in enumerate(fitted_steps_data_containers):
            self.steps_as_tuple[i] = (self.steps_as_tuple[i][0], fitted_step)
        self._refresh_steps()

        return self, data_container
Example #6
    def _fit_transform_data_container(
            self, data_container: DataContainer,
            context: ExecutionContext) -> ('BaseStep', DataContainer):
        """
        Fit Transform given data inputs without splitting.

        :param context:
        :param data_container: DataContainer
        :type data_container: DataContainer
        :type context: ExecutionContext
        :return: outputs
        """
        train_data_container, validation_data_container = self.split_data_container(
            data_container)

        self.wrapped, results_data_container = self.wrapped.handle_fit_transform(
            train_data_container, context.push(self.wrapped))

        self._update_scores_train(results_data_container.data_inputs,
                                  results_data_container.expected_outputs)

        results_data_container = self.wrapped.handle_predict(
            validation_data_container, context.push(self.wrapped))

        self._update_scores_validation(results_data_container.data_inputs,
                                       results_data_container.expected_outputs)

        self.wrapped.apply('disable_metrics')
        data_container = self.wrapped.handle_predict(
            data_container, context.push(self.wrapped))
        self.wrapped.apply('enable_metrics')

        return self, data_container
Example #7
    def handle_inverse_transform(self, data_container: DataContainer, context: ExecutionContext) -> DataContainer:
        """
        Handle inverse transform by passing expected outputs to the wrapped step inverse transform method.
        Update the expected outputs with the outputs.

        :param data_container: data container whose expected outputs to inverse transform
        :param context: execution context
        :return: data container
        :rtype: DataContainer
        """
        new_expected_outputs_data_container = self.wrapped.handle_inverse_transform(
            DataContainer(
                current_ids=data_container.current_ids,
                data_inputs=data_container.expected_outputs,
                expected_outputs=None
            ),
            context.push(self.wrapped)
        )

        data_container.set_expected_outputs(new_expected_outputs_data_container.data_inputs)

        current_ids = self.hash(data_container)
        data_container.set_current_ids(current_ids)

        return data_container
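
A minimal sketch of the swap the docstring describes, with a plain dict standing in for `DataContainer` and an upper-casing function as the wrapped step's inverse transform (both hypothetical):

    def wrapped_inverse_transform(data_inputs):
        return [v.upper() for v in data_inputs]

    container = {"data_inputs": [1, 2], "expected_outputs": ["a", "b"]}
    # The expected outputs are fed to the wrapped step as its data inputs,
    # and its output replaces the expected outputs.
    container["expected_outputs"] = wrapped_inverse_transform(container["expected_outputs"])
    print(container)  # {'data_inputs': [1, 2], 'expected_outputs': ['A', 'B']}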
Example #8
    def join_transform(self, step: Pipeline, data_container: DataContainer, context: ExecutionContext) -> DataContainer:
        """
        Concatenate the pipeline transform output of each batch of self.batch_size together.

        :param step: pipeline to transform on
        :type step: Pipeline
        :param data_container: data container to transform
        :type data_container: DataContainer
        :param context: execution context
        :return: transformed data container
        :rtype: DataContainer
        """
        context = context.push(step)

        data_container_batches = data_container.convolved_1d(
            stride=self.batch_size,
            kernel_size=self.batch_size
        )

        output_data_container = ListDataContainer.empty()
        for data_container_batch in data_container_batches:
            output_data_container.concat(
                step._transform_data_container(data_container_batch, context)
            )

        return output_data_container
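
The batch-join loop above amounts to: cut the data into fixed-size windows, transform each, and concatenate. A minimal sketch with a plain list in place of a `DataContainer` and `list.extend` standing in for `ListDataContainer.concat`:

    def minibatches(items, batch_size):
        for i in range(0, len(items), batch_size):
            yield items[i:i + batch_size]

    def transform(batch):
        return [v + 1 for v in batch]

    output = []
    for batch in minibatches(list(range(7)), batch_size=3):
        output.extend(transform(batch))  # concatenate, like ListDataContainer.concat
    print(output)  # [1, 2, 3, 4, 5, 6, 7]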
Example #9
    def fit_data_container(self, data_container):
        """
        Hash the data container, build a FIT execution context, and fit on it.

        :param data_container: data container to fit on
        :return: fitted self
        """
        data_container = self.hash_data_container(data_container)
        context = ExecutionContext(self.cache_folder, ExecutionMode.FIT)
        context = context.push(self)
        new_self = self._fit_data_container(data_container, context)

        return new_self
Example #10
    def join_transform(self, step: Pipeline, data_container: DataContainer,
                       context: ExecutionContext) -> DataContainer:
        """
        Concatenate the pipeline transform output of each batch of self.batch_size together.

        :param step: pipeline to transform on
        :type step: Pipeline
        :param data_container: data container to transform
        :type data_container: DataContainer
        :param context: execution context
        :return: transformed data container
        :rtype: DataContainer
        """
        context = context.push(step)
        data_container_batches = data_container.minibatches(
            batch_size=self.batch_size,
            include_incomplete_batch=self.include_incomplete_batch,
            default_value_data_inputs=self.default_value_data_inputs,
            default_value_expected_outputs=self.default_value_expected_outputs)

        output_data_container = ListDataContainer.empty()
        for data_container_batch in data_container_batches:
            output_data_container.concat(
                step._transform_data_container(data_container_batch, context))

        return output_data_container
Example #11
    def join_fit_transform(
            self, step: Pipeline, data_container: DataContainer,
            context: ExecutionContext) -> Tuple['Any', DataContainer]:
        """
        Concatenate the pipeline fit transform output of each batch of self.batch_size together.

        :param step: pipeline to fit transform on
        :type step: Pipeline
        :param data_container: data container to fit transform on
        :type data_container: DataContainer
        :param context: execution context
        :return: fitted self, transformed data inputs
        :rtype: Tuple[Any, DataContainer]
        """
        context = context.push(step)

        data_container_batches = data_container.convolved_1d(
            stride=self.batch_size, kernel_size=self.batch_size)

        output_data_container = ListDataContainer.empty()
        for data_container_batch in data_container_batches:
            step, data_container_batch = step._fit_transform_data_container(
                data_container_batch, context)
            output_data_container.concat(data_container_batch)

        return step, output_data_container
Example #12
    def handle_fit_transform(self, data_container: DataContainer, context: ExecutionContext) -> \
            Tuple['MiniBatchSequentialPipeline', DataContainer]:
        """
        Fit transform all sub-pipelines split by the Barrier steps.

        :param data_container: data container to fit transform
        :param context: execution context
        :return: (self, data_container)
        """
        sub_pipelines = self._create_sub_pipelines()
        index_start = 0

        for sub_pipeline in sub_pipelines:
            sub_context = context.push(sub_pipeline)
            sub_pipeline.setup()

            barrier = sub_pipeline[-1]
            sub_pipeline, data_container = barrier.join_fit_transform(
                step=sub_pipeline,
                data_container=data_container,
                context=sub_context)
            current_ids = self.hash(data_container)
            data_container.set_current_ids(current_ids)

            new_self = self[:index_start] + sub_pipeline
            if index_start + len(sub_pipeline) < len(self):
                new_self += self[index_start + len(sub_pipeline):]

            self.steps_as_tuple = new_self.steps_as_tuple
            index_start += len(sub_pipeline)

        return self, data_container
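
How `_create_sub_pipelines` cuts the pipeline is not shown here; presumably it splits the step list after each Barrier, so every sub-pipeline ends with its barrier (note the `sub_pipeline[-1]` lookup above). A minimal sketch of that splitting, using strings as stand-in steps:

    BARRIER = "Barrier"  # stand-in marker; the real check would be on the step type

    def split_at_barriers(steps):
        subs, current = [], []
        for step in steps:
            current.append(step)
            if step == BARRIER:
                subs.append(current)
                current = []
        if current:
            subs.append(current)
        return subs

    print(split_at_barriers(["a", "b", BARRIER, "c", BARRIER]))
    # [['a', 'b', 'Barrier'], ['c', 'Barrier']]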
Example #13
    def _load_checkpoint(
            self, data_container: DataContainer,
            context: ExecutionContext) -> Tuple[NamedTupleList, DataContainer]:
        """
        Try loading a pipeline cache with the passed data container.
        If pipeline cache loading succeeds, find steps left to do,
        and load the latest data container.

        :param data_container: the data container to resume
        :param context: the execution context to resume
        :return: tuple(steps left to do, last checkpoint data container)
        """
        new_starting_step_index, starting_step_data_container = \
            self._get_starting_step_info(data_container, context)

        loaded_pipeline = self.load(context)
        if not self.are_steps_before_index_the_same(loaded_pipeline,
                                                    new_starting_step_index):
            return self.steps_as_tuple, data_container

        self._assign_loaded_pipeline_into_self(loaded_pipeline)

        step = self[new_starting_step_index]
        if isinstance(step, Checkpoint):
            context = context.push(step)
            starting_step_data_container = step.read_checkpoint(
                starting_step_data_container, context)

        return self[new_starting_step_index:], starting_step_data_container
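
A guess at the resume-point search that `_get_starting_step_info` performs (an assumption; the snippet does not show its body): walk backwards and resume at the latest step with a saved checkpoint.

    def find_starting_step_index(steps, has_checkpoint):
        # The latest checkpointed step is where execution can resume.
        for index in range(len(steps) - 1, -1, -1):
            if has_checkpoint(steps[index]):
                return index
        return 0

    steps = ["load", "clean", "checkpoint", "train"]
    print(find_starting_step_index(steps, lambda s: s == "checkpoint"))  # 2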
Example #14
    def _fit_transform_core(
            self, data_container: DataContainer,
            context: ExecutionContext) -> ('Pipeline', DataContainer):
        """
        After loading the last checkpoint, fit transform each remaining pipeline step.

        :param data_container: the data container to fit transform on
        :param context: execution context
        :return: tuple(pipeline, data_container)
        """
        steps_left_to_do, data_container = self._load_checkpoint(
            data_container, context)
        self.setup()

        new_steps_as_tuple: NamedTupleList = []

        for step_name, step in steps_left_to_do:
            step.setup()
            sub_step_context = context.push(step)

            step, data_container = step.handle_fit_transform(
                data_container, sub_step_context)

            new_steps_as_tuple.append((step_name, step))

        self.steps_as_tuple = self.steps_as_tuple[:len(self.steps_as_tuple) - len(steps_left_to_do)] + \
                              new_steps_as_tuple

        return self, data_container
Example #15
    def join_fit_transform(self, step: Pipeline, data_container: DataContainer, context: ExecutionContext) -> \
            Tuple['Any', DataContainer]:
        """
        Concatenate the pipeline fit transform output of each batch of self.batch_size together.

        :param step: pipeline to fit transform on
        :type step: Pipeline
        :param data_container: data container to fit transform on
        :type data_container: DataContainer
        :param context: execution context
        :return: fitted self, transformed data inputs
        :rtype: Tuple[Any, DataContainer]
        """
        context = context.push(step)
        data_container_batches = data_container.minibatches(
            batch_size=self.batch_size,
            include_incomplete_batch=self.include_incomplete_batch,
            default_value_data_inputs=self.default_value_data_inputs,
            default_value_expected_outputs=self.default_value_expected_outputs)

        output_data_container = ListDataContainer.empty()
        for data_container_batch in data_container_batches:
            step, data_container_batch = step._fit_transform_data_container(
                data_container_batch, context)
            output_data_container.concat(data_container_batch)

        return step, output_data_container
Example #16
    def should_resume(self, data_container: DataContainer, context: ExecutionContext) -> bool:
        """
        Return True if the wrapped step can resume from the expanded data container.
        """
        context = context.push(self)
        expanded_data_container = ExpandedDataContainer.create_from(data_container)

        if isinstance(self.wrapped, ResumableStepMixin) and \
                self.wrapped.should_resume(expanded_data_container, context):
            return True

        return False
Example #17
    def fit_transform_data_container(self, data_container):
        """
        Hash the data container, build a FIT_TRANSFORM execution context,
        then fit transform and return the fitted self with the outputs.

        :param data_container: data container to fit transform on
        :return: (fitted self, transformed data inputs)
        """
        data_container = self.hash_data_container(data_container)
        context = ExecutionContext(root=self.cache_folder,
                                   execution_mode=ExecutionMode.FIT_TRANSFORM)
        context = context.push(self)
        new_self, data_container = self._fit_transform_data_container(
            data_container, context)

        return new_self, data_container.data_inputs
Example #18
    def should_resume(self, data_container: DataContainer,
                      context: ExecutionContext):
        context = context.push(self)

        if isinstance(self.wrapped, ResumableStepMixin) and \
                self.wrapped.should_resume(data_container, context):
            return True
        return False
Example #19
    def transform_data_container(self, data_container: DataContainer):
        """
        Hash the data container, build a TRANSFORM execution context,
        then transform and return the data inputs.

        :param data_container: data container to transform
        :return: transformed data inputs
        """
        data_container = self.hash_data_container(data_container)
        context = ExecutionContext(root=self.cache_folder,
                                   execution_mode=ExecutionMode.TRANSFORM)
        context = context.push(self)
        data_container = self._transform_data_container(
            data_container, context)

        return data_container.data_inputs
Example #20
    def handle_transform(self, data_container: DataContainer, context: ExecutionContext) -> DataContainer:
        """
        Call the transform callback function on the data inputs, then delegate
        the transform to the wrapped step.

        :param data_container: data container
        :type data_container: DataContainer
        :param context: execution context
        :type context: ExecutionContext
        :return: data container
        :rtype: DataContainer
        """
        self.transform_callback_function(data_container.data_inputs, *self.more_arguments)
        return self.wrapped.handle_transform(data_container, context.push(self.wrapped))
Example #21
    def handle_fit(self, data_container: DataContainer,
                   context: ExecutionContext) -> (BaseStep, DataContainer):
        """
        Fit the wrapped step on the expected outputs used as its data inputs,
        then re-hash the data container.

        :param data_container: data container whose expected outputs to fit on
        :param context: execution context
        :return: (self, data_container)
        """
        self.wrapped = self.wrapped.handle_fit(
            DataContainer(current_ids=data_container.current_ids,
                          data_inputs=data_container.expected_outputs,
                          expected_outputs=None), context.push(self.wrapped))

        current_ids = self.hash(data_container)
        data_container.set_current_ids(current_ids)

        return self, data_container
Example #22
    def _transform_data_container(self, data_container: DataContainer, context: ExecutionContext):
        """
        Transform given data inputs without splitting.

        :param data_container: data container to transform
        :type data_container: DataContainer
        :param context: execution context
        :type context: ExecutionContext
        :return: transformed data container
        """
        return self.wrapped.handle_transform(data_container, context.push(self.wrapped))
Example #23
    def handle_fit(self, data_container: DataContainer, context: ExecutionContext) -> 'ReversiblePreprocessingWrapper':
        """
        Handle fit by fitting preprocessing step, and postprocessing step.

        :param data_container: data container to fit on
        :type data_container: DataContainer
        :param context: execution context
        :type context: ExecutionContext
        :return: self
        :rtype: ReversiblePreprocessingWrapper
        """
        self["preprocessing_step"], data_container = \
            self["preprocessing_step"].handle_fit_transform(data_container, context.push(self["preprocessing_step"]))
        self["postprocessing_step"] = \
            self["postprocessing_step"].handle_fit(data_container, context.push(self["postprocessing_step"]))

        current_ids = self.hash(data_container)
        data_container.set_current_ids(current_ids)

        return self
Example #24
    def _fit_transform_data_container(
            self, data_container: DataContainer,
            context: ExecutionContext) -> ('BaseStep', DataContainer):
        """
        Fit Transform given data inputs without splitting.

        :param context:
        :param data_container: DataContainer
        :type data_container: DataContainer
        :type context: ExecutionContext
        :return: outputs
        """
        train_data_container, validation_data_container = self.split_data_container(
            data_container)

        self.wrapped, _ = self.wrapped.handle_fit(train_data_container,
                                                  context.push(self.wrapped))

        results_data_container = self.wrapped.handle_transform(
            train_data_container, context.push(self.wrapped))

        self._update_scores_train(results_data_container.data_inputs,
                                  results_data_container.expected_outputs)

        if self.run_validation_split_in_test_mode:
            self.set_train(False)

        results_data_container = self.wrapped.handle_transform(
            validation_data_container, context.push(self.wrapped))

        self.set_train(True)

        self._update_scores_validation(results_data_container.data_inputs,
                                       results_data_container.expected_outputs)

        data_container = self.wrapped.handle_transform(
            data_container, context.push(self.wrapped))

        return self, data_container
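
A minimal sketch of the split-then-score flow above, with ordinary lists in place of data containers and mean absolute error as a hypothetical metric:

    def split(pairs, ratio=0.75):
        cut = int(len(pairs) * ratio)
        return pairs[:cut], pairs[cut:]

    def mae(predictions, truths):
        return sum(abs(p - t) for p, t in zip(predictions, truths)) / len(predictions)

    def model(x):
        return 2 * x + 0.1  # pretend this was fitted on the training half

    data = [(x, 2 * x) for x in range(8)]
    train, validation = split(data)
    train_score = mae([model(x) for x, _ in train], [y for _, y in train])
    validation_score = mae([model(x) for x, _ in validation], [y for _, y in validation])
    print(train_score, validation_score)  # both ~0.1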
Example #25
    def handle_transform(self, data_container: DataContainer,
                         context: ExecutionContext) -> DataContainer:
        """
        Handle transform by passing the expected outputs to the wrapped step's
        transform method, then updating the expected outputs with the result.
        """
        new_expected_outputs_data_container = self.wrapped.handle_transform(
            DataContainer(current_ids=data_container.current_ids,
                          data_inputs=data_container.expected_outputs,
                          expected_outputs=None), context.push(self.wrapped))

        data_container.set_expected_outputs(
            new_expected_outputs_data_container.data_inputs)

        current_ids = self.hash(data_container)
        data_container.set_current_ids(current_ids)

        return data_container
Example #26
    def should_resume(self, data_container: DataContainer, context: ExecutionContext) -> bool:
        """
        Return True if the pipeline has a saved checkpoint that it can resume from

        :param context: execution context
        :param data_container: the data container to resume
        :return: bool
        """
        context = context.push(self)
        for _, step in reversed(self.items()):
            if hasattr(step, 'should_resume') and step.should_resume(data_container, context):
                return True

        return False
Example #27
    def handle_inverse_transform(self, data_container: DataContainer, context: ExecutionContext) -> DataContainer:
        """
        :param context: execution context
        :type context: ExecutionContext
        :param data_container: data containerj
        :type data_container: DataContainer
        :return: data container
        :rtype: DataContainer
        """
        self.inverse_transform_callback_function(data_container.data_inputs, *self.more_arguments)
        data_container = self.wrapped.handle_inverse_transform(data_container, context.push(self.wrapped))
        current_ids = self.hash(data_container)
        data_container.set_current_ids(current_ids)

        return data_container
Example #28
    def transform(self, data_inputs: Any):
        """
        After loading the last checkpoint, transform each pipeline step.

        :param data_inputs: the data input to transform
        :return: transformed data inputs
        """
        data_container = DataContainer(current_ids=None, data_inputs=data_inputs)

        data_container = self.hash_data_container(data_container)
        context = ExecutionContext(root=self.cache_folder, execution_mode=ExecutionMode.TRANSFORM)
        context = context.push(self)
        data_container = self._transform_data_container(data_container, context)

        return data_container.data_inputs
Example #29
    def join_transform(self, step: TruncableSteps, data_container: DataContainer,
                       context: ExecutionContext) -> ZipDataContainer:
        """
        Transform the given data container batch by batch, then combine the
        per-batch outputs into a ZipDataContainer.
        """
        context = context.push(step)
        data_container_batches = data_container.minibatches(
            batch_size=self.batch_size,
            keep_incomplete_batch=self.keep_incomplete_batch,
            default_value_data_inputs=self.default_value_data_inputs,
            default_value_expected_outputs=self.default_value_expected_outputs
        )

        output_data_container = []
        for data_container_batch in data_container_batches:
            output_data_container.append(step._transform_data_container(data_container_batch, context))

        return ZipDataContainer.create_from(*output_data_container)
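
`ZipDataContainer.create_from(*output_data_container)` combines the per-batch results into one container; assuming it pairs the given containers element-wise like Python's `zip` (an assumption about its semantics), the effect is roughly:

    batch_a = [1, 2, 3]
    batch_b = ["x", "y", "z"]
    # Assumed behavior: pair the containers element-wise.
    print(list(zip(batch_a, batch_b)))  # [(1, 'x'), (2, 'y'), (3, 'z')]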
Example #30
    def join_fit_transform(self, step: Pipeline, data_container: DataContainer, context: ExecutionContext) -> \
            Tuple['Any', DataContainer]:
        """
        Fit transform the given data container batch by batch, then combine the
        per-batch outputs into a ZipDataContainer.
        """
        context = context.push(step)
        data_container_batches = data_container.minibatches(
            batch_size=self.batch_size,
            keep_incomplete_batch=self.keep_incomplete_batch,
            default_value_data_inputs=self.default_value_data_inputs,
            default_value_expected_outputs=self.default_value_expected_outputs
        )

        output_data_container = []
        for data_container_batch in data_container_batches:
            step, data_container_batch = step._fit_transform_data_container(data_container_batch, context)
            output_data_container.append(data_container_batch)

        return step, ZipDataContainer.create_from(*output_data_container)