Example #1
File: base.py Project: ysheng312/polyaxon
    def _call_graph_fn(self, inputs):
        """Calls graph function.

        First creates one or two graphs, i.e. the train and target graphs,
        then returns the optimal action given an exploration policy.

        If `dueling` is set to `True`,
        another layer is added that represents the state value.

        Args:
            inputs: `Tensor` or `dict` of tensors
        """
        graph_fn = self._build_graph_fn()

        if self.use_target_graph:
            # We create 2 graphs: a training graph and a target graph,
            # so that we can copy one graph to another given a frequency.
            self._train_graph = FunctionModule(mode=self.mode,
                                               build_fn=graph_fn,
                                               name='train')
            self._train_results = self._train_graph(inputs=inputs)
            self._target_graph = FunctionModule(mode=self.mode,
                                                build_fn=graph_fn,
                                                name='target')
            self._target_results = self._target_graph(inputs=inputs)
            return self._build_actions()
        else:
            self._train_results = graph_fn(mode=self.mode, inputs=inputs)
            self._target_results = self._train_results
            return self._build_actions()
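
Example #1 above wires a user-supplied `_graph_fn` through `FunctionModule`. For context, here is a minimal sketch of the kind of `(mode, inputs)` graph function this is expected to wrap; the layer sizes and the use of `tf.layers` are illustrative assumptions, not part of the project code:

import tensorflow as tf  # TF1-style API, matching the snippet above

def my_graph_fn(mode, inputs):
    # Produce the state representation that the Q-model later projects
    # into its advantage/value heads.
    hidden = tf.layers.dense(inputs, units=64, activation=tf.nn.relu, name='hidden1')
    hidden = tf.layers.dense(hidden, units=64, activation=tf.nn.relu, name='hidden2')
    return hidden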
Example #2
    def _call_graph_fn(self, features, labels=None):
        """Calls graph function.

        First creates one or two graphs, i.e. the train and target graphs,
        then returns the optimal action given an exploration policy.

        If `dueling` is set to `True`,
        another layer is added that represents the state value.

        Args:
            features: `Tensor` or `dict` of tensors
            labels: `Tensor` or `dict` of tensors
        """
        graph_fn = self._build_graph_fn()

        if self.use_target_graph:
            # We create 2 graphs: a training graph and a target graph,
            # so that we can copy one graph to another given a frequency.
            self._train_graph = FunctionModule(
                mode=self.mode, build_fn=graph_fn, name='train')
            self._train_results = self._train_graph(features=features, labels=labels)
            self._target_graph = FunctionModule(
                mode=self.mode, build_fn=graph_fn, name='target')
            self._target_results = self._target_graph(features=features, labels=labels)
            return self._build_actions()
        else:
            self._train_results = graph_fn(mode=self.mode, features=features, labels=labels)
            self._target_results = self._train_results
            return self._build_actions()
Example #3
class BaseQModel(BaseRLModel):
    """Base reinforcement learning model class.

    Args:
        mode: `str`, Specifies if this is training, evaluation or prediction. See `Modes`.
        graph_fn: Graph function. Follows the signature:
            * Args:
                * `mode`: Specifies if this is training, evaluation or prediction. See `Modes`.
                * `features`: the feature inputs.
        loss: An instance of `LossConfig`.
        num_states: `int`. The number of states.
        num_actions: `int`. The number of actions.
        optimizer: An instance of `OptimizerConfig`. Default value `Adam`.
        metrics: a list of `MetricConfig` instances.
        discount: `float`. The discount factor on the target Q values.
        exploration_config: An instance of `ExplorationConfig`.
        use_target_graph: `bool`. Whether to use a second “target” network,
            which we will use to compute target Q values during our updates.
        target_update_frequency: `int`. At which frequency to update the target graph.
            Only used when `use_target_graph` is set to `True`.
        is_continuous: `bool`. Is the model built for a continuous or discrete space.
        dueling: `str` or `bool`. Whether to compute the advantage and value functions separately.
            Options:
                * `True`: creates advantage and state value without any further computation.
                * `mean`, `max`, and `naive`: creates advantage and state value, and computes
                  Q = V(s) + A(s, a)
                  where A = A - mean(A) or A = A - max(A) or A = A.
        use_expert_demo: Whether to pretrain the model on human/expert data.
        summaries: `str` or `list`. The verbosity of the tensorboard visualization.
            Possible values: `all`, `activations`, `loss`, `learning_rate`, `variables`, `gradients`
        clip_gradients: `float`. Gradient clipping by global norm.
        clip_embed_gradients: `float`. Embedding gradients clipping to a specified value.
        name: `str`, the name of this model, everything will be encapsulated inside this scope.

     Returns:
        `EstimatorSpec`
    """
    def __init__(self,
                 mode,
                 graph_fn,
                 num_states,
                 num_actions,
                 loss=None,
                 optimizer=None,
                 metrics=None,
                 discount=0.97,
                 exploration_config=None,
                 use_target_graph=True,
                 target_update_frequency=5,
                 is_continuous=False,
                 dueling='mean',
                 use_expert_demo=False,
                 summaries='all',
                 clip_gradients=0.5,
                 clip_embed_gradients=0.1,
                 name="Model"):
        self.num_states = num_states
        self.num_actions = num_actions
        self.exploration_config = exploration_config
        self.discount = discount
        self.use_target_graph = use_target_graph
        self.target_update_frequency = target_update_frequency
        self.is_continuous = is_continuous
        self.dueling = dueling
        self.use_expert_demo = use_expert_demo
        loss = loss or HuberLossConfig()

        super(BaseQModel,
              self).__init__(mode=mode,
                             name=name,
                             model_type=self.Types.RL,
                             graph_fn=graph_fn,
                             loss=loss,
                             optimizer=optimizer,
                             metrics=metrics,
                             summaries=summaries,
                             clip_gradients=clip_gradients,
                             clip_embed_gradients=clip_embed_gradients)

        self._train_graph = None
        self._target_graph = None

    def _build_exploration(self):
        """Creates the exploration op.

        TODO: Think about whether we should pass the episode number here or internally by
        changing the optimize_loss function????
        """
        if self.exploration_config:
            return getters.get_exploration(self.exploration_config.IDENTIFIER,
                                           **self.exploration_config.to_dict())
        return None

    def _build_actions(self):
        """Create the chosen action with an exploration policy.

        If inference mode is used, the actions are chosen directly without exploration.
        """
        batch_size = get_tensor_batch_size(self._train_results.q)
        exploration = self._build_exploration()

        if self.is_continuous:
            if exploration is None or Modes.is_infer(self.mode):
                return self._train_results.q

            # use exploration
            return self._train_results.q + exploration
        else:
            self._index_action = tf.argmax(self._train_results.q, axis=1)
            if exploration is None or Modes.is_infer(self.mode):
                return self._index_action

            # use exploration
            exploration_size = tf.concat(axis=0, values=[
                batch_size,
            ])
            should_explore = tf.random_uniform((), 0, 1) < exploration
            random_actions = tf.random_uniform(exploration_size, 0,
                                               self.num_actions, tf.int64)
            return tf.cond(should_explore, lambda: random_actions,
                           lambda: self._index_action)

    def _build_graph_fn(self):
        """Create the new graph_fn based on the one specified by the user.

        The structure of the graph is the following:
            1 - call the graph specified by the user.
            2 - create the advantage action probabilities, and the state value.
            3 - return the probabilities; if a dueling method is specified,
                calculate the new probabilities.
        Returns:
            `function`. The graph function. The graph function must return a QModelSpec.
        """
        def graph_fn(mode, features, labels=None):
            kwargs = {}
            if 'labels' in get_arguments(self._graph_fn):
                kwargs['labels'] = labels

            graph_outputs = self._graph_fn(mode=mode,
                                           features=features,
                                           **kwargs)
            a = Dense(units=self.num_actions)(graph_outputs)
            v = None

            if self.dueling is not None:
                # Q = V(s) + A(s, a)
                v = Dense(units=1)(graph_outputs)
                if self.dueling == 'mean':
                    q = v + (a - tf.reduce_mean(a, axis=1, keep_dims=True))
                elif self.dueling == 'max':
                    q = v + (a - tf.reduce_max(a, axis=1, keep_dims=True))
                elif self.dueling == 'naive':
                    q = v + a
                elif self.dueling is True:
                    q = tf.identity(a)
                else:
                    raise ValueError("The value `{}` provided for "
                                     "dueling is unsupported.".format(
                                         self.dueling))
            else:
                q = tf.identity(a)

            return QModelSpec(graph_outputs=graph_outputs, a=a, v=v, q=q)

        return graph_fn

    def _call_graph_fn(self, features, labels=None):
        """Calls graph function.

        First creates one or two graphs, i.e. the train and target graphs,
        then returns the optimal action given an exploration policy.

        If `dueling` is set to `True`,
        another layer is added that represents the state value.

        Args:
            features: `Tensor` or `dict` of tensors
            labels: `Tensor` or `dict` of tensors
        """
        set_learning_phase(Modes.is_train(self.mode))

        graph_fn = self._build_graph_fn()

        if self.use_target_graph:
            # We create 2 graphs: a training graph and a target graph,
            # so that we can copy one graph to another given a frequency.
            self._train_graph = FunctionModule(mode=self.mode,
                                               build_fn=graph_fn,
                                               name='train')
            self._train_results = self._train_graph(features=features,
                                                    labels=labels)
            self._target_graph = FunctionModule(mode=self.mode,
                                                build_fn=graph_fn,
                                                name='target')
            self._target_results = self._target_graph(features=features,
                                                      labels=labels)
            return self._build_actions()
        else:
            self._train_results = graph_fn(mode=self.mode,
                                           features=features,
                                           labels=labels)
            self._target_results = self._train_results
            return self._build_actions()

    def _build_update_target_graph(self):
        """Creates a copy operation from train graph to target graph."""
        if self.use_target_graph:
            return self._target_graph.copy_from(self._train_graph)
        else:
            return None

    def _build_train_op(self, loss):
        """Creates the training operation,

        In case of use_target_network == True, we append also the update op
        while taking into account the update_frequency.
        """
        train_op = super(BaseQModel, self)._build_train_op(loss)

        # check if we need to update the target graph
        if self.use_target_graph:
            update_op = tf.cond(
                tf.equal(
                    tf.mod(training.get_global_step(),
                           self.target_update_frequency),
                    0), self._build_update_target_graph,
                lambda: tf.no_op(name='no_op_copy_target'))

            # append the target update op to the train op.
            train_op = tf.group(*[train_op, update_op],
                                name='train_and_update_target')

        return train_op

    def _build_predictions(self, results, features, labels):
        """Creates the dictionary of predictions that is returned by the model."""
        predictions = super(BaseQModel,
                            self)._build_predictions(results=results,
                                                     features=features,
                                                     labels=labels)
        # Add The QModelSpec values to predictions
        predictions['graph_results'] = self._train_results.graph_outputs
        predictions['a'] = self._train_results.a
        predictions['q'] = self._train_results.q
        if self._train_results.v is not None:
            predictions['v'] = self._train_results.v
        return predictions
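
The class docstring above lists the dueling options (`mean`, `max`, `naive`, `True`). A quick NumPy illustration of how the three aggregations combine the advantage head `a` and the state-value head `v` into Q values (the numbers are made up):

import numpy as np

a = np.array([[1.0, 2.0, 3.0]])  # advantage head: one state, three actions
v = np.array([[0.5]])            # state-value head

q_mean = v + (a - a.mean(axis=1, keepdims=True))   # dueling='mean'  -> [[-0.5  0.5  1.5]]
q_max = v + (a - a.max(axis=1, keepdims=True))     # dueling='max'   -> [[-1.5 -0.5  0.5]]
q_naive = v + a                                    # dueling='naive' -> [[ 1.5  2.5  3.5]]

With `dueling=True`, the code above simply uses the advantage head as `q` (`q = tf.identity(a)`).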
Example #4
class BaseQModel(BaseRLModel):
    """Base reinforcement learning model class.

    Args:
        mode: `str`, Specifies if this is training, evaluation or prediction. See `Modes`.
        graph_fn: Graph function. Follows the signature:
            * Args:
                * `mode`: Specifies if this is training, evaluation or prediction. See `Modes`.
                * `features`: the feature inputs.
        loss_config: An instance of `LossConfig`.
        num_states: `int`. The number of states.
        num_actions: `int`. The number of actions.
        optimizer_config: An instance of `OptimizerConfig`. Default value `Adam`.
        eval_metrics_config: a list of `MetricConfig` instances.
        discount: `float`. The discount factor on the target Q values.
        exploration_config: An instance of `ExplorationConfig`.
        use_target_graph: `bool`. Whether to use a second “target” network,
            which we will use to compute target Q values during our updates.
        target_update_frequency: `int`. At which frequency to update the target graph.
            Only used when `use_target_graph` is set to `True`.
        is_continuous: `bool`. Is the model built for a continuous or discrete space.
        dueling: `str` or `bool`. Whether to compute the advantage and value functions separately.
            Options:
                * `True`: creates advantage and state value without any further computation.
                * `mean`, `max`, and `naive`: creates advantage and state value, and computes
                  Q = V(s) + A(s, a)
                  where A = A - mean(A) or A = A - max(A) or A = A.
        use_expert_demo: Whether to pretrain the model on human/expert data.
        summaries: `str` or `list`. The verbosity of the tensorboard visualization.
            Possible values: `all`, `activations`, `loss`, `learning_rate`, `variables`, `gradients`
        clip_gradients: `float`. Gradient clipping by global norm.
        clip_embed_gradients: `float`. Embedding gradients clipping to a specified value.
        name: `str`, the name of this model, everything will be encapsulated inside this scope.

     Returns:
        `EstimatorSpec`
    """

    def __init__(self, mode, graph_fn, num_states, num_actions, loss_config=None,
                 optimizer_config=None, eval_metrics_config=None, discount=0.97,
                 exploration_config=None, use_target_graph=True, target_update_frequency=5,
                 is_continuous=False, dueling='mean', use_expert_demo=False, summaries='all',
                 clip_gradients=0.5, clip_embed_gradients=0.1, name="Model"):
        self.num_states = num_states
        self.num_actions = num_actions
        self.exploration_config = exploration_config
        self.discount = discount
        self.use_target_graph = use_target_graph
        self.target_update_frequency = target_update_frequency
        self.is_continuous = is_continuous
        self.dueling = dueling
        self.use_expert_demo = use_expert_demo
        loss_config = loss_config or LossConfig(module='huber_loss')

        super(BaseQModel, self).__init__(
            mode=mode, name=name, model_type=self.Types.RL, graph_fn=graph_fn,
            loss_config=loss_config, optimizer_config=optimizer_config,
            eval_metrics_config=eval_metrics_config, summaries=summaries,
            clip_gradients=clip_gradients, clip_embed_gradients=clip_embed_gradients)

        self._train_graph = None
        self._target_graph = None

    def _build_exploration(self):
        """Creates the exploration op.

        TODO: Think about whether we should pass the episode number here or internally by
        changing the optimize_loss function????
        """
        if self.exploration_config:
            return getters.get_exploration(self.exploration_config.module,
                                           **self.exploration_config.params)
        return None

    def _build_actions(self):
        """Create the chosen action with an exploration policy.

        If inference mode is used, the actions are chosen directly without exploration.
        """
        batch_size = get_tensor_batch_size(self._train_results.q)
        exploration = self._build_exploration()

        if self.is_continuous:
            if exploration is None or Modes.is_infer(self.mode):
                return self._train_results.q

            # use exploration
            return self._train_results.q + exploration
        else:
            self._index_action = tf.argmax(self._train_results.q, axis=1)
            if exploration is None or Modes.is_infer(self.mode):
                return self._index_action

            # use exploration
            exploration_size = tf.concat(axis=0, values=[batch_size, ])
            should_explore = tf.random_uniform((), 0, 1) < exploration
            random_actions = tf.random_uniform(exploration_size, 0, self.num_actions, tf.int64)
            return tf.cond(should_explore, lambda: random_actions, lambda: self._index_action)

    def _build_graph_fn(self):
        """Create the new graph_fn based on the one specified by the user.

        The structure of the graph is the following:
            1 - call the graph specified by the user.
            2 - create the advantage action probabilities, and the state value.
            3 - return the probabilities; if a dueling method is specified,
                calculate the new probabilities.
        Returns:
            `function`. The graph function. The graph function must return a QModelSpec.
        """
        def graph_fn(mode, features, labels=None):
            kwargs = {}
            if 'labels' in get_arguments(self._graph_fn):
                kwargs['labels'] = labels

            graph_outputs = self._graph_fn(mode=mode, features=features, **kwargs)
            a = FullyConnected(mode, num_units=self.num_actions)(graph_outputs)
            v = None

            if self.dueling is not None:
                # Q = V(s) + A(s, a)
                v = FullyConnected(mode, num_units=1)(graph_outputs)
                if self.dueling == 'mean':
                    q = v + (a - tf.reduce_mean(a, axis=1, keep_dims=True))
                elif self.dueling == 'max':
                    q = v + (a - tf.reduce_max(a, axis=1, keep_dims=True))
                elif self.dueling == 'naive':
                    q = v + a
                elif self.dueling is True:
                    q = tf.identity(a)
                else:
                    raise ValueError("The value `{}` provided for "
                                     "dueling is unsupported.".format(self.dueling))
            else:
                q = tf.identity(a)

            return QModelSpec(graph_outputs=graph_outputs, a=a, v=v, q=q)

        return graph_fn

    def _call_graph_fn(self, features, labels=None):
        """Calls graph function.

        First creates one or two graphs, i.e. the train and target graphs,
        then returns the optimal action given an exploration policy.

        If `dueling` is set to `True`,
        another layer is added that represents the state value.

        Args:
            features: `Tensor` or `dict` of tensors
            labels: `Tensor` or `dict` of tensors
        """
        graph_fn = self._build_graph_fn()

        if self.use_target_graph:
            # We create 2 graphs: a training graph and a target graph,
            # so that we can copy one graph to another given a frequency.
            self._train_graph = FunctionModule(
                mode=self.mode, build_fn=graph_fn, name='train')
            self._train_results = self._train_graph(features=features, labels=labels)
            self._target_graph = FunctionModule(
                mode=self.mode, build_fn=graph_fn, name='target')
            self._target_results = self._target_graph(features=features, labels=labels)
            return self._build_actions()
        else:
            self._train_results = graph_fn(mode=self.mode, features=features, labels=labels)
            self._target_results = self._train_results
            return self._build_actions()

    def _build_update_target_graph(self):
        """Creates a copy operation from train graph to target graph."""
        if self.use_target_graph:
            return self._target_graph.copy_from(self._train_graph)
        else:
            return None

    def _build_train_op(self, loss):
        """Creates the training operation,

        In case of use_target_network == True, we append also the update op
        while taking into account the update_frequency.
        """
        train_op = super(BaseQModel, self)._build_train_op(loss)

        # check if we need to update the target graph
        if self.use_target_graph:
            update_op = tf.cond(
                tf.equal(tf.mod(training.get_global_step(), self.target_update_frequency), 0),
                self._build_update_target_graph,
                lambda: tf.no_op(name='no_op_copy_target'))

            # append the target update op to the train op.
            train_op = tf.group(*[train_op, update_op], name='train_and_update_target')

        return train_op

    def _build_predictions(self, results, features, labels):
        """Creates the dictionary of predictions that is returned by the model."""
        predictions = super(BaseQModel, self)._build_predictions(
            results=results, features=features, labels=labels)
        # Add The QModelSpec values to predictions
        predictions['graph_results'] = self._train_results.graph_outputs
        predictions['a'] = self._train_results.a
        predictions['q'] = self._train_results.q
        if self._train_results.v is not None:
            predictions['v'] = self._train_results.v
        return predictions
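
`_build_actions` in the classes above uses `tf.cond` to flip between a random action and the greedy `argmax` action. A stripped-down, self-contained sketch of that epsilon-greedy mechanism (TF1-style API; the fixed `exploration = 0.1` and the toy shapes are assumptions standing in for what the exploration op would normally provide):

import tensorflow as tf  # TF1-style graph API, as in the examples

num_actions = 4
q_values = tf.placeholder(tf.float32, shape=[None, num_actions])  # [batch, num_actions]
exploration = tf.constant(0.1)                                    # epsilon; normally built from ExplorationConfig
batch_shape = tf.shape(q_values)[:1]

greedy_actions = tf.argmax(q_values, axis=1)
random_actions = tf.random_uniform(batch_shape, 0, num_actions, tf.int64)
should_explore = tf.random_uniform((), 0, 1) < exploration

# Take random actions for the whole batch with probability epsilon,
# otherwise the greedy ones, mirroring the tf.cond in _build_actions.
actions = tf.cond(should_explore, lambda: random_actions, lambda: greedy_actions)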
Example #5
class RLBaseModel(BaseModel):
    """Base reinforcement learning model class.

    Args:
        mode: `str`, Specifies if this is training, evaluation or prediction. See `Modes`.
        graph_fn: Graph function. Follows the signature:
            * Args:
                * `mode`: Specifies if this is training, evaluation or prediction. See `Modes`.
                * `inputs`: the feature inputs.
        loss_config: An instance of `LossConfig`.
        num_states: `int`. The number of states.
        num_actions: `int`. The number of actions.
        optimizer_config: An instance of `OptimizerConfig`. Default value `Adam`.
        eval_metrics_config: a list of `MetricConfig` instances.
        discount: `float`. The discount factor on the target Q values.
        exploration_config: An instance of `ExplorationConfig`.
        use_target_graph: `bool`. Whether to use a second “target” network,
            which we will use to compute target Q values during our updates.
        target_update_frequency: `int`. At which frequency to update the target graph.
            Only used when `use_target_graph` is set to `True`.
        is_continuous: `bool`. Is the model built for a continuous or discrete space.
        dueling: `str` or `bool`. Whether to compute the advantage and value functions separately.
            Options:
                * `True`: creates advantage and state value without any further computation.
                * `mean`, `max`, and `naive`: creates advantage and state value, and computes
                  Q = V(s) + A(s, a)
                  where A = A - mean(A) or A = A - max(A) or A = A.
        use_expert_demo: Whether to pretrain the model on human/expert data.
        summaries: `str` or `list`. The verbosity of the tensorboard visualization.
            Possible values: `all`, `activations`, `loss`, `learning_rate`, `variables`, `gradients`
        clip_gradients: `float`. Gradient clipping by global norm.
        clip_embed_gradients: `float`. Embedding gradients clipping to a specified value.
        name: `str`, the name of this model, everything will be encapsulated inside this scope.

     Returns:
        `EstimatorSpec`
    """
    def __init__(self,
                 mode,
                 graph_fn,
                 num_states,
                 num_actions,
                 loss_config=None,
                 optimizer_config=None,
                 eval_metrics_config=None,
                 discount=0.97,
                 exploration_config=None,
                 use_target_graph=True,
                 target_update_frequency=5,
                 is_continuous=False,
                 dueling='mean',
                 use_expert_demo=False,
                 summaries='all',
                 clip_gradients=0.5,
                 clip_embed_gradients=0.1,
                 name="Model"):
        self.num_states = num_states
        self.num_actions = num_actions
        self.exploration_config = exploration_config
        self.discount = discount
        self.use_target_graph = use_target_graph
        self.target_update_frequency = target_update_frequency
        self.is_continuous = is_continuous
        self.dueling = dueling
        self.use_expert_demo = use_expert_demo
        loss_config = loss_config or LossConfig(module='huber_loss')
        super(RLBaseModel,
              self).__init__(mode=mode,
                             name=name,
                             model_type=self.Types.RL,
                             graph_fn=graph_fn,
                             loss_config=loss_config,
                             optimizer_config=optimizer_config,
                             eval_metrics_config=eval_metrics_config,
                             summaries=summaries,
                             clip_gradients=clip_gradients,
                             clip_embed_gradients=clip_embed_gradients)

        self._train_graph = None
        self._target_graph = None

    def _build_eval_metrics(self, results, features, labels):
        pass

    def _build_exploration(self):
        """Creates the exploration op.

        TODO: Think about whether we should pass the episode number here or internally by
        changing the optimize_loss function????
        """
        if self.exploration_config:
            return getters.get_exploration(self.exploration_config.module,
                                           **self.exploration_config.params)
        return None

    def _build_actions(self):
        """Create the chosen action with an exploration policy."""
        # TODO: Add possibility to deactivate the exploration in inference mode.
        batch_size = get_tensor_batch_size(self._train_results['q'])
        exploration_size = tf.concat(axis=0, values=[
            batch_size,
        ])

        exploration = self._build_exploration()

        if self.is_continuous:
            if exploration is None:
                return self._train_results['q']

            # use exploration
            return self._train_results['q'] + exploration
        else:
            self._index_action = tf.argmax(self._train_results['q'], axis=1)
            if exploration is None:
                return self._index_action
            # use exploration
            should_explore = tf.random_uniform((), 0, 1) < exploration
            random_actions = tf.random_uniform(exploration_size, 0,
                                               self.num_actions, tf.int64)
            return tf.cond(should_explore, lambda: random_actions,
                           lambda: self._index_action)

    def _build_graph_fn(self):
        """Create the new graph_fn based on the one specified by the user.

        The structure of the graph is the following:
            1 - call the graph specified by the user.
            2 - create the advantage action probabilities, and the state value.
            3 - return the probabilities; if a dueling method is specified,
                calculate the new probabilities.
        Returns:
            `function`. The graph function.
        """
        def graph_fn(mode, inputs):
            graph_results = self._graph_fn(mode=mode, inputs=inputs)
            a = FullyConnected(mode, num_units=self.num_actions)(graph_results)
            v = None
            q = a
            if self.dueling is not None:
                # Q = V(s) + A(s, a)
                v = FullyConnected(mode, num_units=1)(graph_results)
                if self.dueling == 'mean':
                    q = v + (a - tf.reduce_mean(a, axis=1, keep_dims=True))
                elif self.dueling == 'max':
                    q = v + (a - tf.reduce_max(a, axis=1, keep_dims=True))
                elif self.dueling == 'naive':
                    q = v + a
                elif self.dueling is True:
                    q = a

            return {'graph_results': graph_results, 'a': a, 'v': v, 'q': q}

        return graph_fn

    def _call_graph_fn(self, inputs):
        """Calls graph function.

        First creates one or two graphs, i.e. the train and target graphs,
        then returns the optimal action given an exploration policy.

        If `dueling` is set to `True`,
        another layer is added that represents the state value.

        Args:
            inputs: `Tensor` or `dict` of tensors
        """
        graph_fn = self._build_graph_fn()

        if self.use_target_graph:
            # We create 2 graphs: a training graph and a target graph,
            # so that we can copy one graph to another given a frequency.
            self._train_graph = FunctionModule(mode=self.mode,
                                               build_fn=graph_fn,
                                               name='train')
            self._train_results = self._train_graph(inputs=inputs)
            self._target_graph = FunctionModule(mode=self.mode,
                                                build_fn=graph_fn,
                                                name='target')
            self._target_results = self._target_graph(inputs=inputs)
            return self._build_actions()
        else:
            self._train_results = graph_fn(mode=self.mode, inputs=inputs)
            self._target_results = self._train_results
            return self._build_actions()

    def _build_update_target_graph(self):
        """Creates a copy operation from train graph to target graph."""
        if self.use_target_graph:
            return self._target_graph.copy_from(self._train_graph)
        else:
            return None

    def _build_train_op(self, loss):
        """Creates the training operation,

        In case of use_target_network == True, we append also the update op
        while taking into account the update_frequency.
        """
        # TODO: override optimize loss to include episodes and timesteps numbers
        train_op = super(RLBaseModel, self)._build_train_op(loss)

        # check if we need to update the target graph
        if self.use_target_graph:
            update_op = tf.cond(
                tf.equal(
                    tf.mod(training.get_global_step(),
                           self.target_update_frequency),
                    0), self._build_update_target_graph,
                lambda: tf.no_op(name='no_op_copy_target'))
            # append the target update op to the train op.
            train_op = tf.group(*[train_op, update_op],
                                name='train_and_update_target')

        return train_op

    def _preprocess(self, features, labels):
        """Model specific preprocessing.

        Args:
            features: `array`, `Tensor` or `dict`. The environment states.
                If a `dict`, it must contain a `state` key.
            labels: `dict`. A dictionary containing `action`, `reward`, `done`.
        """
        if isinstance(features, Mapping) and 'state' not in features:
            raise KeyError("features must include a `state` key.")

        if not Modes.is_infer(self.mode) and ('action' not in labels
                or 'reward' not in labels or 'done' not in labels):
            raise KeyError(
                "labels must include these keys: `action`, `reward`, `done`.")
        return features, labels

    def _build_summary_op(self, results=None, features=None, labels=None):
        summary_op = super(RLBaseModel,
                           self)._build_summary_op(results, features, labels)
        for summary in self.summaries:
            if summary == summarizer.SummaryOptions.EXPLORATION:
                summary_op += summarizer.add_exploration_rate_summaries()
            if summary == summarizer.SummaryOptions.REWARD:
                max_reward = labels.get('max_reward', None)
                min_reward = labels.get('min_reward', None)
                avg_reward = labels.get('avg_reward', None)
                total_reward = labels.get('total_reward', None)
                summary_op += summarizer.add_reward_summaries(
                    max_reward, min_reward, avg_reward, total_reward)

        return summary_op
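
`_build_update_target_graph` and `_build_train_op` sync the target graph from the train graph every `target_update_frequency` steps. Below is a self-contained sketch of the underlying mechanics with two explicit variable scopes and grouped assign ops. What `FunctionModule.copy_from` actually builds is an assumption here, and for simplicity the periodic copy is driven from the session loop rather than attached to the train op with `tf.cond` as the code above does:

import tensorflow as tf  # TF1-style graph API

def build_q_net(scope_name, inputs):
    # A toy Q head; the architecture is an illustrative assumption.
    with tf.variable_scope(scope_name):
        return tf.layers.dense(inputs, units=2, name='q')

states = tf.placeholder(tf.float32, [None, 4], name='states')
train_q = build_q_net('train', states)
target_q = build_q_net('target', states)

train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='train')
target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target')

# Copy every train variable into the matching target variable.
sync_target_op = tf.group(*[t.assign(s) for t, s in zip(target_vars, train_vars)])

target_update_frequency = 5
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(20):
        # sess.run(train_op, feed_dict=...)  # the usual training step would go here
        if step % target_update_frequency == 0:
            sess.run(sync_target_op)  # periodic train -> target copy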