    def build_model(self):
        with tf.variable_scope(self._name_scope):
            self._build_ph()

            self._tensor = {}

            # Important parameters
            self._ppo_clip = self.params.ppo_clip
            self._kl_eta = self.params.kl_eta
            self._current_kl_lambda = 1
            self._current_lr = self.params.policy_lr
            self._timesteps_so_far = 0

            # construct the input to the forward network: we normalize the
            # state input (the action, where needed, is concatenated later)
            self._tensor['normalized_start_state'] = (
                self._input_ph['start_state'] -
                self._whitening_operator['state_mean']
            ) / self._whitening_operator['state_std']
            self._tensor['net_input'] = self._tensor['normalized_start_state']
            # the mlp for policy
            network_shape = [self._observation_size] + \
                self.params.policy_network_shape + [self._action_size]
            num_layer = len(network_shape) - 1
            act_type = \
                [self.params.policy_activation_type] * (num_layer - 1) + [None]
            norm_type = \
                [self.params.policy_normalizer_type] * (num_layer - 1) + [None]
            init_data = []
            for _ in range(num_layer):
                init_data.append(
                    {'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0},
                     'b_init_method': 'constant', 'b_init_para': {'val': 0.0}}
                )
            init_data[-1]['w_init_para']['stddev'] = 0.01  # the output layer std

            if self.init_weights is not None:
                self._MLP = network_util.MLP(
                    dims=network_shape, scope='policy_mlp', train=True,
                    activation_type=act_type, normalizer_type=norm_type,
                    init_data=init_data, linear_last_layer=True,
                    init_weights=self.init_weights['policy']
                )
            else:
                self._MLP = network_util.MLP(
                    dims=network_shape, scope='policy_mlp', train=True,
                    activation_type=act_type, normalizer_type=norm_type,
                    init_data=init_data, linear_last_layer=True
                )
            # the output policy of the network
            self._tensor['action_dist_mu'] = self._MLP(self._tensor['net_input'])
            # learnable log-std of the Gaussian policy, initialized to zeros
            self._tensor['action_logstd'] = tf.Variable(
                (0 * self._npr.randn(1, self._action_size)).astype(np.float32),
                name="action_logstd", trainable=True
            )

            self._tensor['action_dist_logstd'] = tf.tile(
                self._tensor['action_logstd'],
                [tf.shape(self._tensor['action_dist_mu'])[0], 1]
            )
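
            # Hedged sketch (not in the original snippet): 'action_dist_mu'
            # and the tiled 'action_dist_logstd' define a diagonal Gaussian
            # policy, so an action can be sampled by adding scaled standard
            # normal noise to the mean. The 'action_dist_std' and
            # 'sampled_action' names are assumptions for illustration;
            # TF 1.x graph mode is assumed, as in the rest of the snippet.
            self._tensor['action_dist_std'] = tf.exp(
                self._tensor['action_dist_logstd'])
            self._tensor['sampled_action'] = (
                self._tensor['action_dist_mu'] +
                self._tensor['action_dist_std'] * tf.random_normal(
                    tf.shape(self._tensor['action_dist_mu']))
            )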
Example #2
    def _build_softq_network_and_loss(self):
        # build the placeholder for training the value function
        self._input_ph['softq_target'] = \
            tf.placeholder(tf.float32, [None, 1], name='softq_target')

        # build the baseline-value function
        network_shape = [self._observation_size + self._action_size] + \
                        self.params.softq_network_shape + [1]
        num_layer = len(network_shape) - 1
        act_type = \
            [self.params.softq_activation_type] * (num_layer - 1) + [None]
        norm_type = \
            [self.params.softq_normalizer_type] * (num_layer - 1) + [None]
        init_data = []
        for _ in range(num_layer):
            init_data.append({
                'w_init_method': 'normc',
                'w_init_para': {
                    'stddev': 1.0
                },
                'b_init_method': 'constant',
                'b_init_para': {
                    'val': 0.0
                }
            })
        self._softq_MLP_one = network_util.MLP(dims=network_shape,
                                               scope='softq_mlp_1',
                                               train=True,
                                               activation_type=act_type,
                                               normalizer_type=norm_type,
                                               init_data=init_data,
                                               linear_last_layer=True)

        self._softq_MLP_two = network_util.MLP(dims=network_shape,
                                               scope='softq_mlp_2',
                                               train=True,
                                               activation_type=act_type,
                                               normalizer_type=norm_type,
                                               init_data=init_data,
                                               linear_last_layer=True)

        self._tensor['softq_1_weights'] = self._softq_MLP_one._w
        self._tensor['softq_1_b'] = self._softq_MLP_one._b

        self._tensor['softq_2_weights'] = self._softq_MLP_two._w
        self._tensor['softq_2_b'] = self._softq_MLP_two._b

        # collect the weight and bias variable lists of the two Q-networks
        # into a single list
        self._tensor['combined_softq_weights'] = self._tensor['softq_2_b'] + \
                self._tensor['softq_2_weights'] + \
                self._tensor['softq_1_b'] + self._tensor['softq_1_weights']

        # concatenate the network input (normalized state) with the action
        # along the feature axis to form the Q-function input
        self._tensor['q_input'] = tf.concat(
            [self._tensor['net_input'], self._input_ph['action']], axis=1)

        self._tensor['pred_softq_1'] = self._softq_MLP_one(
            self._tensor['q_input'])
        self._tensor['pred_softq_2'] = self._softq_MLP_two(
            self._tensor['q_input'])

        self._update_operator['softq_1_loss'] = .5 * tf.reduce_mean(
            tf.square(self._tensor['pred_softq_1'] -
                      self._input_ph['softq_target']))
        self._update_operator['softq_2_loss'] = .5 * tf.reduce_mean(
            tf.square(self._tensor['pred_softq_2'] -
                      self._input_ph['softq_target']))

        self._update_operator['softq_loss'] = self._update_operator['softq_1_loss'] + \
            self._update_operator['softq_2_loss']

        self._update_operator['softq_update_op'] = tf.train.AdamOptimizer(
            learning_rate=self.params.softq_lr,
            beta1=0.5,
            beta2=0.99,
            epsilon=1e-4).minimize(self._update_operator['softq_loss'])
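
    # Hedged usage sketch (not part of the original example): one way to run
    # a single soft-Q update step with the operators built above. The
    # '_session' attribute, the '_update_softq' name and the 'data_dict'
    # keys are assumptions for illustration; the placeholder names come from
    # the snippets above.
    def _update_softq(self, data_dict):
        feed_dict = {
            self._input_ph['start_state']: data_dict['start_state'],
            self._input_ph['action']: data_dict['action'],
            self._input_ph['softq_target']: data_dict['softq_target'],
        }
        softq_loss, _ = self._session.run(
            [self._update_operator['softq_loss'],
             self._update_operator['softq_update_op']],
            feed_dict=feed_dict
        )
        return softq_loss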
Example #3
    def _build_value_network_and_loss(self):
        # build the placeholder for training the value function
        self._input_ph['value_target'] = \
            tf.placeholder(tf.float32, [None, 1], name='value_target')

        self._input_ph['old_values'] = \
            tf.placeholder(tf.float32, [None, 1], name='old_value_est')

        # build the baseline-value function
        network_shape = [self._observation_size] + \
            self.params.value_network_shape + [1]
        num_layer = len(network_shape) - 1
        act_type = \
            [self.params.value_activation_type] * (num_layer - 1) + [None]
        norm_type = \
            [self.params.value_normalizer_type] * (num_layer - 1) + [None]
        init_data = []
        for _ in range(num_layer):
            init_data.append(
                {'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0},
                 'b_init_method': 'constant', 'b_init_para': {'val': 0.0}}
            )

        if self.params.use_subtask_value:
            self._value_MLP = network_util.SparseMultitaskMLP(
                dims=network_shape, scope='value_mlp', train=True,
                activation_type=act_type, normalizer_type=norm_type,
                init_data=init_data, linear_last_layer=True,
                num_tasks=self.params.num_subtasks
            )

            for i in range(self.params.num_subtasks):
                self._tensor['value_weights_{}'.format(TASKS(i))] = self._value_MLP._w[i]
                self._tensor['value_masks_{}'.format(TASKS(i))] = self._value_MLP._sparse_mask[i]

            self._tensor['value_b'] = self._value_MLP._b

            output = self._value_MLP(self._tensor['net_input'])

            self.value_optimizer = tf.train.AdamOptimizer(
                learning_rate=self.params.value_lr,
                beta1=0.5, beta2=0.99, epsilon=1e-4
            )

            for i in range(self.params.num_subtasks):
                self._tensor['pred_value_{}'.format(TASKS(i))] = output[i]

                self._update_operator['vf_loss_{}'.format(TASKS(i))] = .5 * tf.reduce_mean(
                    tf.square(
                        self._tensor['pred_value_{}'.format(TASKS(i))] - self._input_ph['value_target']
                    )
                )

                self._update_operator['sparse_value_correlation_loss_{}'.format(TASKS(i))] = \
                    tf_util.correlation_loss(
                        self._tensor['value_masks_{}'.format(TASKS(i))],
                        [self._tensor['value_masks_{}'.format(TASKS(j))]
                         for j in range(self.params.num_subtasks) if j != i])

                self._update_operator['sparse_value_mask_loss_{}'.format(TASKS(i))] = \
                    tf_util.l2_loss(self._tensor['value_masks_{}'.format(TASKS(i))],
                                    apply_sigmoid=True)

                self._update_operator['vf_loss_{}'.format(TASKS(i))] += \
                    self._update_operator['sparse_value_correlation_loss_{}'.format(TASKS(i))] * \
                    self.params.correlation_coefficient + \
                    self._update_operator['sparse_value_mask_loss_{}'.format(TASKS(i))] * \
                    self.params.mask_penalty

                self._update_operator['vf_update_op_{}'.format(TASKS(i))] = \
                    self.value_optimizer.minimize(self._update_operator['vf_loss_{}'.format(TASKS(i))])

                self._tensor['variable_list_{}'.format(TASKS(i))] = [
                    *self._tensor['policy_weights_{}'.format(TASKS(i))],
                    *self._tensor['policy_b'],
                    *self._tensor['value_weights_{}'.format(TASKS(i))],
                    *self._tensor['value_b'],
                    self._tensor['action_logstd']
                ]

        else:
            self._value_MLP = network_util.MLP(
                dims=network_shape, scope='value_mlp', train=True,
                activation_type=act_type, normalizer_type=norm_type,
                init_data=init_data, linear_last_layer=True,
            )

            self._tensor['value_weights'] = self._value_MLP._w
            self._tensor['value_b'] = self._value_MLP._b

            self._tensor['pred_value'] = self._value_MLP(self._tensor['net_input'])

            self.value_optimizer = tf.train.AdamOptimizer(
                learning_rate=self.params.value_lr,
                beta1=0.5, beta2=0.99, epsilon=1e-4
            )
            self._update_operator['vf_loss'] = .5 * tf.reduce_mean(
                tf.square(
                    self._tensor['pred_value'] - self._input_ph['value_target']
                )
            )

            self._update_operator['vf_update_op'] = \
                self.value_optimizer.minimize(self._update_operator['vf_loss'])

            for i in range(self.params.num_subtasks):
                self._tensor['variable_list_{}'.format(TASKS(i))] = [
                    *self._tensor['policy_weights_{}'.format(TASKS(i))],
                    *self._tensor['policy_b'],
                    *self._tensor['value_weights'],
                    *self._tensor['value_b'],
                    self._tensor['action_logstd']
                ]
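
    # Hedged sketch (an assumption, not the project's tf_util implementation):
    # one plausible form of the two mask penalties used above, assuming each
    # per-task sparse mask is a single tensor of logits. The correlation term
    # penalizes overlap between a task's (sigmoided) mask and the other
    # tasks' masks; the L2 term pushes the sigmoided mask toward zero.
    # '_example_mask_penalties' is an illustrative name only.
    def _example_mask_penalties(self, mask, other_masks):
        gate = tf.sigmoid(mask)
        # overlap between this task's gate and every other task's gate
        correlation = tf.add_n(
            [tf.reduce_mean(gate * tf.sigmoid(other)) for other in other_masks]
        )
        # L2 penalty on the sigmoided mask encourages sparsity
        sparsity = tf.reduce_mean(tf.square(gate))
        return correlation, sparsity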
    def _build_value_network_and_loss(self):
        # build the placeholder for training the value function
        self._input_ph['value_target'] = \
            tf.placeholder(tf.float32, [None, 1], name='value_target')

        self._input_ph['old_values'] = \
            tf.placeholder(tf.float32, [None, 1], name='old_value_est')

        # build the baseline-value function
        network_shape = [self._observation_size] + \
            self.params.value_network_shape + [1]
        num_layer = len(network_shape) - 1
        act_type = \
            [self.params.value_activation_type] * (num_layer - 1) + [None]
        norm_type = \
            [self.params.value_normalizer_type] * (num_layer - 1) + [None]
        init_data = []
        for _ in range(num_layer):
            init_data.append(
                {'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0},
                 'b_init_method': 'constant', 'b_init_para': {'val': 0.0}}
            )

        if self.init_weights is not None:
            self._value_MLP = network_util.MLP(
                dims=network_shape, scope='value_mlp', train=True,
                activation_type=act_type, normalizer_type=norm_type,
                init_data=init_data, linear_last_layer=True,
                init_weights=self.init_weights['value']
            )
        else:
            self._value_MLP = network_util.MLP(
                dims=network_shape, scope='value_mlp', train=True,
                activation_type=act_type, normalizer_type=norm_type,
                init_data=init_data, linear_last_layer=True
            )
        self._tensor['pred_value'] = self._value_MLP(self._tensor['net_input'])
        # build the loss for the value network
        # (disabled) PPO-style clipped value loss, kept for reference:
        # self._tensor['val_clipped'] = \
        #     self._input_ph['old_values'] + tf.clip_by_value(
        #         self._tensor['pred_value'] - self._input_ph['old_values'],
        #         -self._ppo_clip, self._ppo_clip)
        # self._tensor['val_loss_clipped'] = tf.square(
        #     self._tensor['val_clipped'] - self._input_ph['value_target'])
        # self._tensor['val_loss_unclipped'] = tf.square(
        #     self._tensor['pred_value'] - self._input_ph['value_target'])
        # self._update_operator['vf_loss'] = .5 * tf.reduce_mean(
        #     tf.maximum(self._tensor['val_loss_clipped'],
        #                self._tensor['val_loss_unclipped']))

        self._update_operator['vf_loss'] = .5 * tf.reduce_mean(
            tf.square(
                self._tensor['pred_value'] - self._input_ph['value_target']
            )
        )

        self._update_operator['vf_update_op'] = tf.train.AdamOptimizer(
            learning_rate=self.params.value_lr,
            beta1=0.5, beta2=0.99, epsilon=1e-4
        ).minimize(self._update_operator['vf_loss'])
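
    # Hedged sketch (an assumption, not shown in the original snippet): the
    # 'value_target' placeholder above is typically fed with discounted
    # returns (or GAE-based targets). '_compute_value_target' and 'gamma'
    # are illustrative names, not part of the original code; 'np' is the
    # numpy module already used in the snippets above.
    def _compute_value_target(self, rewards, gamma=0.99):
        # rewards: 1-D numpy array of per-step rewards for one episode
        returns = np.zeros_like(rewards, dtype=np.float32)
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            returns[t] = running
        # reshape to [T, 1] to match the 'value_target' placeholder
        return returns.reshape(-1, 1)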