def build_model(self):
    with tf.variable_scope(self._name_scope):
        self._build_ph()
        self._tensor = {}

        # important parameters
        self._ppo_clip = self.params.ppo_clip
        self._kl_eta = self.params.kl_eta
        self._current_kl_lambda = 1
        self._current_lr = self.params.policy_lr
        self._timesteps_so_far = 0

        # construct the input to the forward network: we normalize the state
        # input (the action is concatenated later, in the soft-Q network)
        self._tensor['normalized_start_state'] = (
            self._input_ph['start_state'] -
            self._whitening_operator['state_mean']
        ) / self._whitening_operator['state_std']

        self._tensor['net_input'] = self._tensor['normalized_start_state']

        # the MLP for the policy
        network_shape = [self._observation_size] + \
            self.params.policy_network_shape + [self._action_size]
        num_layer = len(network_shape) - 1
        act_type = \
            [self.params.policy_activation_type] * (num_layer - 1) + [None]
        norm_type = \
            [self.params.policy_normalizer_type] * (num_layer - 1) + [None]
        init_data = []
        for _ in range(num_layer):
            init_data.append(
                {'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0},
                 'b_init_method': 'constant', 'b_init_para': {'val': 0.0}}
            )
        init_data[-1]['w_init_para']['stddev'] = 0.01  # smaller std for the output layer

        if self.init_weights is not None:
            self._MLP = network_util.MLP(
                dims=network_shape, scope='policy_mlp', train=True,
                activation_type=act_type, normalizer_type=norm_type,
                init_data=init_data, linear_last_layer=True,
                init_weights=self.init_weights['policy']
            )
        else:
            self._MLP = network_util.MLP(
                dims=network_shape, scope='policy_mlp', train=True,
                activation_type=act_type, normalizer_type=norm_type,
                init_data=init_data, linear_last_layer=True
            )

        # the output policy of the network: a state-dependent mean and a
        # state-independent (trainable) log std, tiled to the batch size
        self._tensor['action_dist_mu'] = self._MLP(self._tensor['net_input'])
        self._tensor['action_logstd'] = tf.Variable(
            (0 * self._npr.randn(1, self._action_size)).astype(np.float32),
            name="action_logstd", trainable=True
        )
        self._tensor['action_dist_logstd'] = tf.tile(
            self._tensor['action_logstd'],
            [tf.shape(self._tensor['action_dist_mu'])[0], 1]
        )
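# --- Illustrative sketch (not part of this repo) ---
# A minimal, standalone example of how a diagonal Gaussian policy head such
# as `action_dist_mu` / `action_dist_logstd` above is typically turned into
# log-probabilities and reparameterized samples in TF1. The `action_size`
# value and placeholder names here are assumptions for the sketch only.
import numpy as np
import tensorflow as tf  # assumes TensorFlow 1.x

action_size = 6  # hypothetical action dimension
mu = tf.placeholder(tf.float32, [None, action_size], name='mu')
logstd = tf.placeholder(tf.float32, [None, action_size], name='logstd')
action = tf.placeholder(tf.float32, [None, action_size], name='action')

# log N(action | mu, exp(logstd)^2), summed over action dimensions
log_prob = -0.5 * tf.reduce_sum(
    tf.square((action - mu) / tf.exp(logstd)), axis=1
) - tf.reduce_sum(logstd, axis=1) \
  - 0.5 * action_size * np.log(2.0 * np.pi)

# reparameterized sample: a = mu + std * eps, with eps ~ N(0, I)
sample = mu + tf.exp(logstd) * tf.random_normal(tf.shape(mu))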
def _build_softq_network_and_loss(self):
    # build the placeholder for training the soft Q-function
    self._input_ph['softq_target'] = \
        tf.placeholder(tf.float32, [None, 1], name='softq_target')

    # build the twin soft Q-functions (state-action value)
    network_shape = [self._observation_size + self._action_size] + \
        self.params.softq_network_shape + [1]
    num_layer = len(network_shape) - 1
    act_type = \
        [self.params.softq_activation_type] * (num_layer - 1) + [None]
    norm_type = \
        [self.params.softq_normalizer_type] * (num_layer - 1) + [None]
    init_data = []
    for _ in range(num_layer):
        init_data.append(
            {'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0},
             'b_init_method': 'constant', 'b_init_para': {'val': 0.0}}
        )

    self._softq_MLP_one = network_util.MLP(
        dims=network_shape, scope='softq_mlp_1', train=True,
        activation_type=act_type, normalizer_type=norm_type,
        init_data=init_data, linear_last_layer=True
    )
    self._softq_MLP_two = network_util.MLP(
        dims=network_shape, scope='softq_mlp_2', train=True,
        activation_type=act_type, normalizer_type=norm_type,
        init_data=init_data, linear_last_layer=True
    )

    self._tensor['softq_1_weights'] = self._softq_MLP_one._w
    self._tensor['softq_1_b'] = self._softq_MLP_one._b
    self._tensor['softq_2_weights'] = self._softq_MLP_two._w
    self._tensor['softq_2_b'] = self._softq_MLP_two._b
    # list concatenation of all trainable soft-Q variables
    self._tensor['combined_softq_weights'] = \
        self._tensor['softq_2_b'] + self._tensor['softq_2_weights'] + \
        self._tensor['softq_1_b'] + self._tensor['softq_1_weights']

    # the Q-network input is the (normalized) state concatenated with the
    # action along the feature axis (axis=1), matching the input dimension
    # observation_size + action_size declared above
    self._tensor['q_input'] = tf.concat(
        [self._tensor['net_input'], self._input_ph['action']], axis=1
    )
    self._tensor['pred_softq_1'] = self._softq_MLP_one(
        self._tensor['q_input'])
    self._tensor['pred_softq_2'] = self._softq_MLP_two(
        self._tensor['q_input'])

    self._update_operator['softq_1_loss'] = .5 * tf.reduce_mean(
        tf.square(self._tensor['pred_softq_1'] -
                  self._input_ph['softq_target'])
    )
    self._update_operator['softq_2_loss'] = .5 * tf.reduce_mean(
        tf.square(self._tensor['pred_softq_2'] -
                  self._input_ph['softq_target'])
    )
    self._update_operator['softq_loss'] = \
        self._update_operator['softq_1_loss'] + \
        self._update_operator['softq_2_loss']
    self._update_operator['softq_update_op'] = tf.train.AdamOptimizer(
        learning_rate=self.params.softq_lr,
        beta1=0.5, beta2=0.99, epsilon=1e-4
    ).minimize(self._update_operator['softq_loss'])
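# --- Illustrative sketch (not part of this repo) ---
# A NumPy-level example of how the `softq_target` placeholder is often
# filled when training twin soft Q-functions (SAC-style):
# target = r + gamma * (min(Q1', Q2') - alpha * log pi(a'|s')).
# The helper name, gamma, and alpha below are assumptions for the sketch.
import numpy as np

def soft_q_target(rewards, dones, q1_next, q2_next, next_log_prob,
                  gamma=0.99, alpha=0.2):
    """All array inputs are [batch, 1]; gamma/alpha are assumed values."""
    min_q_next = np.minimum(q1_next, q2_next)          # clipped double-Q
    soft_value_next = min_q_next - alpha * next_log_prob  # entropy bonus
    return rewards + gamma * (1.0 - dones) * soft_value_next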
def _build_value_network_and_loss(self):
    # build the placeholders for training the value function
    self._input_ph['value_target'] = \
        tf.placeholder(tf.float32, [None, 1], name='value_target')
    self._input_ph['old_values'] = \
        tf.placeholder(tf.float32, [None, 1], name='old_value_est')

    # build the baseline value function
    network_shape = [self._observation_size] + \
        self.params.value_network_shape + [1]
    num_layer = len(network_shape) - 1
    act_type = \
        [self.params.value_activation_type] * (num_layer - 1) + [None]
    norm_type = \
        [self.params.value_normalizer_type] * (num_layer - 1) + [None]
    init_data = []
    for _ in range(num_layer):
        init_data.append(
            {'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0},
             'b_init_method': 'constant', 'b_init_para': {'val': 0.0}}
        )

    if self.params.use_subtask_value:
        # one sparsely-masked value head per subtask
        self._value_MLP = network_util.SparseMultitaskMLP(
            dims=network_shape, scope='value_mlp', train=True,
            activation_type=act_type, normalizer_type=norm_type,
            init_data=init_data, linear_last_layer=True,
            num_tasks=self.params.num_subtasks
        )
        for i in range(self.params.num_subtasks):
            self._tensor['value_weights_{}'.format(TASKS(i))] = \
                self._value_MLP._w[i]
            self._tensor['value_masks_{}'.format(TASKS(i))] = \
                self._value_MLP._sparse_mask[i]
        self._tensor['value_b'] = self._value_MLP._b

        output = self._value_MLP(self._tensor['net_input'])
        self.value_optimizer = tf.train.AdamOptimizer(
            learning_rate=self.params.value_lr,
            beta1=0.5, beta2=0.99, epsilon=1e-4
        )

        for i in range(self.params.num_subtasks):
            task = TASKS(i)
            self._tensor['pred_value_{}'.format(task)] = output[i]

            # squared-error loss against the value target
            self._update_operator['vf_loss_{}'.format(task)] = \
                .5 * tf.reduce_mean(
                    tf.square(
                        self._tensor['pred_value_{}'.format(task)] -
                        self._input_ph['value_target']
                    )
                )

            # decorrelate this task's mask from the other tasks' masks, and
            # penalize the mask magnitude to encourage sparsity
            self._update_operator[
                'sparse_value_correlation_loss_{}'.format(task)] = \
                tf_util.correlation_loss(
                    self._tensor['value_masks_{}'.format(task)],
                    [self._tensor['value_masks_{}'.format(TASKS(j))]
                     for j in range(self.params.num_subtasks) if j != i]
                )
            self._update_operator[
                'sparse_value_mask_loss_{}'.format(task)] = \
                tf_util.l2_loss(
                    self._tensor['value_masks_{}'.format(task)],
                    apply_sigmoid=True
                )
            self._update_operator['vf_loss_{}'.format(task)] += \
                self._update_operator[
                    'sparse_value_correlation_loss_{}'.format(task)] * \
                self.params.correlation_coefficient + \
                self._update_operator[
                    'sparse_value_mask_loss_{}'.format(task)] * \
                self.params.mask_penalty

            self._update_operator['vf_update_op_{}'.format(task)] = \
                self.value_optimizer.minimize(
                    self._update_operator['vf_loss_{}'.format(task)]
                )

            self._tensor['variable_list_{}'.format(task)] = [
                *self._tensor['policy_weights_{}'.format(task)],
                *self._tensor['policy_b'],
                *self._tensor['value_weights_{}'.format(task)],
                *self._tensor['value_b'],
                self._tensor['action_logstd']
            ]
    else:
        self._value_MLP = network_util.MLP(
            dims=network_shape, scope='value_mlp', train=True,
            activation_type=act_type, normalizer_type=norm_type,
            init_data=init_data, linear_last_layer=True,
        )
        self._tensor['value_weights'] = self._value_MLP._w
        self._tensor['value_b'] = self._value_MLP._b

        self._tensor['pred_value'] = \
            self._value_MLP(self._tensor['net_input'])
        self.value_optimizer = tf.train.AdamOptimizer(
            learning_rate=self.params.value_lr,
            beta1=0.5, beta2=0.99, epsilon=1e-4
        )
        self._update_operator['vf_loss'] = .5 * tf.reduce_mean(
            tf.square(
                self._tensor['pred_value'] - self._input_ph['value_target']
            )
        )
        self._update_operator['vf_update_op'] = \
            self.value_optimizer.minimize(self._update_operator['vf_loss'])

        # every subtask shares the same value network
        for i in range(self.params.num_subtasks):
            self._tensor['variable_list_{}'.format(TASKS(i))] = [
                *self._tensor['policy_weights_{}'.format(TASKS(i))],
                *self._tensor['policy_b'],
                *self._tensor['value_weights'],
                *self._tensor['value_b'],
                self._tensor['action_logstd']
            ]
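# --- Illustrative sketch (not part of this repo) ---
# One plausible form of the correlation-style penalty applied above between
# one task's sparse mask and the other tasks' masks; the actual
# tf_util.correlation_loss may be implemented differently, and the helper
# name below is hypothetical.
import tensorflow as tf  # assumes TensorFlow 1.x

def mask_overlap_penalty(own_mask, other_masks):
    """own_mask: mask tensor for one task; other_masks: list of mask tensors."""
    penalty = tf.constant(0.0)
    for other in other_masks:
        # penalize overlap between the sigmoid-activated masks, so that
        # different subtasks prefer disjoint sets of units
        penalty += tf.reduce_mean(tf.sigmoid(own_mask) * tf.sigmoid(other))
    return penalty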
def _build_value_network_and_loss(self):
    # build the placeholders for training the value function
    self._input_ph['value_target'] = \
        tf.placeholder(tf.float32, [None, 1], name='value_target')
    self._input_ph['old_values'] = \
        tf.placeholder(tf.float32, [None, 1], name='old_value_est')

    # build the baseline value function
    network_shape = [self._observation_size] + \
        self.params.value_network_shape + [1]
    num_layer = len(network_shape) - 1
    act_type = \
        [self.params.value_activation_type] * (num_layer - 1) + [None]
    norm_type = \
        [self.params.value_normalizer_type] * (num_layer - 1) + [None]
    init_data = []
    for _ in range(num_layer):
        init_data.append(
            {'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0},
             'b_init_method': 'constant', 'b_init_para': {'val': 0.0}}
        )

    if self.init_weights is not None:
        self._value_MLP = network_util.MLP(
            dims=network_shape, scope='value_mlp', train=True,
            activation_type=act_type, normalizer_type=norm_type,
            init_data=init_data, linear_last_layer=True,
            init_weights=self.init_weights['value']
        )
    else:
        self._value_MLP = network_util.MLP(
            dims=network_shape, scope='value_mlp', train=True,
            activation_type=act_type, normalizer_type=norm_type,
            init_data=init_data, linear_last_layer=True
        )

    self._tensor['pred_value'] = self._value_MLP(self._tensor['net_input'])

    # build the loss for the value network
    # (PPO-style clipped value loss, left disabled)
    # self._tensor['val_clipped'] = \
    #     self._input_ph['old_values'] + tf.clip_by_value(
    #         self._tensor['pred_value'] - self._input_ph['old_values'],
    #         -self._ppo_clip, self._ppo_clip)
    # self._tensor['val_loss_clipped'] = tf.square(
    #     self._tensor['val_clipped'] - self._input_ph['value_target'])
    # self._tensor['val_loss_unclipped'] = tf.square(
    #     self._tensor['pred_value'] - self._input_ph['value_target'])
    # self._update_operator['vf_loss'] = .5 * tf.reduce_mean(
    #     tf.maximum(self._tensor['val_loss_clipped'],
    #                self._tensor['val_loss_unclipped']))

    self._update_operator['vf_loss'] = .5 * tf.reduce_mean(
        tf.square(
            self._tensor['pred_value'] - self._input_ph['value_target']
        )
    )
    self._update_operator['vf_update_op'] = tf.train.AdamOptimizer(
        learning_rate=self.params.value_lr,
        beta1=0.5, beta2=0.99, epsilon=1e-4
    ).minimize(self._update_operator['vf_loss'])
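# --- Illustrative sketch (not part of this repo) ---
# A standalone version of the PPO-style clipped value loss that the
# commented-out block above corresponds to; the function name is
# hypothetical and the logic mirrors that block only.
import tensorflow as tf  # assumes TensorFlow 1.x

def clipped_value_loss(pred_value, old_values, value_target, clip_range):
    # keep the new prediction within `clip_range` of the old prediction
    clipped = old_values + tf.clip_by_value(
        pred_value - old_values, -clip_range, clip_range)
    loss_clipped = tf.square(clipped - value_target)
    loss_unclipped = tf.square(pred_value - value_target)
    # pessimistic (element-wise maximum) of the clipped and unclipped losses
    return 0.5 * tf.reduce_mean(tf.maximum(loss_clipped, loss_unclipped))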