def val(state_input):
    """V(s), used instead of a bias for the last mixing layer."""
    with tf.variable_scope("val_for_bias"):
        val0 = tf.layers.dense(inputs=state_input,
                               units=self.embed_dim,
                               activation=tf.nn.relu)
        val2 = tf.layers.dense(inputs=val0, units=1, activation=None)
        return val2
def build_actor_graph(self):
    """Build the explorer (actor) graph with only the ops needed for inference."""
    with self.graph.as_default():
        with tf.variable_scope("explore_agent"):
            self.agent_outs, self.hidden_outs = self.build_agent_net(
                inputs_obs=self.ph_obs,
                seq_max=1,  # seq_max=1 matters: the explorer only does single-step inference
                obs_lengths=[1 for _ in range(self.n_agents)],
                hidden_state_in=self.ph_hidden_states_in,
            )

        self._explore_paras = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="explore_agent")
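# Illustrative usage sketch (assumption, not part of the original class): how the
# explorer graph above is typically driven at rollout time.  The session object and
# feed shapes are assumptions; only ph_obs, ph_hidden_states_in, agent_outs and
# hidden_outs come from build_actor_graph().
def _infer_one_step_sketch(model, sess, obs, hidden_state):
    """Run a single inference step and carry the recurrent state forward."""
    q_values, next_hidden = sess.run(
        [model.agent_outs, model.hidden_outs],
        feed_dict={
            model.ph_obs: obs,                        # one timestep of per-agent observations
            model.ph_hidden_states_in: hidden_state,  # hidden state from the previous step
        },
    )
    # e.g. pick greedy (or epsilon-greedy) actions per agent from q_values,
    # then pass next_hidden back in on the following call.
    return q_values, next_hidden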
def hyper_w1(hyper_w1_input):
    """Create hyper_w1.

    Input shape: (None, state_dim).
    """
    with tf.variable_scope("hyper_w1"):
        hw0 = tf.layers.dense(inputs=hyper_w1_input,
                              units=hypernet_embed,
                              activation=tf.nn.relu)
        hw1 = tf.layers.dense(inputs=hw0,
                              units=self.embed_dim * self.n_agents,
                              activation=None)
        return hw1
def hyper_w_final(hyper_w_final_input):
    """Create hyper_w_final.

    Input shape: (None, state_dim).
    """
    with tf.variable_scope("hyper_w_final"):
        hw_f0 = tf.layers.dense(
            inputs=hyper_w_final_input,
            units=hypernet_embed,
            activation=tf.nn.relu,
        )
        hw_f1 = tf.layers.dense(inputs=hw_f0,
                                units=self.embed_dim,
                                activation=None)
        return hw_f1
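# Illustrative sketch (assumption, written as if in the same enclosing scope as the
# helpers above): in a QMIX-style mixer, the flat outputs of hyper_w1 and
# hyper_w_final are reshaped into per-state weight matrices and passed through
# tf.abs, so every mixing weight is non-negative and Q_tot stays monotonic in each
# agent's Q-value.  The exact reshaping in _build_mix_net2 may differ.
def _mixing_weights_sketch(state, n_agents, embed_dim):
    w1 = tf.abs(hyper_w1(state))                       # (batch, n_agents * embed_dim)
    w1 = tf.reshape(w1, [-1, n_agents, embed_dim])     # first-layer mixing weights
    w_final = tf.abs(hyper_w_final(state))             # (batch, embed_dim)
    w_final = tf.reshape(w_final, [-1, embed_dim, 1])  # output-layer mixing weights
    return w1, w_final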
def build_train_graph(self):
    """Build the train graph.

    Because seq_max differs between inference (1) and training (the episode
    limit), the train graph cannot share ops with the actor graph directly.
    Hence we build an explore sub-graph and a train sub-graph, and keep them
    in sync with tf.assign ops between the two variable collections.

    :return:
    """
    with self.graph.as_default():
        with tf.variable_scope("eval_agent"):
            trajectory_agent_outs, _ = self.build_agent_net(
                inputs_obs=self.ph_train_obs,
                seq_max=self.fix_seq_length + 1,  # important: unroll the full trajectory
                obs_lengths=self.ph_train_obs_len,
                hidden_state_in=None,  # whole trajectory, no need to carry hidden state
            )

        with tf.variable_scope("target_agent"):
            tar_agent_outs_tmp, _ = self.build_agent_net(
                inputs_obs=self.ph_train_obs,
                seq_max=self.fix_seq_length + 1,  # fixed value, different from the explorer's seq_max=1
                obs_lengths=self.ph_train_obs_len,
                hidden_state_in=None,
            )
            target_trajectory_agent_outs = tf.stop_gradient(tar_agent_outs_tmp)

        _eval_agent_paras = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="eval_agent")
        _target_agent_paras = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_agent")

        with tf.variable_scope("soft_replacement"):
            self.agent_train_replace_op = [
                tf.assign(t, e)
                for t, e in zip(_target_agent_paras, _eval_agent_paras)]
            self.agent_explore_replace_op = [
                tf.assign(t, e)
                for t, e in zip(self._explore_paras, _eval_agent_paras)]

        self._print_trainable_var_name(
            _eval_agent_paras=_eval_agent_paras,
            _target_agent_paras=_target_agent_paras,
            _explore_paras=self._explore_paras,
        )

        # agent outputs -> chosen/max Q-values
        # Calculate estimated Q-values ----------------
        mac_out = tf.reshape(
            trajectory_agent_outs,
            [self.batch_size, self.fix_seq_length + 1, self.n_agents, -1],
        )
        logging.debug("mac_out: {}".format(mac_out))
        chosen_action_qvals = self.gather_custom(mac_out[:, :-1],
                                                 self.ph_actions)

        # Calculate the Q-values necessary for the target -----------
        target_mac_out = tf.reshape(
            target_trajectory_agent_outs,
            [self.batch_size, self.fix_seq_length + 1, self.n_agents, -1],
        )
        target_mac_out = target_mac_out[:, 1:]

        # Mask out unavailable actions
        # (equivalent to: target_mac_out[avail_actions[:, 1:] == 0] = -9999999)
        indices = tf.equal(self.ph_avail_action[:, 1:], 0)
        mask_val = tf.tile(
            [[[[-999999.0]]]],
            [
                self.batch_size,
                self.fix_seq_length,
                self.n_agents,
                self.avail_action_num,
            ],
        )
        logging.debug("indices:{}, mask_val:{}, target mac out:{}".format(
            indices, mask_val, target_mac_out))
        target_mac_out = tf.where(indices, mask_val, target_mac_out)

        if self.use_double_q:
            # Get the actions that maximise the live Q (double Q-learning)
            mac_out_detach = tf.stop_gradient(tf.identity(mac_out[:, 1:]))
            mac_out_detach = tf.where(indices, mask_val, mac_out_detach)
            cur_max_actions = tf.expand_dims(
                tf.argmax(mac_out_detach, axis=-1), -1)
            target_max_qvals = self.gather_custom(target_mac_out,
                                                  cur_max_actions)
        else:
            target_max_qvals = tf.reduce_max(target_mac_out, axis=[-1])

        # eval mixer ---------------
        with tf.variable_scope("eval_mixer"):
            self.q_tot = self._build_mix_net2(chosen_action_qvals,
                                              self.ph_train_states)

        with tf.variable_scope("target_mixer"):
            q_tot_tmp = self._build_mix_net2(target_max_qvals,
                                             self.ph_train_target_states)
            self.target_q_tot = tf.stop_gradient(q_tot_tmp)

        _eval_mix_paras = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="eval_mixer")
        _target_mix_paras = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_mixer")

        with tf.variable_scope("soft_replacement"):
            self.mix_train_replace_op = [
                tf.assign(t, e)
                for t, e in zip(_target_mix_paras, _eval_mix_paras)]
        self._print_trainable_var_name(_eval_mix_paras=_eval_mix_paras,
                                       _target_mix_paras=_target_mix_paras)

        # Calculate 1-step Q-learning targets
        targets = (self.ph_rewards +
                   self.gamma * (1.0 - self.ph_terminated) * self.target_q_tot)

        # TD-error
        td_error = self.q_tot - tf.stop_gradient(targets)

        # mask = mask.expand_as(td_error)  # FIXME: assumed to already have the same shape
        # Zero out the targets that came from padded data
        masked_td_error = tf.multiply(td_error, self.ph_mask)

        self.loss = tf.reduce_sum(masked_td_error**2) / tf.reduce_sum(self.ph_mask)

        # Optimise
        optimizer = tf.train.RMSPropOptimizer(
            self.lr, decay=0.95, epsilon=1.5e-7, centered=True)
        grads_and_vars = optimizer.compute_gradients(self.loss)
        capped_gvs = [(
            grad if grad is None else tf.clip_by_norm(
                grad, clip_norm=self.grad_norm_clip),
            var,
        ) for grad, var in grads_and_vars]
        self.grad_update = optimizer.apply_gradients(capped_gvs)
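# Hypothetical helper sketch: gather_custom() is used in the train graph above but
# not shown here.  A common way to pick the Q-value of each chosen action along the
# last axis is a one-hot mask plus reduce_sum; the real implementation may differ.
def _gather_custom_sketch(q_values, actions, n_actions):
    """q_values: (..., n_actions); actions: integer indices of shape (...,) or (..., 1)."""
    actions = tf.reshape(actions, tf.shape(q_values)[:-1])  # drop a trailing 1-dim if present
    one_hot = tf.one_hot(tf.cast(actions, tf.int32),
                         depth=n_actions,
                         dtype=q_values.dtype)
    return tf.reduce_sum(q_values * one_hot, axis=-1)       # chosen-action Q-values

# Usage sketch (assumption): the tf.assign collections built above are typically run
# periodically, e.g.
#     sess.run(model.agent_train_replace_op + model.mix_train_replace_op)  # refresh targets
#     sess.run(model.agent_explore_replace_op)                             # sync the explorer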
def hyper_b1(state_input):
    """State-dependent bias for the hidden layer."""
    with tf.variable_scope("hyper_b1"):
        return tf.layers.dense(inputs=state_input,
                               units=self.embed_dim,
                               activation=None)
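# End-to-end sketch (assumption, written as if in the same enclosing scope as the
# helpers above) of how val, hyper_w1, hyper_w_final and hyper_b1 are typically
# composed into the monotonic mixing network.  It reuses _mixing_weights_sketch from
# the sketch above; the real _build_mix_net2 may differ in shapes and activations.
def _mix_forward_sketch(agent_qs, state, n_agents, embed_dim):
    """agent_qs: (batch, n_agents) chosen-action Q-values; state: (batch, state_dim)."""
    agent_qs = tf.reshape(agent_qs, [-1, 1, n_agents])
    w1, w_final = _mixing_weights_sketch(state, n_agents, embed_dim)  # abs + reshape
    b1 = tf.reshape(hyper_b1(state), [-1, 1, embed_dim])              # state-dependent bias
    hidden = tf.nn.elu(tf.matmul(agent_qs, w1) + b1)                  # (batch, 1, embed_dim)
    v = tf.reshape(val(state), [-1, 1, 1])                            # V(s) as the output bias
    q_tot = tf.matmul(hidden, w_final) + v                            # (batch, 1, 1)
    return tf.reshape(q_tot, [-1, 1])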
def create_model(self, model_info):
    self.ph_state = tf.placeholder(self.input_dtype,
                                   shape=(None, *self.state_dim),
                                   name="state_input")

    with tf.variable_scope("explore_agent"):
        state_input = Lambda(self._transform)(self.ph_state)
        last_layer = state_input

        for (out_size, kernel, stride) in self.filter_arch[:-1]:
            last_layer = Conv2D(
                out_size,
                (kernel, kernel),
                strides=(stride, stride),
                activation="relu",
                padding="same",
            )(last_layer)

        # last convolution
        (out_size, kernel, stride) = self.filter_arch[-1]
        convolution_layer = Conv2D(
            out_size,
            (kernel, kernel),
            strides=(stride, stride),
            activation="relu",
            padding="valid",
        )(last_layer)

        self.pi_logic_outs = tf.squeeze(
            Conv2D(self.action_dim, (1, 1), padding="same")(convolution_layer),
            axis=[1, 2],
        )

        baseline_flat = Flatten()(convolution_layer)
        self.baseline = tf.squeeze(
            tf.layers.dense(
                inputs=baseline_flat,
                units=1,
                activation=None,
                kernel_initializer=custom_norm_initializer(0.01),
            ),
            1,
        )
        self.out_actions = tf.squeeze(
            tf.multinomial(self.pi_logic_outs,
                           num_samples=1,
                           output_dtype=tf.int32),
            1,
            name="out_action",
        )

    # create learner
    self.ph_bp_logic_outs = tf.placeholder(self.dtype,
                                           shape=(None, self.action_dim),
                                           name="ph_b_logits")
    self.ph_actions = tf.placeholder(tf.int32, shape=(None,), name="ph_action")
    self.ph_dones = tf.placeholder(tf.bool, shape=(None,), name="ph_dones")
    self.ph_rewards = tf.placeholder(self.dtype, shape=(None,), name="ph_rewards")

    # Split the tensor into batches at known episode cut boundaries.
    # [batch_count * batch_step] -> [batch_step, batch_count]
    batch_step = self.sample_batch_steps

    def split_batches(tensor, drop_last=False):
        batch_count = tf.shape(tensor)[0] // batch_step
        reshape_tensor = tf.reshape(
            tensor,
            tf.concat([[batch_count, batch_step], tf.shape(tensor)[1:]], axis=0),
        )

        # swap B and T axes
        res = tf.transpose(
            reshape_tensor,
            [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))),
        )

        if drop_last:
            return res[:-1]
        return res

    self.loss = vtrace_loss(
        bp_logic_outs=split_batches(self.ph_bp_logic_outs, drop_last=True),
        tp_logic_outs=split_batches(self.pi_logic_outs, drop_last=True),
        actions=split_batches(self.ph_actions, drop_last=True),
        discounts=split_batches(tf.cast(~self.ph_dones, tf.float32) * GAMMA,
                                drop_last=True),
        rewards=split_batches(tf.clip_by_value(self.ph_rewards, -1, 1),
                              drop_last=True),
        values=split_batches(self.baseline, drop_last=True),
        bootstrap_value=split_batches(self.baseline)[-1],
    )

    global_step = tf.Variable(0, trainable=False, dtype=tf.int32)
    if self.opt_type == "adam":
        if self.lr_schedule:
            learning_rate = self._get_lr(global_step)
        else:
            learning_rate = LR
        optimizer = AdamOptimizer(learning_rate)
    elif self.opt_type == "rmsprop":
        optimizer = tf.train.RMSPropOptimizer(LR,
                                              decay=0.99,
                                              epsilon=0.1,
                                              centered=True)
    else:
        raise KeyError("invalid opt_type: {}".format(self.opt_type))

    grads_and_vars = optimizer.compute_gradients(self.loss)

    # clip by global norm
    grads, var = zip(*grads_and_vars)
    grads, _ = tf.clip_by_global_norm(grads, self.grad_norm_clip)
    clipped_gvs = list(zip(grads, var))
    self.train_op = optimizer.apply_gradients(clipped_gvs,
                                              global_step=global_step)

    # FIXME: expose the learning rate used during training
    self.lr = optimizer._lr

    self.actor_var = TFVariables(self.out_actions, self.sess)

    self.sess.run(global_variables_initializer())

    self.explore_paras = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope="explore_agent")

    self.saver = Saver({t.name: t for t in self.explore_paras},
                       max_to_keep=self.max_to_keep)

    return True
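# Illustrative note (assumption, numbers are made up): split_batches() above turns a
# flat rollout of batch_count episodes, each sample_batch_steps long, into a
# time-major [T, B, ...] layout, which is what vtrace_loss consumes here.  The final
# timestep is kept only as the bootstrap value (hence drop_last=True for the other
# inputs).  A toy version of the same reshaping:
#
#     import numpy as np
#     batch_step = 3                                # T (sample_batch_steps)
#     flat = np.arange(6)                           # 2 episodes of 3 steps: [0..5]
#     time_major = flat.reshape(2, batch_step).T    # shape (3, 2): rows are timesteps
#     # time_major[:-1] -> v-trace inputs, time_major[-1] -> bootstrap row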