def __init__(self, cfg, training=False):
    super(ActorCriticMLP, self).__init__()
    self.model_name = 'ActorCriticMLP'
    self.cfg = cfg
    self.training = training

    # network layers
    self.hidden1 = nn.Linear(96, 128)
    self.hidden2 = nn.Linear(128, 256)
    # self.hidden3 = nn.Linear(256, 256)

    # actor
    self.actor_mu = nn.Linear(256, self.cfg.NUM_ACTIONS)
    self.actor_sigma = nn.Linear(256, self.cfg.NUM_ACTIONS)

    # critic
    self.critic = nn.Linear(256, 1)

    # weight initialisation
    self.apply(ut.weight_init)
    self.actor_mu.weight.data = ut.normalized_columns_initializer(
        self.actor_mu.weight.data, 0.01)
    self.actor_mu.bias.data.fill_(0)
    self.actor_sigma.weight.data = ut.normalized_columns_initializer(
        self.actor_sigma.weight.data, 0.001)
    self.actor_sigma.bias.data.fill_(0)
    self.critic.weight.data = ut.normalized_columns_initializer(
        self.critic.weight.data, 1.0)
    self.critic.bias.data.fill_(0)
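
# A minimal forward-pass sketch for the MLP head above, assuming the usual
# actor-critic wiring (two hidden layers -> Gaussian policy head + value head).
# The ReLU activations and the softplus on sigma are assumptions, not taken
# from the original source.
import torch.nn.functional as F


def forward(self, x):
    # x: (batch, 96) observation vector
    x = F.relu(self.hidden1(x))
    x = F.relu(self.hidden2(x))
    mu = self.actor_mu(x)                    # mean of the action distribution
    sigma = F.softplus(self.actor_sigma(x))  # keep the std-dev positive
    value = self.critic(x)                   # state-value estimate
    return mu, sigma, value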
def __init__(self, num_inputs, action_space):
    super(ActorCritic, self).__init__()
    self.conv1 = nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1)
    self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
    self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
    self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)

    self.lstm = nn.LSTMCell(32 * 3 * 3, 256)

    num_outputs = action_space.n
    self.critic_linear = nn.Linear(256, 1)
    self.actor_linear = nn.Linear(256, num_outputs)

    self.apply(weights_init)
    self.actor_linear.weight.data = normalized_columns_initializer(
        self.actor_linear.weight.data, 0.01)
    self.actor_linear.bias.data.fill_(0)
    self.critic_linear.weight.data = normalized_columns_initializer(
        self.critic_linear.weight.data, 1.0)
    self.critic_linear.bias.data.fill_(0)

    self.lstm.bias_ih.data.fill_(0)
    self.lstm.bias_hh.data.fill_(0)

    self.train()
    if USE_CUDA:
        self.cuda()
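
# A sketch of how this conv + LSTMCell model is typically driven, assuming the
# common A3C convention of passing the recurrent state alongside the frame.
# The ELU activations and the (inputs, (hx, cx)) signature are assumptions.
import torch.nn.functional as F


def forward(self, inputs):
    x, (hx, cx) = inputs                 # x: (1, num_inputs, H, W)
    x = F.elu(self.conv1(x))
    x = F.elu(self.conv2(x))
    x = F.elu(self.conv3(x))
    x = F.elu(self.conv4(x))
    x = x.view(x.size(0), -1)            # flatten to (1, 32 * 3 * 3)
    hx, cx = self.lstm(x, (hx, cx))      # one recurrent step per frame
    return self.critic_linear(hx), self.actor_linear(hx), (hx, cx)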
def __init__(self, cfg, training=False, gpu_id=0):
    super(ActorCriticLSTM, self).__init__()
    self.model_name = 'ActorCriticLSTM'
    self.cfg = cfg
    self.training = training
    self.gpu_id = gpu_id

    self.lstm_layers = 1
    self.lstm_size = 512

    self.conv1 = nn.Conv2d(1, 32, 5, stride=1, padding=2)
    self.maxp1 = nn.MaxPool2d(2, 2)
    self.conv2 = nn.Conv2d(32, 32, 5, stride=1, padding=1)
    self.maxp2 = nn.MaxPool2d(2, 2)
    self.conv3 = nn.Conv2d(32, 64, 4, stride=1, padding=1)
    self.maxp3 = nn.MaxPool2d(2, 2)
    self.conv4 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
    self.maxp4 = nn.MaxPool2d(2, 2)

    # self.lstm = nn.LSTMCell(256, 256)
    self.lstm = nn.LSTM(1024, hidden_size=self.lstm_size,
                        num_layers=self.lstm_layers)

    # actor
    self.actor = nn.Linear(self.lstm_size, self.cfg.NUM_ACTIONS)
    # critic
    self.critic = nn.Linear(self.lstm_size, 1)

    # weight initialisation
    self.apply(ut.weight_init)
    relu_gain = nn.init.calculate_gain('relu')
    self.conv1.weight.data.mul_(relu_gain)
    self.conv2.weight.data.mul_(relu_gain)
    self.conv3.weight.data.mul_(relu_gain)
    self.conv4.weight.data.mul_(relu_gain)
    self.actor.weight.data = ut.normalized_columns_initializer(
        self.actor.weight.data, 0.01)
    self.actor.bias.data.fill_(0)
    self.critic.weight.data = ut.normalized_columns_initializer(
        self.critic.weight.data, 1.0)
    self.critic.bias.data.fill_(0)


# Disabled alternative __init__ (kept inside a string literal, never executed):
"""
def __init__(self, cfg, training=False):
    super(ActorCriticLSTM, self).__init__()
    self.model_name = 'ActorCriticLSTM'
    self.cfg = cfg
    self.training = training

    self.lstm_layers = 1
    self.lstm_size = 128

    # network layers
    self.hidden1 = nn.Linear(8, 128)
    # self.hidden3 = nn.Linear(256, 256)
    # self.lstm = nn.LSTMCell(256, 256)
    self.lstm = nn.LSTM(128, hidden_size=self.lstm_size,
                        num_layers=self.lstm_layers)

    # actor
    self.actor_mu = nn.Linear(128, self.cfg.NUM_ACTIONS)
    self.actor_sigma = nn.Linear(128, self.cfg.NUM_ACTIONS)
    # critic
    self.critic = nn.Linear(128, 1)

    # weight initialisation
    self.apply(ut.weight_init)
    self.actor_mu.weight.data = ut.normalized_columns_initializer(
        self.actor_mu.weight.data, 0.01)
    self.actor_mu.bias.data.fill_(0)
    self.actor_sigma.weight.data = ut.normalized_columns_initializer(
        self.actor_sigma.weight.data, 0.001)
    self.actor_sigma.bias.data.fill_(0)
    self.critic.weight.data = ut.normalized_columns_initializer(
        self.critic.weight.data, 1.0)
    self.critic.bias.data.fill_(0)
"""
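
# A hedged sketch of a single-step forward pass for the conv + nn.LSTM variant
# above. nn.LSTM (unlike nn.LSTMCell) expects (seq_len, batch, features) input
# and an (h_0, c_0) state tuple sized (num_layers, batch, hidden). The ReLU
# activations and the flatten to 1024 features are assumptions about the input size.
import torch.nn.functional as F


def forward(self, x, hidden):
    # x: (batch, 1, H, W) grayscale frame; hidden: (h_0, c_0)
    x = self.maxp1(F.relu(self.conv1(x)))
    x = self.maxp2(F.relu(self.conv2(x)))
    x = self.maxp3(F.relu(self.conv3(x)))
    x = self.maxp4(F.relu(self.conv4(x)))
    x = x.view(1, x.size(0), -1)           # (seq_len=1, batch, 1024)
    x, hidden = self.lstm(x, hidden)
    x = x.squeeze(0)                        # back to (batch, lstm_size)
    return self.actor(x), self.critic(x), hidden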
def __init__(self, num_inputs, num_outputs):
    super(ActorCriticLSTM, self).__init__()
    self.conv1 = nn.Conv2d(num_inputs, 32, 5, stride=1, padding=2)
    self.conv2 = nn.Conv2d(32, 32, 5, stride=1, padding=1)
    self.conv3 = nn.Conv2d(32, 64, 4, stride=1, padding=1)
    self.conv4 = nn.Conv2d(64, 64, 3, stride=1, padding=1)

    self.lstm = nn.LSTMCell(1024, 512)

    self.critic_linear = nn.Linear(512, 1)
    self.actor_linear = nn.Linear(512, num_outputs)

    self.apply(weights_init)
    self.actor_linear.weight.data = normalized_columns_initializer(
        self.actor_linear.weight.data, 0.01)
    self.actor_linear.bias.data.fill_(0)
    self.critic_linear.weight.data = normalized_columns_initializer(
        self.critic_linear.weight.data, 1.0)
    self.critic_linear.bias.data.fill_(0)

    self.lstm.bias_ih.data.fill_(0)
    self.lstm.bias_hh.data.fill_(0)

    self.reset()
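
# The PyTorch snippets above rely on helpers that are not shown here
# (weights_init / ut.weight_init and normalized_columns_initializer). Below is
# a sketch of the commonly used column-normalised initialiser, included for
# context; the actual project helpers may differ. The TensorFlow graphs later
# in this file use a TF initializer of the same name with a different signature.
import torch


def normalized_columns_initializer(weights, std=1.0):
    # Sample Gaussian weights, then rescale each output row so its L2 norm
    # equals `std` (small std for policy heads, 1.0 for value heads).
    out = torch.randn(weights.size())
    out *= std / torch.sqrt(out.pow(2).sum(1, keepdim=True))
    return out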
def __init__(self, scope, trainer, global_step=None):
    with tf.variable_scope(scope):
        if FLAGS.meta:
            self.prev_rewards = tf.placeholder(shape=[None], dtype=tf.float32,
                                               name="Prev_Rewards")
            self.reward_multiplier = tf.placeholder(shape=[None], dtype=tf.float32,
                                                    name="reward_multiplier")
            self.prev_rewards = tf.cast(
                tf.multiply(self.reward_multiplier, self.prev_rewards),
                dtype=tf.int32)
            # one_hot_indices = np.arange(0, 1, 0.1).tolist()[1:] + [1, 5]
            # one_hot_rewards = tf.one_hot(indices=one_hot_indices, depth=11,
            #                              on_value=1, off_value=0, axis=-1)
            # self.prev_rewards_onehot = self.prev_rewards[]
            self.prev_rewards_onehot = tf.one_hot(self.prev_rewards, 12,
                                                  dtype=tf.float32,
                                                  name="Prev_Rewards_OneHot")

        self.prev_actions = tf.placeholder(shape=[None], dtype=tf.int32,
                                           name="Prev_Actions")
        self.timestep = tf.placeholder(shape=[None, 1], dtype=tf.float32,
                                       name="timestep")
        self.prev_actions_onehot = tf.one_hot(self.prev_actions, FLAGS.nb_actions,
                                              dtype=tf.float32,
                                              name="Prev_Actions_OneHot")

        if FLAGS.meta:
            hidden = tf.concat([self.prev_rewards_onehot, self.prev_actions_onehot,
                                self.timestep], 1, name="Concatenated_input")
        else:
            hidden = tf.concat([self.prev_actions_onehot, self.timestep], 1,
                               name="Concatenated_input")

        lstm_cell = tf.contrib.rnn.BasicLSTMCell(48, state_is_tuple=True)
        c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
        h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
        self.state_init = [c_init, h_init]
        c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c], name="c_in")
        h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h], name="h_in")
        self.state_in = (c_in, h_in)
        rnn_in = tf.expand_dims(hidden, [0], name="RNN_input")
        step_size = tf.shape(self.timestep)[:1]
        state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)

        lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
            lstm_cell, rnn_in, initial_state=state_in,
            sequence_length=step_size, time_major=False)

        lstm_c, lstm_h = lstm_state
        self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
        rnn_out = tf.reshape(lstm_outputs, [-1, 48], name="RNN_out")

        fc_pol_w = tf.get_variable("FC_Pol_W", shape=[48, FLAGS.nb_actions],
                                   initializer=normalized_columns_initializer(0.01))
        self.policy = tf.nn.softmax(tf.matmul(rnn_out, fc_pol_w, name="Policy"),
                                    name="Policy_soft")

        fc_value_w = tf.get_variable("FC_Value_W", shape=[48, 1],
                                     initializer=normalized_columns_initializer(1.0))
        self.value = tf.matmul(rnn_out, fc_value_w, name="Value")

        if scope != 'global':
            self.actions = tf.placeholder(shape=[None], dtype=tf.int32,
                                          name="Actions")
            self.actions_onehot = tf.one_hot(self.actions, FLAGS.nb_actions,
                                             dtype=tf.float32,
                                             name="Actions_Onehot")
            self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
            self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)

            self.responsible_outputs = tf.reduce_sum(
                self.policy * self.actions_onehot, [1])

            # Loss functions
            self.value_loss = FLAGS.beta_v * tf.reduce_sum(
                tf.square(self.target_v - tf.reshape(self.value, [-1])))
            self.entropy = -tf.reduce_sum(self.policy * tf.log(self.policy + 1e-7))

            starter_beta_e = 1.0
            end_beta_e = 0.0
            decay_steps = FLAGS.max_nb_episodes_train
            self.beta_e = tf.train.polynomial_decay(starter_beta_e, global_step,
                                                    decay_steps, end_beta_e,
                                                    power=0.5)

            self.policy_loss = -tf.reduce_sum(
                tf.log(self.responsible_outputs + 1e-7) * self.advantages)

            self.loss = self.value_loss + self.policy_loss - self.entropy * self.beta_e

            local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
            self.gradients = tf.gradients(self.loss, local_vars)
            self.var_norms = tf.global_norm(local_vars)
            grads, self.grad_norms = tf.clip_by_global_norm(
                self.gradients, FLAGS.gradient_clip_value)

            for grad, weight in zip(grads, local_vars):
                tf.summary.histogram(weight.name + '_grad', grad)
                tf.summary.histogram(weight.name, weight)

            self.merged_summary = tf.summary.merge_all()

            global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            'global')
            self.apply_grads = trainer.apply_gradients(zip(grads, global_vars))
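
# A hedged sketch of how a worker might drive the graph above during one update:
# compute n-step returns over a rollout, then feed the loss placeholders and run
# apply_grads. The session, rollout tuple layout, and helper name are assumptions,
# not part of the original source; with FLAGS.meta enabled, prev_rewards (and
# reward_multiplier) would also have to be fed.
import numpy as np


def train_step(sess, net, rollout, bootstrap_value, gamma=0.99):
    # rollout: list of (action, prev_action, timestep, reward, value) tuples
    actions, prev_actions, timesteps, rewards, values = map(np.asarray, zip(*rollout))

    # n-step discounted returns, bootstrapped from the last value estimate
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = bootstrap_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    advantages = returns - values

    feed_dict = {
        net.prev_actions: prev_actions,
        net.timestep: timesteps.reshape(-1, 1),
        net.actions: actions,
        net.target_v: returns,
        net.advantages: advantages,
        net.state_in[0]: net.state_init[0],
        net.state_in[1]: net.state_init[1],
    }
    loss, _ = sess.run([net.loss, net.apply_grads], feed_dict=feed_dict)
    return loss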
def __init__(self, scope, trainer, global_step=None):
    with tf.variable_scope(scope):
        if FLAGS.meta:
            self.prev_rewards = tf.placeholder(shape=[None, 1], dtype=tf.float32,
                                               name="Prev_Rewards")

        self.prev_actions = tf.placeholder(shape=[None], dtype=tf.int32,
                                           name="Prev_Actions")
        self.timestep = tf.placeholder(shape=[None, 1], dtype=tf.float32,
                                       name="timestep")
        self.prev_actions_onehot = tf.one_hot(self.prev_actions, FLAGS.nb_actions,
                                              dtype=tf.float32,
                                              name="Prev_Actions_OneHot")

        if FLAGS.meta:
            hidden = tf.concat([self.prev_rewards, self.prev_actions_onehot,
                                self.timestep], 1, name="Concatenated_input")
        else:
            hidden = tf.concat([self.prev_actions_onehot, self.timestep], 1,
                               name="Concatenated_input")

        lstm_cell = tf.contrib.rnn.BasicLSTMCell(48, state_is_tuple=True)
        c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
        h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
        self.state_init = [c_init, h_init]
        c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c], name="c_in")
        h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h], name="h_in")
        self.state_in = (c_in, h_in)
        rnn_in = tf.expand_dims(hidden, [0], name="RNN_input")
        step_size = tf.shape(self.timestep)[:1]
        state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)

        lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
            lstm_cell, rnn_in, initial_state=state_in,
            sequence_length=step_size, time_major=False)

        lstm_c, lstm_h = lstm_state
        self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
        rnn_out = tf.reshape(lstm_outputs, [-1, 48], name="RNN_out")

        fc_pol_w = tf.get_variable("FC_Pol_W", shape=[48, FLAGS.nb_actions],
                                   initializer=normalized_columns_initializer(0.01))
        self.policy = tf.nn.softmax(tf.matmul(rnn_out, fc_pol_w, name="Policy"),
                                    name="Policy_soft")

        fc_value_w = tf.get_variable("FC_Value_W", shape=[48, 1],
                                     initializer=normalized_columns_initializer(1.0))
        self.value = tf.matmul(rnn_out, fc_value_w, name="Value")

        if scope != 'global':
            self.actions = tf.placeholder(shape=[None], dtype=tf.int32,
                                          name="Actions")
            self.actions_onehot = tf.one_hot(self.actions, FLAGS.nb_actions,
                                             dtype=tf.float32,
                                             name="Actions_Onehot")
            self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
            self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)

            self.responsible_outputs = tf.reduce_sum(
                self.policy * self.actions_onehot, [1])

            # Loss functions
            self.value_loss = FLAGS.beta_v * tf.reduce_sum(
                tf.square(self.target_v - tf.reshape(self.value, [-1])))
            self.entropy = -tf.reduce_sum(self.policy * tf.log(self.policy + 1e-7))

            starter_beta_e = 1.0
            end_beta_e = 0.0
            decay_steps = 20000
            self.beta_e = tf.train.polynomial_decay(starter_beta_e, global_step,
                                                    decay_steps, end_beta_e,
                                                    power=0.5)

            self.policy_loss = -tf.reduce_sum(
                tf.log(self.responsible_outputs + 1e-7) * self.advantages)

            self.loss = self.value_loss + self.policy_loss - self.entropy * self.beta_e

            local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
            self.gradients = tf.gradients(self.loss, local_vars)
            self.var_norms = tf.global_norm(local_vars)
            grads, self.grad_norms = tf.clip_by_global_norm(
                self.gradients, FLAGS.gradient_clip_value)

            for grad, weight in zip(grads, local_vars):
                tf.summary.histogram(weight.name + '_grad', grad)
                tf.summary.histogram(weight.name, weight)

            self.merged_summary = tf.summary.merge_all()

            global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            'global')
            self.apply_grads = trainer.apply_gradients(zip(grads, global_vars))
def __init__(self, scope, trainer, global_step=None):
    with tf.variable_scope(scope):
        self.prob_of_random_goal = tf.Variable(FLAGS.initial_random_goal_prob,
                                               trainable=False,
                                               name="prob_of_random_goal",
                                               dtype=tf.float32)
        self.inputs = tf.placeholder(shape=[None, FLAGS.resized_height,
                                            FLAGS.resized_width,
                                            FLAGS.agent_history_length],
                                     dtype=tf.float32, name="Inputs")
        self.prev_rewards = tf.placeholder(shape=[None], dtype=tf.float32,
                                           name="Prev_Rewards")
        self.prev_rewards_onehot = tf.one_hot(
            tf.cast(self.prev_rewards, dtype=tf.int32), 2, dtype=tf.float32,
            name="Prev_Rewards_OneHot")
        self.prev_rewards = tf.expand_dims(self.prev_rewards, 1, name="rewards")
        # self.prev_rewards_onehot = tf.expand_dims(self.prev_rewards, 0)
        self.prev_actions = tf.placeholder(shape=[None], dtype=tf.int32,
                                           name="Prev_Actions")
        self.prev_actions_onehot = tf.one_hot(self.prev_actions, FLAGS.nb_actions,
                                              dtype=tf.float32,
                                              name="Prev_Actions_OneHot")
        self.prev_goal = tf.placeholder(shape=[None, FLAGS.hidden_dim],
                                        dtype=tf.float32, name="Prev_Goals")

        self.image_summaries = []
        if FLAGS.game not in flags.SUPPORTED_ENVS:
            self.conv0 = tf.contrib.layers.conv2d(
                self.inputs, 16, 8, 4, activation_fn=tf.nn.elu, scope="conv0")
            with tf.variable_scope('conv0'):
                tf.get_variable_scope().reuse_variables()
                weights = tf.get_variable('weights')
                grid = self.put_kernels_on_grid(weights)
                self.image_summaries.append(
                    tf.summary.image('kernels', grid, max_outputs=1))
            self.conv = tf.contrib.layers.conv2d(
                self.conv0, 32, 4, 2, activation_fn=tf.nn.elu, scope="conv1")
        else:
            self.conv = tf.contrib.layers.conv2d(
                self.inputs, 32, 5, 2, activation_fn=tf.nn.elu, scope="conv1")
            with tf.variable_scope('conv1'):
                tf.get_variable_scope().reuse_variables()
                weights = tf.get_variable('weights')
                grid = self.put_kernels_on_grid(weights)
                self.image_summaries.append(
                    tf.summary.image('kernels', grid, max_outputs=1))

        with tf.variable_scope('inputs'):
            tf.get_variable_scope().reuse_variables()
            self.image_summaries.append(
                tf.summary.image('input', self.inputs, max_outputs=100))

        self.conv_flat = tf.contrib.layers.flatten(self.conv)
        self.fc = tf.contrib.layers.fully_connected(self.conv_flat,
                                                    FLAGS.hidden_dim)
        self.fc = tf.contrib.layers.layer_norm(self.fc)
        self.f_percept = tf.nn.elu(self.fc, name="Zt")

        if FLAGS.game not in flags.SUPPORTED_ENVS:
            self.f_percept = tf.concat([self.f_percept, self.prev_rewards], 1,
                                       name="Zt_r")
        else:
            self.f_percept = tf.concat([self.f_percept, self.prev_rewards_onehot],
                                       1, name="Zt_r")

        summary_f_percept_act = tf.contrib.layers.summarize_activation(
            self.f_percept)

        ####################################################################
        # Manager network
        if FLAGS.meta:
            self.f_Mspace = tf.concat([self.f_percept, self.prev_goal], 1,
                                      name="Zt_r")
        else:
            self.f_Mspace = tf.identity(self.f_percept, name="Zt_r")

        self.f_Mspace = tf.contrib.layers.fully_connected(self.f_Mspace,
                                                          FLAGS.hidden_dim)
        self.f_percept = tf.concat([self.f_percept, self.prev_actions_onehot], 1,
                                   name="Zt_r")
        self.f_Mspace = tf.contrib.layers.layer_norm(self.f_Mspace)
        self.f_Mspace = tf.nn.elu(self.f_Mspace, name="St")

        summary_f_Mspace_act = tf.contrib.layers.summarize_activation(
            self.f_Mspace)

        m_rnn_in = tf.expand_dims(self.f_Mspace, [0], name="Mrnn_in")
        step_size = tf.shape(self.inputs)[:1]
        m_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(FLAGS.hidden_dim)
        m_c_init = np.zeros((1, FLAGS.hidden_dim * FLAGS.manager_horizon),
                            np.float32)
        m_h_init = np.zeros((1, FLAGS.hidden_dim * FLAGS.manager_horizon),
                            np.float32)
        self.m_state_init = [m_c_init, m_h_init]
        m_c_in = tf.placeholder(tf.float32,
                                [1, FLAGS.hidden_dim * FLAGS.manager_horizon],
                                name="Mrnn_c_in")
        m_h_in = tf.placeholder(tf.float32,
                                [1, FLAGS.hidden_dim * FLAGS.manager_horizon],
                                name="Mrnn_h_in")
        self.m_state_in = (m_c_in, m_h_in)
        m_state_in = tf.contrib.rnn.LSTMStateTuple(m_c_in, m_h_in)

        m_lstm_outputs, m_lstm_state = self.fast_dlstm(
            m_rnn_in, m_state_in, m_lstm_cell, FLAGS.manager_horizon,
            FLAGS.hidden_dim * FLAGS.manager_horizon)

        m_lstm_c, m_lstm_h = m_lstm_state
        self.m_state_out = (m_lstm_c[-1, :1, :], m_lstm_h[-1, :1, :])
        # Reshape the manager LSTM output once; it feeds both the goal head
        # below and the manager value head (m_value) further down.
        m_rnn_out = tf.reshape(m_lstm_outputs, [-1, FLAGS.hidden_dim])
        self.goals = m_rnn_out
        self.normalized_goals = tf.contrib.layers.fully_connected(
            self.goals, FLAGS.hidden_dim, activation_fn=tf.tanh, name="Gt")

        summary_goals = tf.contrib.layers.summarize_activation(
            self.normalized_goals)

        def randomize_goals(t):
            t = tf.cast(t, tf.int32)
            packed_tensors = tf.stack([tf.random_normal([FLAGS.hidden_dim, ]),
                                       self.normalized_goals[t, :]])
            to_update = tf.cond(
                tf.less(self.prob_of_random_goal,
                        tf.constant(FLAGS.final_random_goal_prob,
                                    dtype=tf.float32)),
                lambda: tf.cast(
                    tf.multinomial(
                        tf.log([[self.prob_of_random_goal,
                                 tf.subtract(tf.constant(1.0),
                                             self.prob_of_random_goal)]]),
                        1)[0][0], tf.int32),
                lambda: tf.constant(1, tf.int32))
            resulted_tensor = tf.gather(packed_tensors, to_update)
            return resulted_tensor

        self.randomized_goals = tf.map_fn(lambda t: randomize_goals(t),
                                          tf.to_float(tf.range(0, step_size[0])),
                                          name="random_gt")

        summary_random_goals = tf.contrib.layers.summarize_activation(
            self.randomized_goals)

        self.decrease_prob_of_random_goal = tf.assign_sub(
            self.prob_of_random_goal,
            tf.constant((FLAGS.initial_random_goal_prob -
                         FLAGS.final_random_goal_prob) / FLAGS.explore_steps))

        m_fc_value_w = tf.get_variable(
            "M_Value_W", shape=[FLAGS.hidden_dim, 1],
            initializer=normalized_columns_initializer(1.0))
        self.m_value = tf.matmul(m_rnn_out, m_fc_value_w, name="M_Value")

        summary_m_value_act = tf.contrib.layers.summarize_activation(self.m_value)

        ####################################################################
        # Worker network
        self.sum_prev_goals = tf.placeholder(shape=[None, FLAGS.hidden_dim],
                                             dtype=tf.float32,
                                             name="Prev_c_Goals_sum")
        w_rnn_in = tf.expand_dims(self.f_percept, [0], name="Wrnn_in")
        step_size = tf.shape(self.inputs)[:1]
        w_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
            FLAGS.goal_embedding_size * FLAGS.nb_actions)
        w_c_init = np.zeros((1, w_lstm_cell.state_size.c), np.float32)
        w_h_init = np.zeros((1, w_lstm_cell.state_size.h), np.float32)
        self.w_state_init = [w_c_init, w_h_init]
        w_c_in = tf.placeholder(tf.float32, [1, w_lstm_cell.state_size.c],
                                name="Wrnn_c_in")
        w_h_in = tf.placeholder(tf.float32, [1, w_lstm_cell.state_size.h],
                                name="Wrnn_h_in")
        self.w_state_in = (w_c_in, w_h_in)
        w_state_in = tf.contrib.rnn.LSTMStateTuple(w_c_in, w_h_in)

        w_lstm_outputs, w_lstm_state = tf.nn.dynamic_rnn(
            w_lstm_cell, w_rnn_in, initial_state=w_state_in,
            sequence_length=step_size, time_major=False)

        w_lstm_c, w_lstm_h = w_lstm_state
        self.w_state_out = (w_lstm_c[:1, :], w_lstm_h[:1, :])
        Ut = tf.reshape(w_lstm_outputs,
                        [step_size[0], FLAGS.nb_actions,
                         FLAGS.goal_embedding_size], name="Ut")
        Ut_flat = tf.reshape(w_lstm_outputs,
                             [step_size[0],
                              FLAGS.nb_actions * FLAGS.goal_embedding_size],
                             name="Ut_flat")

        summary_wrnn_act = tf.contrib.layers.summarize_activation(Ut)

        goal_encoding = tf.contrib.layers.fully_connected(
            self.sum_prev_goals, FLAGS.goal_embedding_size,
            biases_initializer=None, scope="goal_emb")

        interm_rez = tf.squeeze(tf.matmul(Ut, tf.expand_dims(goal_encoding, 2)), 2)
        interm_rez = tf.contrib.layers.flatten(interm_rez)
        self.w_policy = tf.nn.softmax(interm_rez, name="W_Policy")

        summary_w_policy_act = tf.contrib.layers.summarize_activation(
            self.w_policy)

        w_fc_value_w = tf.get_variable(
            "W_Value_W",
            shape=[FLAGS.nb_actions * FLAGS.goal_embedding_size +
                   FLAGS.goal_embedding_size, 1],
            initializer=normalized_columns_initializer(1.0))
        self.w_value = tf.matmul(tf.concat([Ut_flat, goal_encoding], 1),
                                 w_fc_value_w, name="W_Value")

        summary_w_value_act = tf.contrib.layers.summarize_activation(self.w_value)

        if scope != 'global':
            self.w_extrinsic_return = tf.placeholder(shape=[None],
                                                     dtype=tf.float32)
            self.m_extrinsic_return = tf.placeholder(shape=[None],
                                                     dtype=tf.float32)
            self.w_intrinsic_return = tf.placeholder(shape=[None],
                                                     dtype=tf.float32)

            def gather_state_at_horiz(t):
                t = tf.cast(t, tf.int32)
                f_Mspace_c = tf.gather(
                    self.f_Mspace,
                    tf.minimum(t + tf.constant(FLAGS.manager_horizon,
                                               dtype=tf.int32),
                               step_size[0] - 1))
                return f_Mspace_c

            self.f_Mspace_c = tf.cast(
                tf.map_fn(lambda t: gather_state_at_horiz(t),
                          tf.to_float(tf.range(0, step_size[0])),
                          name="state_at_horiz"),
                dtype=tf.float32)
            self.state_diff = self.f_Mspace_c - self.f_Mspace
            self.cos_sim_state_diff = self.cosine_distance(
                tf.stop_gradient(self.state_diff), self.normalized_goals, dim=1)

            self.m_advantages = self.m_extrinsic_return - tf.stop_gradient(
                tf.reshape(self.m_value, [-1]))
            self.goals_loss = -tf.reduce_sum(self.m_advantages *
                                             self.cos_sim_state_diff)
            self.m_value_loss = FLAGS.m_beta_v * tf.reduce_sum(
                tf.square(self.m_extrinsic_return -
                          tf.reshape(self.m_value, [-1])))

            self.actions = tf.placeholder(shape=[None], dtype=tf.int32,
                                          name="Actions")
            self.actions_onehot = tf.one_hot(self.actions, FLAGS.nb_actions,
                                             dtype=tf.float32,
                                             name="Actions_Onehot")

            self.responsible_outputs = tf.reduce_sum(
                self.w_policy * self.actions_onehot, [1])

            self.intrinsic_return = FLAGS.alpha * self.w_intrinsic_return
            self.total_return = self.w_extrinsic_return + self.intrinsic_return
            self.w_advantages = self.total_return - tf.stop_gradient(
                tf.reshape(self.w_value, [-1]))

            # Loss functions
            self.w_value_loss = FLAGS.w_beta_v * tf.reduce_sum(
                tf.square(self.total_return - tf.reshape(self.w_value, [-1])))
            self.entropy = -tf.reduce_sum(self.w_policy *
                                          tf.log(self.w_policy + 1e-7))
            self.w_policy_loss = -tf.reduce_sum(
                tf.log(self.responsible_outputs + 1e-7) *
                self.w_advantages) - self.entropy * FLAGS.beta_e

            self.loss = (self.w_value_loss + self.w_policy_loss +
                         self.m_value_loss + self.goals_loss)

            local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
            self.gradients = tf.gradients(self.loss, local_vars)
            self.var_norms = tf.global_norm(local_vars)
            grads, self.grad_norms = tf.clip_by_global_norm(
                self.gradients, FLAGS.gradient_clip_value)

            self.worker_summaries = [summary_f_percept_act, summary_f_Mspace_act,
                                     summary_goals, summary_random_goals,
                                     summary_m_value_act, summary_wrnn_act,
                                     summary_w_policy_act, summary_w_value_act]
            for grad, weight in zip(grads, local_vars):
                self.worker_summaries.append(
                    tf.summary.histogram(weight.name + '_grad', grad))
                self.worker_summaries.append(
                    tf.summary.histogram(weight.name, weight))

            self.merged_summary = tf.summary.merge(self.worker_summaries)

            global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            'global')
            self.apply_grads = trainer.apply_gradients(zip(grads, global_vars))
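
# The worker branch above consumes `sum_prev_goals`, i.e. the manager goals
# pooled over the last `manager_horizon` steps before being projected by the
# "goal_emb" layer. A minimal NumPy sketch of that pooling, assuming the rollout
# loop keeps a `goal_history` list of 1-D goal vectors (the helper name and
# buffer are hypothetical, not from the original source).
import numpy as np


def pooled_goal(goal_history, manager_horizon, hidden_dim):
    # Sum the most recent `manager_horizon` goal vectors; fall back to zeros
    # at the start of an episode when no goals have been emitted yet.
    recent = goal_history[-manager_horizon:]
    if not recent:
        return np.zeros((1, hidden_dim), dtype=np.float32)
    return np.sum(np.asarray(recent, dtype=np.float32), axis=0, keepdims=True)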
def __init__(self, scope, trainer, global_step=None):
    with tf.variable_scope(scope):
        self.inputs = tf.placeholder(shape=[None, FLAGS.game_size,
                                            FLAGS.game_size, FLAGS.game_channels],
                                     dtype=tf.float32, name="Inputs")

        self.conv = tf.contrib.layers.conv2d(
            self.inputs, 32, 5, 2, activation_fn=tf.nn.elu, scope="conv1")

        self.image_summaries = []
        with tf.variable_scope('conv1'):
            tf.get_variable_scope().reuse_variables()
            weights = tf.get_variable('weights')
            grid = self.put_kernels_on_grid(weights)
            self.image_summaries.append(
                tf.summary.image('kernels', grid, max_outputs=1))

        with tf.variable_scope('inputs'):
            tf.get_variable_scope().reuse_variables()
            self.image_summaries.append(
                tf.summary.image('input', self.inputs, max_outputs=1))

        self.fc = tf.contrib.layers.fully_connected(
            tf.contrib.layers.flatten(self.conv), 64)
        # self.conv = tf.contrib.layers.layer_norm(self.conv)
        self.elu = tf.nn.elu(self.fc)

        summary_conv_act = tf.contrib.layers.summarize_activation(self.elu)

        if FLAGS.meta:
            self.timestep = tf.placeholder(shape=[None, 1], dtype=tf.float32,
                                           name="timestep")
            self.prev_rewards = tf.placeholder(shape=[None], dtype=tf.int32,
                                               name="Prev_Rewards")
            self.prev_rewards_onehot = tf.one_hot(self.prev_rewards, 2,
                                                  dtype=tf.float32,
                                                  name="Prev_Rewards_OneHot")
            self.prev_actions = tf.placeholder(shape=[None], dtype=tf.int32,
                                               name="Prev_Actions")
            self.prev_actions_onehot = tf.one_hot(self.prev_actions,
                                                  FLAGS.nb_actions,
                                                  dtype=tf.float32,
                                                  name="Prev_Actions_OneHot")

            if FLAGS.one_hot_reward:
                hidden = tf.concat([self.elu, self.prev_rewards_onehot,
                                    self.prev_actions_onehot], 1,
                                   name="Concatenated_input")
            else:
                hidden = tf.concat([self.elu, self.prev_rewards,
                                    self.prev_actions_onehot, self.timestep], 1,
                                   name="Concatenated_input")
        else:
            hidden = self.elu

        summary_hidden_act = tf.contrib.layers.summarize_activation(hidden)

        rnn_in = tf.expand_dims(hidden, [0], name="RNN_input")
        step_size = tf.shape(self.inputs)[:1]

        if FLAGS.fw:
            rnn_cell = LayerNormFastWeightsBasicRNNCell(48)
            # self.initial_state = rnn_cell.zero_state(tf.shape(self.inputs)[0], tf.float32)
            # self.initial_fast_weights = rnn_cell.zero_fast_weights(tf.shape(self.inputs)[0], tf.float32)
            h_init = np.zeros((1, 48), np.float32)
            fw_init = np.zeros((1, 48, 48), np.float32)
            self.state_init = [h_init, fw_init]
            h_in = tf.placeholder(tf.float32, [1, 48], name="hidden_state")
            fw_in = tf.placeholder(tf.float32, [1, 48, 48], name="fast_weights")
            self.state_in = (h_in, fw_in)

            rnn_outputs, rnn_state = tf.nn.dynamic_rnn(
                rnn_cell, rnn_in, initial_state=self.state_in,
                sequence_length=step_size, time_major=False)

            rnn_h, rnn_fw = rnn_state
            self.state_out = (rnn_h[:1, :], rnn_fw[:1, :])
            rnn_out = tf.reshape(rnn_outputs, [-1, 48], name="RNN_out")
        else:
            lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(48)
            c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
            h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
            self.state_init = [c_init, h_init]
            c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c],
                                  name="c_in")
            h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h],
                                  name="h_in")
            self.state_in = (c_in, h_in)
            state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)

            lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                lstm_cell, rnn_in, initial_state=state_in,
                sequence_length=step_size, time_major=False)

            lstm_c, lstm_h = lstm_state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            rnn_out = tf.reshape(lstm_outputs, [-1, 48], name="RNN_out")

        summary_rnn_act = tf.contrib.layers.summarize_activation(rnn_out)

        fc_pol_w = tf.get_variable("FC_Pol_W", shape=[48, FLAGS.nb_actions],
                                   initializer=normalized_columns_initializer(0.01))
        self.policy = tf.nn.softmax(tf.matmul(rnn_out, fc_pol_w, name="Policy"),
                                    name="Policy_soft")

        summary_policy_act = tf.contrib.layers.summarize_activation(self.policy)

        fc_value_w = tf.get_variable("FC_Value_W", shape=[48, 1],
                                     initializer=normalized_columns_initializer(1.0))
        self.value = tf.matmul(rnn_out, fc_value_w, name="Value")

        summary_value_act = tf.contrib.layers.summarize_activation(self.value)

        if scope != 'global':
            self.actions = tf.placeholder(shape=[None], dtype=tf.int32,
                                          name="Actions")
            self.actions_onehot = tf.one_hot(self.actions, FLAGS.nb_actions,
                                             dtype=tf.float32,
                                             name="Actions_Onehot")
            self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
            self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)

            self.responsible_outputs = tf.reduce_sum(
                self.policy * self.actions_onehot, [1])

            # Loss functions
            self.value_loss = FLAGS.beta_v * tf.reduce_sum(
                tf.square(self.target_v - tf.reshape(self.value, [-1])))
            self.entropy = -tf.reduce_sum(self.policy * tf.log(self.policy + 1e-7))
            # starter_beta_e = 1.0
            # end_beta_e = 0.0
            # decay_steps = 20000
            # self.beta_e = tf.train.polynomial_decay(starter_beta_e, global_step,
            #                                         decay_steps, end_beta_e,
            #                                         power=0.5)
            self.policy_loss = -tf.reduce_sum(
                tf.log(self.responsible_outputs + 1e-7) *
                self.advantages) - self.entropy * FLAGS.beta_e

            self.loss = self.value_loss + self.policy_loss

            local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
            self.gradients = tf.gradients(self.loss, local_vars)
            self.var_norms = tf.global_norm(local_vars)
            grads, self.grad_norms = tf.clip_by_global_norm(
                self.gradients, FLAGS.gradient_clip_value)

            self.worker_summaries = [summary_conv_act, summary_hidden_act,
                                     summary_rnn_act, summary_policy_act,
                                     summary_value_act]
            for grad, weight in zip(grads, local_vars):
                self.worker_summaries.append(
                    tf.summary.histogram(weight.name + '_grad', grad))
                self.worker_summaries.append(
                    tf.summary.histogram(weight.name, weight))

            self.merged_summary = tf.summary.merge(self.worker_summaries)

            global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            'global')
            self.apply_grads = trainer.apply_gradients(zip(grads, global_vars))
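
# The scope/'global' split above follows the usual A3C pattern: each worker
# computes gradients on its local variable copy and applies them to the global
# variables. A sketch of the companion "copy global -> local" op such workers
# typically build before each rollout (the helper name is an assumption, not
# taken from the original source).
import tensorflow as tf


def update_target_graph(from_scope, to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    # Pair variables in creation order and assign global weights into the local copy.
    return [to_var.assign(from_var)
            for from_var, to_var in zip(from_vars, to_vars)]


# Typical use inside a worker, before each rollout:
#   sess.run(update_target_graph('global', worker_scope))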