def build_graph(self, graph):
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        self.inputs = tf.placeholder(
            tf.float32,
            shape=[None, self.observation_space.shape[0] + 1],
            name='inputs')

        policy_scope = tf.VariableScope(reuse=False, name='Policy')
        with tf.variable_scope(policy_scope):
            self.probs, self.actions = capacities.policy(
                self.policy_params, self.inputs)
        self.action_t = tf.squeeze(self.actions, 1)[0]
        # self.action_t = tf.Print(self.action_t, data=[self.probs, self.action_t], message="self.probs, self.action_t:")

        v_scope = tf.VariableScope(reuse=False, name='VValues')
        with tf.variable_scope(v_scope):
            # Squeeze to [batch] so the TD error below broadcasts correctly
            # against the [batch]-shaped targets
            vs = tf.squeeze(capacities.value_f(self.v_params, self.inputs), 1)

        with tf.control_dependencies([self.probs, vs]):
            with tf.variable_scope('Training'):
                stacked_actions = tf.stack([
                    tf.range(0, tf.shape(self.actions)[0]),
                    tf.squeeze(self.actions, 1)
                ], 1)

                self.rewards = tf.placeholder(tf.float32, shape=[None], name="rewards")
                self.next_states = tf.placeholder(
                    tf.float32,
                    shape=[None, self.observation_space.shape[0] + 1],
                    name="next_states")
                self.next_actions = tf.placeholder(tf.int32, shape=[None], name="next_actions")

                with tf.variable_scope(v_scope, reuse=True):
                    next_vs = tf.squeeze(
                        capacities.value_f(self.v_params, self.next_states), 1)

                with tf.variable_scope('TargetVs'):
                    # Bootstrap unless the transition is terminal: the last
                    # input feature is the "done" flag and selects between
                    # the two stacked targets
                    target_vs1 = tf.stop_gradient(self.rewards + self.discount * next_vs)
                    target_vs2 = self.rewards
                    stacked_targets = tf.stack([target_vs1, target_vs2], 1)
                    select_targets = tf.stack([
                        tf.range(0, tf.shape(self.next_states)[0]),
                        tf.cast(self.next_states[:, -1], tf.int32)
                    ], 1)
                    target_vs = tf.gather_nd(stacked_targets, select_targets)

                log_probs = tf.log(tf.gather_nd(self.probs, stacked_actions))

                with tf.control_dependencies([log_probs, target_vs]):
                    # Critic: fit V(s) to the TD target
                    self.v_loss = 0.5 * tf.reduce_sum(tf.square(target_vs - vs))
                    v_adam = tf.train.AdamOptimizer(self.v_lr)
                    self.v_global_step = tf.Variable(
                        0, trainable=False, name="v_global_step")
                    self.v_train_op = v_adam.minimize(
                        self.v_loss, global_step=self.v_global_step)

                    # Actor: policy gradient weighted by the TD error
                    td = target_vs - vs
                    self.policy_loss = -tf.reduce_sum(
                        log_probs * tf.stop_gradient(td))
                    policy_adam = tf.train.AdamOptimizer(self.policy_lr)
                    self.policy_global_step = tf.Variable(
                        0, trainable=False, name="policy_global_step",
                        collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])
                    self.policy_train_op = policy_adam.minimize(
                        self.policy_loss, global_step=self.policy_global_step)

        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('score', self.score_plh)
        self.policy_loss_plh = tf.placeholder(tf.float32, shape=[])
        self.policy_loss_sum_t = tf.summary.scalar('policy_loss', self.policy_loss_plh)
        self.v_loss_plh = tf.placeholder(tf.float32, shape=[])
        self.v_loss_sum_t = tf.summary.scalar('v_loss', self.v_loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
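# The `capacities.policy` and `capacities.value_f` helpers are used by every
# agent here but their bodies are not shown. A minimal sketch of what they
# might look like, assuming single-hidden-layer heads; the 'nb_inputs' and
# 'nb_outputs' params keys appear in the recurrent agent below, while
# 'nb_units' is an assumption of this sketch:
import tensorflow as tf

def policy(policy_params, inputs):
    # Softmax policy head: per-action probabilities plus one sampled action per row
    hidden = tf.layers.dense(inputs, policy_params['nb_units'], activation=tf.nn.relu)
    logits = tf.layers.dense(hidden, policy_params['nb_outputs'])
    probs = tf.nn.softmax(logits)                                  # [batch, nb_actions]
    actions = tf.cast(tf.multinomial(tf.log(probs), 1), tf.int32)  # [batch, 1]
    return probs, actions

def value_f(v_params, inputs):
    # Value head: linear output layer, width 1 for V(s) or nb_actions for Q(s, .)
    hidden = tf.layers.dense(inputs, v_params['nb_units'], activation=tf.nn.relu)
    return tf.layers.dense(hidden, v_params['nb_outputs'])         # [batch, nb_outputs]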
def build_graph(self, graph):
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        self.inputs = tf.placeholder(
            tf.float32,
            shape=[None, self.observation_space.shape[0] + 1],
            name='inputs')

        policy_scope = tf.VariableScope(reuse=False, name='Policy')
        with tf.variable_scope(policy_scope):
            self.probs, self.actions = capacities.policy(
                self.policy_params, self.inputs)
        self.action_t = tf.squeeze(self.actions, 1)[0]

        q_scope = tf.VariableScope(reuse=False, name='QValues')
        with tf.variable_scope(q_scope):
            self.q_values = capacities.value_f(self.q_params, self.inputs)
            self.q = self.q_values[0, tf.stop_gradient(self.action_t)]

        with tf.variable_scope('Training'):
            stacked_actions = tf.stack([
                tf.range(0, tf.shape(self.actions)[0]),
                tf.squeeze(self.actions, 1)
            ], 1)
            qs = tf.gather_nd(self.q_values, stacked_actions)
            log_probs = tf.log(tf.gather_nd(self.probs, stacked_actions))
            # Actor loss: log-probabilities weighted by the critic's Q-values
            self.policy_loss = -tf.reduce_sum(
                log_probs * tf.stop_gradient(qs))

            self.rewards = tf.placeholder(tf.float32, shape=[None], name="rewards")
            self.next_states = tf.placeholder(
                tf.float32,
                shape=[None, self.observation_space.shape[0] + 1],
                name="next_states")
            self.next_actions = tf.placeholder(tf.int32, shape=[None], name="next_actions")

            with tf.variable_scope(q_scope, reuse=True):
                next_q_values = capacities.value_f(self.q_params, self.next_states)
            next_stacked_actions = tf.stack([
                tf.range(0, tf.shape(self.next_actions)[0]),
                self.next_actions
            ], 1)
            next_qs = tf.gather_nd(next_q_values, next_stacked_actions)
            # SARSA-style target; the terminal flag in the last input feature
            # selects the plain reward instead of the bootstrapped target
            target_qs1 = tf.stop_gradient(self.rewards + self.discount * next_qs)
            target_qs2 = self.rewards
            stacked_targets = tf.stack([target_qs1, target_qs2], 1)
            select_targets = tf.stack([
                tf.range(0, tf.shape(self.next_states)[0]),
                tf.cast(self.next_states[:, -1], tf.int32)
            ], 1)
            target_qs = tf.gather_nd(stacked_targets, select_targets)
            self.q_loss = 0.5 * tf.reduce_sum(tf.square(target_qs - qs))

            # Single combined objective: actor loss plus scaled critic loss
            self.loss = self.policy_loss + self.q_scale_lr * self.q_loss

            adam = tf.train.AdamOptimizer(self.lr)
            self.global_step = tf.Variable(
                0, trainable=False, name="global_step",
                collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])
            self.train_op = adam.minimize(self.loss, global_step=self.global_step)

        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('score', self.score_plh)
        self.policy_loss_plh = tf.placeholder(tf.float32, shape=[])
        self.policy_loss_sum_t = tf.summary.scalar('policy_loss', self.policy_loss_plh)
        self.q_loss_plh = tf.placeholder(tf.float32, shape=[])
        self.q_loss_sum_t = tf.summary.scalar('q_loss', self.q_loss_plh)
        self.loss_plh = tf.placeholder(tf.float32, shape=[])
        self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
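# `capacities.counter`, used by every agent for episode and timestep
# bookkeeping, is not shown. A plausible reading, assuming it returns the
# counter variable and an op that increments it:
import tensorflow as tf

def counter(name):
    count_t = tf.Variable(0, trainable=False, dtype=tf.int32, name=name)
    inc_count_op = tf.assign_add(count_t, 1, name=name + '_inc')
    return count_t, inc_count_op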
def build_graph(self, graph):
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        self.inputs = tf.placeholder(
            tf.float32,
            shape=[None, self.observation_space.shape[0] + 1],
            name='inputs')

        q_scope = tf.VariableScope(reuse=False, name='QValues')
        with tf.variable_scope(q_scope):
            self.q_values = tf.squeeze(
                capacities.value_f(self.q_params, self.inputs))

        self.action_t = capacities.eps_greedy(
            self.inputs, self.q_values, self.env.action_space.n,
            self.N0, self.min_eps)
        self.q_t = self.q_values[self.action_t]

        # Fixed (target) network used to compute bootstrap targets
        fixed_q_scope = tf.VariableScope(reuse=False, name='FixedQValues')
        with tf.variable_scope(fixed_q_scope):
            self.update_fixed_vars_op = capacities.fix_scope(q_scope)

        with tf.variable_scope('ExperienceReplay'):
            self.er_inputs = tf.placeholder(
                tf.float32,
                shape=[None, self.observation_space.shape[0] + 1],
                name="ERInputs")
            self.er_actions = tf.placeholder(tf.int32, shape=[None], name="ERActions")
            self.er_rewards = tf.placeholder(tf.float32, shape=[None], name="ERReward")
            self.er_next_states = tf.placeholder(
                tf.float32,
                shape=[None, self.observation_space.shape[0] + 1],
                name="ERNextState")

            with tf.variable_scope(q_scope, reuse=True):
                er_q_values = capacities.value_f(self.q_params, self.er_inputs)
            er_stacked_actions = tf.stack([
                tf.range(0, tf.shape(self.er_actions)[0]),
                self.er_actions
            ], 1)
            er_qs = tf.gather_nd(er_q_values, er_stacked_actions)

            with tf.variable_scope(fixed_q_scope, reuse=True):
                er_fixed_next_q_values = capacities.value_f(
                    self.q_params, self.er_next_states)
            # Double-DQN style target: the online network selects the argmax
            # action, the fixed network evaluates it
            with tf.variable_scope(q_scope, reuse=True):
                er_next_q_values = capacities.value_f(
                    self.q_params, self.er_next_states)
            er_next_max_action_t = tf.cast(tf.argmax(er_next_q_values, 1), tf.int32)
            er_next_stacked_actions = tf.stack([
                tf.range(0, tf.shape(self.er_next_states)[0]),
                er_next_max_action_t
            ], 1)
            er_next_qs = tf.gather_nd(er_fixed_next_q_values, er_next_stacked_actions)

            er_target_qs1 = tf.stop_gradient(self.er_rewards + self.discount * er_next_qs)
            er_target_qs2 = self.er_rewards
            er_stacked_targets = tf.stack([er_target_qs1, er_target_qs2], 1)
            select_targets = tf.stack([
                tf.range(0, tf.shape(self.er_next_states)[0]),
                tf.cast(self.er_next_states[:, -1], tf.int32)
            ], 1)
            er_target_qs = tf.gather_nd(er_stacked_targets, select_targets)
            self.er_loss = 0.5 * tf.reduce_sum(tf.square(er_target_qs - er_qs))

            er_adam = tf.train.AdamOptimizer(self.lr)
            self.global_step = tf.Variable(
                0, trainable=False, name="global_step",
                collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])
            self.er_train_op = er_adam.minimize(
                self.er_loss, global_step=self.global_step)

        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('score', self.score_plh)
        self.loss_plh = tf.placeholder(tf.float32, shape=[])
        self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")
        self.timestep, self.inc_timestep_op = capacities.counter("timestep")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
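# `capacities.fix_scope` implements the fixed/target network trick above. One
# plausible sketch (an assumption, not the actual helper): clone each trainable
# variable of `scope` into the current scope under the same relative name, so
# later `reuse=True` lookups resolve to the frozen copies, and return an op
# that re-syncs them.
import tensorflow as tf

def fix_scope(scope):
    update_ops = []
    for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope.name):
        # Recreate the variable under the current (fixed) scope with the same
        # relative name, marked non-trainable
        relative_name = var.op.name[len(scope.name) + 1:]
        fixed_var = tf.get_variable(
            relative_name, shape=var.get_shape(), dtype=var.dtype.base_dtype,
            trainable=False, initializer=tf.zeros_initializer())
        update_ops.append(tf.assign(fixed_var, var))
    return tf.group(*update_ops)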
def build_graph(self, graph):
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        # Exploration-schedule constants mirrored into the graph
        self.N0_t = tf.constant(self.N0, tf.float32, name='N_0')
        self.N = tf.Variable(0., dtype=tf.float32, name='N', trainable=False)
        self.min_eps_t = tf.constant(self.min_eps, tf.float32, name='min_eps')

        self.inputs = tf.placeholder(
            tf.float32,
            shape=[None, self.observation_space.shape[0] + 1],
            name='inputs')

        q_scope = tf.VariableScope(reuse=False, name='QValues')
        with tf.variable_scope(q_scope):
            self.q_values = tf.squeeze(
                capacities.value_f(self.q_params, self.inputs))

        self.action_t = capacities.eps_greedy(
            self.inputs, self.q_values, self.env.action_space.n,
            self.N0, self.min_eps)
        self.q_t = self.q_values[self.action_t]

        with tf.variable_scope('Training'):
            self.reward = tf.placeholder(tf.float32, shape=[], name="reward")
            self.next_state = tf.placeholder(
                tf.float32,
                shape=[1, self.observation_space.shape[0] + 1],
                name="nextState")
            self.next_action = tf.placeholder(tf.int32, shape=[], name="nextAction")

            with tf.variable_scope(q_scope, reuse=True):
                next_q_values = tf.squeeze(
                    capacities.value_f(self.q_params, self.next_state))
            # SARSA target: bootstrap on the *chosen* next action unless the
            # transition is terminal
            target_q1 = tf.stop_gradient(
                self.reward + self.discount * next_q_values[self.next_action])
            target_q2 = self.reward
            is_done = tf.cast(self.next_state[0, 4], tf.bool)
            target_q = tf.where(is_done, target_q2, target_q1)
            with tf.control_dependencies([target_q]):
                self.loss = 0.5 * tf.square(target_q - self.q_t)

            adam = tf.train.AdamOptimizer(self.lr)
            self.global_step = tf.Variable(
                0, trainable=False, name="global_step",
                collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])
            self.train_op = adam.minimize(self.loss, global_step=self.global_step)

        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('score', self.score_plh)
        self.loss_plh = tf.placeholder(tf.float32, shape=[])
        self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
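# The N0/min_eps hyperparameters suggest `capacities.eps_greedy` decays the
# exploration rate as N0 / (N0 + N) with the number of steps N taken, floored
# at min_eps. That schedule is inferred, not shown; a sketch under that
# assumption:
import tensorflow as tf

def eps_greedy(inputs, q_values, nb_actions, N0, min_eps):
    # `inputs` is kept to match the call sites above; unused in this sketch
    N = tf.Variable(1., trainable=False, dtype=tf.float32, name='N')
    update_N = tf.assign_add(N, 1.)
    with tf.control_dependencies([update_N]):
        eps = tf.maximum(N0 / (N0 + N), min_eps)
        greedy_action = tf.cast(tf.argmax(q_values, 0), tf.int32)
        random_action = tf.random_uniform([], 0, nb_actions, dtype=tf.int32)
        # Scalar tf.where, as in the SARSA target selection above
        action_t = tf.where(
            tf.random_uniform([]) < eps, random_action, greedy_action)
    return action_t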
def build_graph(self, graph):
    np.random.seed(self.random_seed)
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        self.inputs = tf.placeholder(
            tf.float32,
            shape=[None, None, self.policy_params['nb_inputs']],
            name='inputs')
        input_shape = tf.shape(self.inputs)
        dynamic_batch_size, dynamic_num_steps = input_shape[0], input_shape[1]
        # Flatten [batch, steps, features] to [batch * steps, features] so the
        # same dense heads are shared across timesteps
        inputs_mat = tf.reshape(self.inputs, [-1, self.policy_params['nb_inputs']])

        policy_scope = tf.VariableScope(reuse=False, name='Policy')
        with tf.variable_scope(policy_scope):
            probs, actions = capacities.policy(self.policy_params, inputs_mat)
            self.probs = tf.reshape(
                probs,
                [dynamic_batch_size, dynamic_num_steps, self.policy_params['nb_outputs']])
            self.actions = tf.reshape(
                actions, [dynamic_batch_size, dynamic_num_steps, 1])
        self.action_t = self.actions[0, 0, 0]

        critic_scope = tf.VariableScope(reuse=False, name='QValues')
        with tf.variable_scope(critic_scope):
            critic_values_mat = capacities.value_f(self.critic_params, inputs_mat)
            self.critic_values = tf.reshape(
                critic_values_mat,
                [dynamic_batch_size, dynamic_num_steps, self.critic_params['nb_outputs']])

        fixed_critic_scope = tf.VariableScope(reuse=False, name='FixedQValues')
        with tf.variable_scope(fixed_critic_scope):
            self.update_fixed_vars_op = capacities.fix_scope(critic_scope)

        with tf.variable_scope('Training'):
            self.expected_rewards = tf.placeholder(
                tf.float32, shape=[None, None, 1], name="expected_rewards")
            self.mask_plh = tf.placeholder(
                tf.float32, shape=[None, None, 1], name="mask_plh")

            # Build [batch, steps, 3] indices (row, timestep, action) to gather
            # the probability of each taken action
            batch_size, num_steps = tf.shape(self.actions)[0], tf.shape(self.actions)[1]
            line_indices = tf.matmul(  # row index of each (batch, step) pair
                tf.reshape(tf.range(0, batch_size), [-1, 1]),
                tf.ones([1, num_steps], dtype=tf.int32))
            column_indices = tf.matmul(  # timestep index of each pair
                tf.ones([batch_size, 1], dtype=tf.int32),
                tf.reshape(tf.range(0, num_steps), [1, -1]))
            depth_indices = tf.cast(tf.squeeze(self.actions, 2), tf.int32)
            stacked_actions = tf.stack(
                [line_indices, column_indices, depth_indices], 2)

            log_probs = tf.expand_dims(
                tf.log(tf.gather_nd(self.probs, stacked_actions)), 2)
            # Advantage actor-critic loss, masked to ignore padded timesteps
            self.policy_loss = tf.reduce_mean(
                -tf.reduce_sum(
                    (log_probs * (self.expected_rewards -
                                  tf.stop_gradient(self.critic_values)))
                    * self.mask_plh, 1))
            adam = tf.train.AdamOptimizer(self.lr)
            self.train_policy_op = adam.minimize(self.policy_loss)

            self.rewards = tf.placeholder(
                tf.float32, shape=[None, None, 1], name="rewards")
            self.next_states = tf.placeholder(
                tf.float32,
                shape=[None, None, self.critic_params['nb_inputs']],
                name="next_states")
            with tf.variable_scope(fixed_critic_scope, reuse=True):
                next_states_mat = tf.reshape(
                    self.next_states, [-1, self.critic_params['nb_inputs']])
                next_critic_values_mat = capacities.value_f(
                    self.critic_params, next_states_mat)
                next_critic_values = tf.reshape(
                    next_critic_values_mat,
                    [dynamic_batch_size, dynamic_num_steps, self.critic_params['nb_outputs']])
            # As in the other agents, the terminal flag in the last state
            # feature selects between the bootstrapped target and the reward
            target_critics1 = tf.stop_gradient(
                self.rewards + self.discount * next_critic_values)
            target_critics2 = self.rewards
            stacked_targets = tf.stack(
                [tf.squeeze(target_critics1, 2), tf.squeeze(target_critics2, 2)], 2)

            batch_size, num_steps = tf.shape(self.next_states)[0], tf.shape(self.next_states)[1]
            line_indices = tf.matmul(
                tf.reshape(tf.range(0, batch_size), [-1, 1]),
                tf.ones([1, num_steps], dtype=tf.int32))
            column_indices = tf.matmul(
                tf.ones([batch_size, 1], dtype=tf.int32),
                tf.reshape(tf.range(0, num_steps), [1, -1]))
            depth_indices = tf.cast(self.next_states[:, :, -1], tf.int32)
            select_targets = tf.stack(
                [line_indices, column_indices, depth_indices], 2)

            target_critics = tf.expand_dims(
                tf.gather_nd(stacked_targets, select_targets), 2)
            self.critic_loss = 0.5 * tf.reduce_sum(
                tf.square(target_critics - self.critic_values) * self.mask_plh)
            adam = tf.train.AdamOptimizer(self.critic_lr)
            self.global_step = tf.Variable(
                0, trainable=False, name="global_step",
                collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])
            self.train_critic_op = adam.minimize(
                self.critic_loss, global_step=self.global_step)

        self.policy_loss_plh = tf.placeholder(tf.float32, shape=[])
        self.policy_loss_sum_t = tf.summary.scalar('policy_loss', self.policy_loss_plh)
        self.critic_loss_plh = tf.placeholder(tf.float32, shape=[])
        self.critic_loss_sum_t = tf.summary.scalar('critic_loss', self.critic_loss_plh)
        # self.loss_plh = tf.placeholder(tf.float32, shape=[])
        # self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('av_score', self.score_plh)

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
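# A sketch of driving one update on the sequence actor-critic graph above.
# The `agent` object, the dummy shapes, and the zero-filled batches are
# assumptions for illustration; only the placeholder and op names come from
# build_graph itself.
import numpy as np
import tensorflow as tf

graph = agent.build_graph(tf.Graph())
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(agent.update_fixed_vars_op)  # sync the fixed critic before training

    batch_size, num_steps = 4, 16
    nb_inputs = agent.policy_params['nb_inputs']
    feed = {
        agent.inputs: np.zeros([batch_size, num_steps, nb_inputs]),
        agent.expected_rewards: np.zeros([batch_size, num_steps, 1]),
        agent.mask_plh: np.ones([batch_size, num_steps, 1]),
        agent.rewards: np.zeros([batch_size, num_steps, 1]),
        agent.next_states: np.zeros([batch_size, num_steps, nb_inputs]),
    }
    _, policy_loss = sess.run([agent.train_policy_op, agent.policy_loss], feed)
    _, critic_loss = sess.run([agent.train_critic_op, agent.critic_loss], feed)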