def build_graph(self, graph):
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        # Constants / counter used by the epsilon-greedy schedule
        self.N0_t = tf.constant(self.N0, tf.float32, name='N_0')
        self.N = tf.Variable(0., dtype=tf.float32, name='N', trainable=False)
        self.min_eps_t = tf.constant(self.min_eps, tf.float32, name='min_eps')

        # State input: the observation plus one extra column holding the "done" flag
        self.inputs = tf.placeholder(
            tf.float32,
            shape=[None, self.observation_space.shape[0] + 1],
            name='inputs')

        q_scope = tf.VariableScope(reuse=False, name='QValues')
        with tf.variable_scope(q_scope):
            self.q_values = tf.squeeze(
                capacities.value_f(self.q_params, self.inputs))

        # Epsilon-greedy action selection over the estimated Q values
        self.action_t = capacities.eps_greedy(
            self.inputs, self.q_values, self.env.action_space.n,
            self.N0, self.min_eps)
        self.q_t = self.q_values[self.action_t]

        with tf.variable_scope('Training'):
            self.reward = tf.placeholder(tf.float32, shape=[], name="reward")
            self.next_state = tf.placeholder(
                tf.float32,
                shape=[1, self.observation_space.shape[0] + 1],
                name="nextState")
            self.next_action = tf.placeholder(tf.int32, shape=[], name="nextAction")

            # Reuse the Q-value parameters to evaluate the next state
            with tf.variable_scope(q_scope, reuse=True):
                next_q_values = tf.squeeze(
                    capacities.value_f(self.q_params, self.next_state))

            # One-step TD target: drop the bootstrap term when the next state is terminal
            target_q1 = tf.stop_gradient(
                self.reward + self.discount * next_q_values[self.next_action])
            target_q2 = self.reward
            is_done = tf.cast(self.next_state[0, 4], tf.bool)  # column 4 is the appended done flag
            target_q = tf.where(is_done, target_q2, target_q1)

            with tf.control_dependencies([target_q]):
                self.loss = 1 / 2 * tf.square(target_q - self.q_t)

            adam = tf.train.AdamOptimizer(self.lr)
            self.global_step = tf.Variable(
                0,
                trainable=False,
                name="global_step",
                collections=[
                    tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES
                ])
            self.train_op = adam.minimize(self.loss, global_step=self.global_step)

        # Summaries
        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('score', self.score_plh)
        self.loss_plh = tf.placeholder(tf.float32, shape=[])
        self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
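# Illustrative sketch (not part of the agent): the tf.where(...) above selects
# between two targets, i.e. the one-step TD target
#     target = reward                          if the next state is terminal
#     target = reward + discount * Q(s', a')   otherwise
# A hypothetical plain-Python equivalent, assuming scalar inputs:
def td_target(reward, discount, next_q, done):
    return reward if done else reward + discount * next_q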
def build_graph(self, graph):
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        # State input: the observation plus one extra column holding the "done" flag
        self.inputs = tf.placeholder(
            tf.float32,
            shape=[None, self.observation_space.shape[0] + 1],
            name='inputs')

        q_scope = tf.VariableScope(reuse=False, name='QValues')
        with tf.variable_scope(q_scope):
            self.q_values = tf.squeeze(
                capacities.value_f(self.q_params, self.inputs))

        # Epsilon-greedy action selection over the estimated Q values
        self.action_t = capacities.eps_greedy(
            self.inputs, self.q_values, self.env.action_space.n,
            self.N0, self.min_eps)
        self.q_t = self.q_values[self.action_t]

        # Fixed (target) network: a frozen copy of the Q-value parameters,
        # refreshed by running update_fixed_vars_op
        fixed_q_scope = tf.VariableScope(reuse=False, name='FixedQValues')
        with tf.variable_scope(fixed_q_scope):
            self.update_fixed_vars_op = capacities.fix_scope(q_scope)

        with tf.variable_scope('ExperienceReplay'):
            self.er_inputs = tf.placeholder(
                tf.float32,
                shape=[None, self.observation_space.shape[0] + 1],
                name="ERInputs")
            self.er_actions = tf.placeholder(tf.int32, shape=[None], name="ERActions")
            self.er_rewards = tf.placeholder(tf.float32, shape=[None], name="ERReward")
            self.er_next_states = tf.placeholder(
                tf.float32,
                shape=[None, self.observation_space.shape[0] + 1],
                name="ERNextState")

            # Q(s, a) for the sampled transitions, from the online network
            with tf.variable_scope(q_scope, reuse=True):
                er_q_values = capacities.value_f(self.q_params, self.er_inputs)
            er_stacked_actions = tf.stack([
                tf.range(0, tf.shape(self.er_actions)[0]), self.er_actions
            ], 1)
            er_qs = tf.gather_nd(er_q_values, er_stacked_actions)

            # Next-state values: actions chosen with the online network,
            # evaluated with the fixed network
            with tf.variable_scope(fixed_q_scope, reuse=True):
                er_fixed_next_q_values = capacities.value_f(
                    self.q_params, self.er_next_states)
            with tf.variable_scope(q_scope, reuse=True):
                er_next_q_values = capacities.value_f(
                    self.q_params, self.er_next_states)
            er_next_max_action_t = tf.cast(tf.argmax(er_next_q_values, 1), tf.int32)
            er_next_stacked_actions = tf.stack([
                tf.range(0, tf.shape(self.er_next_states)[0]),
                er_next_max_action_t
            ], 1)
            er_next_qs = tf.gather_nd(er_fixed_next_q_values, er_next_stacked_actions)

            # Per-transition TD target: drop the bootstrap term for terminal states
            er_target_qs1 = tf.stop_gradient(
                self.er_rewards + self.discount * er_next_qs)
            er_target_qs2 = self.er_rewards
            er_stacked_targets = tf.stack([er_target_qs1, er_target_qs2], 1)
            select_targets = tf.stack([
                tf.range(0, tf.shape(self.er_next_states)[0]),
                tf.cast(self.er_next_states[:, -1], tf.int32)  # last column is the done flag
            ], 1)
            er_target_qs = tf.gather_nd(er_stacked_targets, select_targets)

            self.er_loss = 1 / 2 * tf.reduce_sum(tf.square(er_target_qs - er_qs))

            er_adam = tf.train.AdamOptimizer(self.lr)
            self.global_step = tf.Variable(
                0,
                trainable=False,
                name="global_step",
                collections=[
                    tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES
                ])
            self.er_train_op = er_adam.minimize(
                self.er_loss, global_step=self.global_step)

        # Summaries
        self.score_plh = tf.placeholder(tf.float32, shape=[])
        self.score_sum_t = tf.summary.scalar('score', self.score_plh)
        self.loss_plh = tf.placeholder(tf.float32, shape=[])
        self.loss_sum_t = tf.summary.scalar('loss', self.loss_plh)
        self.all_summary_t = tf.summary.merge_all()

        self.episode_id, self.inc_ep_id_op = capacities.counter("episode_id")
        self.timestep, self.inc_timestep_op = capacities.counter("timestep")

        # Playing part
        self.pscore_plh = tf.placeholder(tf.float32, shape=[])
        self.pscore_sum_t = tf.summary.scalar('play_score', self.pscore_plh)

    return graph
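# Illustrative sketch (not part of the agent): the batched target above picks
# the next action with the online network and evaluates it with the fixed
# network, a Double-DQN-style target. Hypothetical NumPy equivalent, assuming
# `next_q_online` and `next_q_fixed` have shape [batch, n_actions] and
# `rewards` / `dones` have shape [batch]:
import numpy as np

def er_td_targets(rewards, next_q_online, next_q_fixed, dones, discount):
    next_actions = np.argmax(next_q_online, axis=1)                 # choose with online net
    next_qs = next_q_fixed[np.arange(len(rewards)), next_actions]   # evaluate with fixed net
    return np.where(dones, rewards, rewards + discount * next_qs)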