def setOptimizer(self, name):
    # optimizer
    with tf.variable_scope('optimizer_' + name):
        self.targets = tf.placeholder('float32', [None], name='target_q_t')
        self.actions = tf.placeholder('int64', [None], name='action')
        self.beta = tf.placeholder('float32', [None], name='beta')

        action_one_hot = tf.one_hot(self.actions, config.NUMBER_OF_ACTIONS, 1.0, 0.0,
                                    name='action_one_hot')
        # Q(s, a) for the action actually taken in each transition
        q_acted = tf.reduce_sum(self.current_network.outputs * action_one_hot,
                                reduction_indices=1, name='q_acted')

        # TD error; non-positive errors are scaled by beta (asymmetric update)
        self.delta = self.targets - q_acted
        self.delta = tf.where(tf.greater(self.delta, tf.constant(0.0)),
                              self.delta, self.delta * self.beta)

        with tf.name_scope('loss'):
            self.loss = tf.reduce_mean(clipped_error(self.delta), name='loss')

        self.optim = tf.train.AdamOptimizer(config.LEARNING_RATE).minimize(self.loss)
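# Note: clipped_error() is used by the losses in this file but is not defined in
# this section. A minimal sketch, assuming it is the Huber-style error clipping
# commonly paired with DQN (quadratic near zero, linear beyond |x| = 1):
def clipped_error(x):
    # 0.5 * x^2 where |x| < 1, |x| - 0.5 elsewhere
    return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)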
def _build_optim(self):
    with tf.variable_scope('optimizer'):
        # The target y = r + gamma * Q_t(s', argmax_a' Q(s', a')) is computed
        # outside the graph and fed in, so we can evaluate it separately and do
        # not have to propagate errors through the target network.
        self.yDQN = tf.placeholder('float32', [None], name='yDQN')

        # action taken for each transition in the batch
        self.action = tf.placeholder('int32', [None], name='action')
        # (batch, actions) one-hot mask
        action_one_hot = tf.one_hot(self.action, self.action_size, axis=-1)

        # mask self.q_train with an element-wise multiply to get the Q value of
        # the chosen action for each batch entry
        q_for_step = tf.reduce_sum(tf.multiply(self.q_train, action_one_hot), 1)

        # TD error with clipped (Huber-style) loss, reduced to a scalar
        self.loss = tf.reduce_mean(clipped_error(self.yDQN - q_for_step))

        # optimize
        # self.optim = tf.train.RMSPropOptimizer(0.0015, momentum=0.90, epsilon=1e-08).minimize(self.loss)
        self.optim = tf.train.AdamOptimizer().minimize(self.loss)
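# The yDQN target above is computed outside the graph and fed in. A hypothetical
# sketch of r + gamma * Q_target(s', argmax_a' Q(s', a')), matching the comment in
# _build_optim(); the names self.s, self.s_target, self.q_target, self.gamma and
# self.sess are assumptions, only self.q_train comes from the code above.
import numpy as np

def compute_yDQN_sketch(self, r_batch, s2_batch, done_batch):
    # online network picks the greedy next action
    q_online = self.sess.run(self.q_train, {self.s: s2_batch})
    a_max = np.argmax(q_online, axis=1)
    # target network evaluates that action
    q_target = self.sess.run(self.q_target, {self.s_target: s2_batch})
    q_next = q_target[np.arange(len(a_max)), a_max]
    # drop the bootstrap term for terminal transitions
    return r_batch + (1.0 - np.asarray(done_batch, dtype=np.float32)) * self.gamma * q_next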
def __init__(self, env, env_name, sess=tf.InteractiveSession()):
    self.env = env
    self.env_name = env_name
    self.model_dir = self.env_name + "/"
    self.sess = sess
    self.replay_buffer = deque()

    self.state_dim = env.observation_space.shape
    self.height = 84
    self.width = 84
    self.action_dim = env.action_space.n
    self.hidden_dim = HIDDEN_UNITS

    self.ep_start = ep_start
    self.ep_end = ep_end
    self.ep_end_t = ep_end_t
    self.episodes = episodes
    self.episode_steps = steps
    self.gamma = GAMMA
    self.batch_size = BATCH_SIZE
    self.replay_buffer_size = REPLAY_BUFFER_SIZE
    self.epsilon = EPSILON

    with tf.variable_scope('step'):
        self.step_op = tf.Variable(0, trainable=False, name='step')
        self.step_input = tf.placeholder('int32', None, name='step_input')
        self.step_assign_op = self.step_op.assign(self.step_input)

    # create model
    self.initializer = tf.truncated_normal_initializer(0, 0.02)
    self.activation_fn = tf.nn.relu
    self.w = {}

    # model layers
    with tf.variable_scope('prediction'):
        # state input; tf.image.crop_and_resize() could be used here instead
        self.state_input = tf.placeholder('float32', (None, ) + self.state_dim, name='s_t')
        self.state = tf.image.resize_images(self.state_input, [84, 64])
        self.state = tf.image.pad_to_bounding_box(self.state, 0, 10, self.height, self.width)

        # cnn layers
        self.l1, self.w['l1_w'], self.w['l1_b'] = conv2d(
            self.state, 32, [8, 8], [4, 4],
            initializer=self.initializer, activation_fn=self.activation_fn, name='l1')
        self.l2, self.w['l2_w'], self.w['l2_b'] = conv2d(
            self.l1, 32, [4, 4], [2, 2],
            initializer=self.initializer, activation_fn=self.activation_fn, name='l2')
        self.l3, self.w['l3_w'], self.w['l3_b'] = conv2d(
            self.l2, 32, [3, 3], [1, 1],
            initializer=self.initializer, activation_fn=self.activation_fn, name='l3')

        shape = self.l3.get_shape().as_list()
        self.l3_flat = tf.reshape(self.l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

        # fc layers
        self.l4, self.w['l4_w'], self.w['l4_b'] = linear(
            self.l3_flat, self.hidden_dim, activation_fn=self.activation_fn, name='l4')
        self.q, self.w['l5_w'], self.w['l5_b'] = linear(self.l4, self.action_dim, name='q')

        # policy evaluation using max action
        self.q_action = tf.argmax(self.q, dimension=1)

        q_summary = []
        avg_q = tf.reduce_mean(self.q, 0)  # average the Q values over the batch
        for idx in range(self.action_dim):
            q_summary.append(tf.summary.histogram('q/%s' % idx, avg_q[idx]))
        self.q_summary = tf.summary.merge(q_summary, 'q_summary')

    # optimizer
    with tf.variable_scope('optimizer'):
        # input action (one hot)
        self.action_one_hot = tf.placeholder("float", [None, self.action_dim])
        ### input action (not one hot)
        # self.action_not_one_hot = tf.placeholder('int64', [None], name='action')
        ### action one hot
        # self.action_one_hot = tf.one_hot(self.action_not_one_hot, self.env.action_size, 1.0, 0.0, name='action_one_hot')

        # predicted q value; action is a one-hot representation
        self.predicted_q = tf.reduce_sum(
            tf.multiply(self.q, self.action_one_hot), reduction_indices=1, name='q_acted')

        # true value
        self.y = tf.placeholder("float", [None])
        # error
        self.delta = self.y - self.predicted_q
        # clipped loss function
        self.loss = tf.reduce_mean(clipped_error(self.delta), name='loss')

        self.global_step = tf.Variable(0, trainable=False)
        self.learning_rate = learning_rate
        self.learning_rate_step = tf.placeholder('int64', None, name='learning_rate_step')
        self.learning_rate_decay_step = learning_rate_decay_step
        self.learning_rate_decay = learning_rate_decay
        self.learning_rate_minimum = learning_rate_minimum
        self.learning_rate_op = tf.maximum(
            self.learning_rate_minimum,
            tf.train.exponential_decay(self.learning_rate,
                                       self.learning_rate_step,
                                       self.learning_rate_decay_step,
                                       self.learning_rate_decay,
                                       staircase=True))
        self.optimizer = tf.train.RMSPropOptimizer(
            self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize(self.loss)

    with tf.variable_scope('summary'):
        scalar_summary_tags = ['average.reward', 'average.loss', 'average.q',
                               'episode.max reward', 'episode.min reward', 'episode.avg reward',
                               'episode.num of game', 'training.learning_rate']
        self.summary_placeholders = {}
        self.summary_ops = {}
        for tag in scalar_summary_tags:
            self.summary_placeholders[tag] = tf.placeholder(
                'float32', None, name=tag.replace(' ', '_'))
            self.summary_ops[tag] = tf.summary.scalar(
                "%s/%s" % (self.env_name, tag), self.summary_placeholders[tag])

        histogram_summary_tags = ['episode.rewards', 'episode.actions']
        for tag in histogram_summary_tags:
            self.summary_placeholders[tag] = tf.placeholder(
                'float32', None, name=tag.replace(' ', '_'))
            self.summary_ops[tag] = tf.summary.histogram(tag, self.summary_placeholders[tag])

    self.writer = tf.summary.FileWriter('./logs/%s' % self.model_dir, self.sess.graph)

    self.sess.run(tf.global_variables_initializer())
    self._saver = tf.train.Saver(list(self.w.values()) + [self.step_op], max_to_keep=30)
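# conv2d() and linear() are helpers assumed by the layers above but not defined
# in this section; each returns (output, weights, bias). A minimal NHWC-only
# sketch (the callers that pass reuse= or data_format='NCHW' would need a fuller
# implementation). Also assumes `import tensorflow as tf` and, on Python 3,
# `from functools import reduce` at module level.
def linear(x, output_dim, activation_fn=None, name='linear'):
    with tf.variable_scope(name):
        w = tf.get_variable('w', [x.get_shape().as_list()[1], output_dim], tf.float32,
                            tf.truncated_normal_initializer(0, 0.02))
        b = tf.get_variable('b', [output_dim], tf.float32, tf.constant_initializer(0.0))
        out = tf.nn.bias_add(tf.matmul(x, w), b)
        if activation_fn is not None:
            out = activation_fn(out)
        return out, w, b

def conv2d(x, output_dim, kernel_size, stride, initializer=None, activation_fn=None,
           data_format='NHWC', name='conv2d'):
    # NHWC-only sketch; data_format is accepted only for call compatibility
    with tf.variable_scope(name):
        w = tf.get_variable('w', kernel_size + [x.get_shape().as_list()[-1], output_dim],
                            tf.float32, initializer)
        b = tf.get_variable('b', [output_dim], tf.float32, tf.constant_initializer(0.0))
        out = tf.nn.conv2d(x, w, [1, stride[0], stride[1], 1], 'VALID')
        out = tf.nn.bias_add(out, b)
        if activation_fn is not None:
            out = activation_fn(out)
        return out, w, b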
def build_dqn(self):
    self.w = {}
    self.t_w = {}

    # initializer = tf.contrib.layers.xavier_initializer()
    initializer = tf.truncated_normal_initializer(0, 0.02)
    activation_fn = tf.nn.relu

    # training network
    with tf.variable_scope('prediction'):
        if self.cnn_format == 'NHWC':
            self.s_t = tf.placeholder('float32',
                [None, self.screen_height, self.screen_width, self.history_length], name='s_t')
        else:
            self.s_t = tf.placeholder('float32',
                [None, self.history_length, self.screen_height, self.screen_width], name='s_t')

        self.l1, self.w['l1_w'], self.w['l1_b'] = conv2d(self.s_t,
            32, [8, 8], [4, 4], initializer, activation_fn, self.cnn_format, name='l1')
        self.l2, self.w['l2_w'], self.w['l2_b'] = conv2d(self.l1,
            64, [4, 4], [2, 2], initializer, activation_fn, self.cnn_format, name='l2')
        self.l3, self.w['l3_w'], self.w['l3_b'] = conv2d(self.l2,
            64, [3, 3], [1, 1], initializer, activation_fn, self.cnn_format, name='l3')

        shape = self.l3.get_shape().as_list()
        self.l3_flat = tf.reshape(self.l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

        if self.dueling:
            self.value_hid, self.w['l4_val_w'], self.w['l4_val_b'] = \
                linear(self.l3_flat, 512, activation_fn=activation_fn, name='value_hid')
            self.adv_hid, self.w['l4_adv_w'], self.w['l4_adv_b'] = \
                linear(self.l3_flat, 512, activation_fn=activation_fn, name='adv_hid')
            self.value, self.w['val_w_out'], self.w['val_w_b'] = \
                linear(self.value_hid, 1, name='value_out')
            self.advantage, self.w['adv_w_out'], self.w['adv_w_b'] = \
                linear(self.adv_hid, self.env.action_size, name='adv_out')
            # Average Dueling
            self.q = self.value + (self.advantage -
                tf.reduce_mean(self.advantage, reduction_indices=1, keep_dims=True))
        else:
            self.l4, self.w['l4_w'], self.w['l4_b'] = \
                linear(self.l3_flat, 512, activation_fn=activation_fn, name='l4')
            self.q, self.w['q_w'], self.w['q_b'] = \
                linear(self.l4, self.env.action_size, name='q')

        self.q_action = tf.argmax(self.q, dimension=1)

        q_summary = []
        avg_q = tf.reduce_mean(self.q, 0)
        for idx in range(self.env.action_size):
            q_summary.append(tf.summary.histogram('q/%s' % idx, avg_q[idx]))
        self.q_summary = tf.summary.merge(q_summary, 'q_summary')

    # target network
    with tf.variable_scope('target'):
        if self.cnn_format == 'NHWC':
            self.target_s_t = tf.placeholder('float32',
                [None, self.screen_height, self.screen_width, self.history_length], name='target_s_t')
        else:
            self.target_s_t = tf.placeholder('float32',
                [None, self.history_length, self.screen_height, self.screen_width], name='target_s_t')

        self.target_l1, self.t_w['l1_w'], self.t_w['l1_b'] = conv2d(self.target_s_t,
            32, [8, 8], [4, 4], initializer, activation_fn, self.cnn_format, name='target_l1')
        self.target_l2, self.t_w['l2_w'], self.t_w['l2_b'] = conv2d(self.target_l1,
            64, [4, 4], [2, 2], initializer, activation_fn, self.cnn_format, name='target_l2')
        self.target_l3, self.t_w['l3_w'], self.t_w['l3_b'] = conv2d(self.target_l2,
            64, [3, 3], [1, 1], initializer, activation_fn, self.cnn_format, name='target_l3')

        shape = self.target_l3.get_shape().as_list()
        self.target_l3_flat = tf.reshape(self.target_l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

        if self.dueling:
            self.t_value_hid, self.t_w['l4_val_w'], self.t_w['l4_val_b'] = \
                linear(self.target_l3_flat, 512, activation_fn=activation_fn, name='target_value_hid')
            self.t_adv_hid, self.t_w['l4_adv_w'], self.t_w['l4_adv_b'] = \
                linear(self.target_l3_flat, 512, activation_fn=activation_fn, name='target_adv_hid')
            self.t_value, self.t_w['val_w_out'], self.t_w['val_w_b'] = \
                linear(self.t_value_hid, 1, name='target_value_out')
            self.t_advantage, self.t_w['adv_w_out'], self.t_w['adv_w_b'] = \
                linear(self.t_adv_hid, self.env.action_size, name='target_adv_out')
            # Average Dueling
            self.target_q = self.t_value + (self.t_advantage -
                tf.reduce_mean(self.t_advantage, reduction_indices=1, keep_dims=True))
        else:
            self.target_l4, self.t_w['l4_w'], self.t_w['l4_b'] = \
                linear(self.target_l3_flat, 512, activation_fn=activation_fn, name='target_l4')
            self.target_q, self.t_w['q_w'], self.t_w['q_b'] = \
                linear(self.target_l4, self.env.action_size, name='target_q')

        self.target_q_idx = tf.placeholder('int32', [None, None], 'outputs_idx')
        self.target_q_with_idx = tf.gather_nd(self.target_q, self.target_q_idx)

    with tf.variable_scope('pred_to_target'):
        self.t_w_input = {}
        self.t_w_assign_op = {}

        for name in self.w.keys():
            self.t_w_input[name] = tf.placeholder('float32', self.t_w[name].get_shape().as_list(), name=name)
            self.t_w_assign_op[name] = self.t_w[name].assign(self.t_w_input[name])

    # optimizer
    with tf.variable_scope('optimizer'):
        self.target_q_t = tf.placeholder('float32', [None], name='target_q_t')
        self.action = tf.placeholder('int64', [None], name='action')

        action_one_hot = tf.one_hot(self.action, self.env.action_size, 1.0, 0.0, name='action_one_hot')
        q_acted = tf.reduce_sum(self.q * action_one_hot, reduction_indices=1, name='q_acted')

        self.delta = self.target_q_t - q_acted

        self.global_step = tf.Variable(0, trainable=False)

        self.loss = tf.reduce_mean(clipped_error(self.delta), name='loss')
        self.learning_rate_step = tf.placeholder('int64', None, name='learning_rate_step')
        self.learning_rate_op = tf.maximum(self.learning_rate_minimum,
            tf.train.exponential_decay(
                self.learning_rate,
                self.learning_rate_step,
                self.learning_rate_decay_step,
                self.learning_rate_decay,
                staircase=True))
        self.optim = tf.train.RMSPropOptimizer(
            self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize(self.loss)

    with tf.variable_scope('summary'):
        scalar_summary_tags = ['average.reward', 'average.loss', 'average.q',
                               'episode.max reward', 'episode.min reward', 'episode.avg reward',
                               'episode.num of game', 'training.learning_rate']

        self.summary_placeholders = {}
        self.summary_ops = {}

        for tag in scalar_summary_tags:
            self.summary_placeholders[tag] = tf.placeholder('float32', None, name=tag.replace(' ', '_'))
            self.summary_ops[tag] = tf.summary.scalar(
                "%s-%s/%s" % (self.env_name, self.env_type, tag), self.summary_placeholders[tag])

        histogram_summary_tags = ['episode.rewards', 'episode.actions']

        for tag in histogram_summary_tags:
            self.summary_placeholders[tag] = tf.placeholder('float32', None, name=tag.replace(' ', '_'))
            self.summary_ops[tag] = tf.summary.histogram(tag, self.summary_placeholders[tag])

        self.writer = tf.summary.FileWriter('./logs/%s' % self.model_dir, self.sess.graph)

    tf.initialize_all_variables().run()
    # print('self.w.values()', self.w.values())
    self._saver = tf.train.Saver(list(self.w.values()) + [self.step_op], max_to_keep=30)

    self.load_model()
    self.update_target_q_network()
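# load_model() and update_target_q_network() are called above but not defined in
# this section. A minimal sketch of the target sync, assuming it simply copies
# every prediction-network weight into the matching target variable through the
# pred_to_target assign ops built above (runs against the default session):
def update_target_q_network(self):
    for name in self.w.keys():
        self.t_w_assign_op[name].eval({self.t_w_input[name]: self.w[name].eval()})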
def build_model(self):
    # model layers
    with tf.variable_scope(self.model_name + "_" + 'prediction'):
        self.state = tf.placeholder('float32', [None, 84, 84, 3], name='s_t')
        # input action (one hot)
        self.action_one_hot = tf.placeholder("float", [None, self.action_dim])
        self.next_state = tf.placeholder('float32', (None, 84, 84, 3), name='s_t_1')
        self.reward = tf.placeholder('float32', (None, ), name='reward')
        self.done = tf.placeholder('int32', (None, ), name='done')
        self.times = tf.placeholder('float32', (None, ), name='timesteps')

        # cnn layers
        self.l1, self.w['l1_w'], self.w['l1_b'] = conv2d(
            self.state, 5, [2, 2], [1, 1],
            initializer=self.initializer, activation_fn=self.activation_fn, name='l1', reuse=False)
        self.l2, self.w['l2_w'], self.w['l2_b'] = conv2d(
            self.l1, 10, [3, 3], [1, 1],
            initializer=self.initializer, activation_fn=self.activation_fn, name='l2', reuse=False)
        self.l3, self.w['l3_w'], self.w['l3_b'] = conv2d(
            self.l2, 10, [3, 3], [1, 1],
            initializer=self.initializer, activation_fn=self.activation_fn, name='l3', reuse=False)

        shape = self.l3.get_shape().as_list()
        self.l3_flat = tf.reshape(self.l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

        # fc layers
        self.q, self.w['l4_w'], self.w['l4_b'] = linear(self.l3_flat, self.action_dim, name='q', reuse=False)

    # optimizer
    with tf.variable_scope(self.model_name + "_" + 'optimizer'):
        # predicted Q value of the chosen action (one-hot mask, then sum over actions)
        self.predicted_q = tf.reduce_sum(
            tf.multiply(self.q, self.action_one_hot), reduction_indices=1, name='q_acted')

        # compute V(s') from pi0 and this policy's Q values:
        # V(s') = (1/beta) * log sum_a pi0(a|s')^alpha * exp(beta * Q(s', a))
        # pi0 = self.shared_policy.select_action(self.next_state)
        self.pi0_prob = tf.placeholder(tf.float32, [None, self.action_dim], name="pi0")
        self.next_q = tf.placeholder(tf.float32, [None, self.action_dim], name="next_q")
        self.v = tf.log(tf.reduce_sum(
            tf.pow(self.pi0_prob, self.alpha) * tf.exp(self.beta * self.next_q),
            axis=1)) / self.beta

        # target value (true value): y = r for terminal transitions, r + gamma * V(s') otherwise
        done_mask = tf.cast(self.done, tf.float32)
        self.y = self.reward + (1.0 - done_mask) * self.gamma * self.v

        # error
        self.delta = self.y - self.predicted_q
        # clipped loss function
        self.loss = tf.reduce_mean(clipped_error(self.delta), name='loss')

        self.global_step = tf.Variable(0, dtype=tf.int64, trainable=False)
        self.learning_rate = learning_rate
        self.learning_rate_step = tf.placeholder('int64', None, name='learning_rate_step')
        self.learning_rate_decay_step = learning_rate_decay_step
        self.learning_rate_decay = learning_rate_decay
        self.learning_rate_minimum = learning_rate_minimum
        self.learning_rate_op = tf.maximum(
            self.learning_rate_minimum,
            tf.train.exponential_decay(self.learning_rate,
                                       self.learning_rate_step,
                                       self.learning_rate_decay_step,
                                       self.learning_rate_decay,
                                       staircase=True))
        self.optimizer = tf.train.RMSPropOptimizer(
            self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize(self.loss)
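# pi0_prob and next_q above are placeholders, so the shared-policy probabilities
# and this model's next-state Q values are computed outside the graph and fed in
# at train time. A hypothetical sketch: self.sess, the argument names, and
# self.shared_policy.action_probs() are assumptions; everything fed corresponds
# to the placeholders built above.
def train_step_sketch(self, s, a_one_hot, r, s2, done, step):
    pi0 = self.shared_policy.action_probs(s2)         # assumed helper: pi_0(. | s')
    q_next = self.sess.run(self.q, {self.state: s2})  # this model's Q(s', .)
    _, loss = self.sess.run(
        [self.optimizer, self.loss],
        feed_dict={self.state: s,
                   self.action_one_hot: a_one_hot,
                   self.reward: r,
                   self.done: done,
                   self.pi0_prob: pi0,
                   self.next_q: q_next,
                   self.learning_rate_step: step})
    return loss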
def build_dqn(self):
    self.w = {}    # weights
    self.t_w = {}  # target weights

    initializer = tf.truncated_normal_initializer(0, 0.02)
    activation_fn = tf.nn.relu

    with tf.variable_scope('prediction'):
        if self.state_format == 'NHWC':
            self.s_t = tf.placeholder('float32',
                [None, self.screen_height, self.screen_width, self.history_length], name='s_t')
        else:
            self.s_t = tf.placeholder('float32',
                [None, self.history_length, self.screen_height, self.screen_width], name='s_t')

        # layer shapes must match the target network below so the pred_to_target copy works
        self.l1, self.w['l1_w'], self.w['l1_b'] = conv2d(
            self.s_t, 32, [8, 8], [4, 4],
            initializer=initializer, activation_fn=activation_fn,
            data_format=self.state_format, name='l1')
        self.l2, self.w['l2_w'], self.w['l2_b'] = conv2d(
            self.l1, 64, [4, 4], [2, 2],
            initializer=initializer, activation_fn=activation_fn,
            data_format=self.state_format, name='l2')
        self.l3, self.w['l3_w'], self.w['l3_b'] = conv2d(
            self.l2, 64, [3, 3], [1, 1],
            initializer=initializer, activation_fn=activation_fn,
            data_format=self.state_format, name='l3')

        shape = self.l3.get_shape().as_list()
        self.l3_flat = tf.reshape(self.l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

        self.l4, self.w['l4_w'], self.w['l4_b'] = linear(
            self.l3_flat, 512, activation_fn=activation_fn, name='l4')
        self.l5, self.w['l5_w'], self.w['l5_b'] = linear(
            self.l4, self.env.action_size, name='q')
        self.q = self.l5  # alias so the summaries and optimizer below resolve

        # policy evaluation using max action
        self.q_action = tf.argmax(self.q, dimension=1)

        q_summary = []
        avg_q = tf.reduce_mean(self.q, 0)  # average the Q values over the batch
        for idx in range(self.env.action_size):
            q_summary.append(tf.summary.histogram('q/%s' % idx, avg_q[idx]))
        self.q_summary = tf.summary.merge(q_summary, 'q_summary')

    # target network
    with tf.variable_scope('target'):
        if self.state_format == 'NHWC':
            self.target_s_t = tf.placeholder('float32',
                [None, self.screen_height, self.screen_width, self.history_length], name='target_s_t')
        else:
            self.target_s_t = tf.placeholder('float32',
                [None, self.history_length, self.screen_height, self.screen_width], name='target_s_t')

        self.target_l1, self.t_w['l1_w'], self.t_w['l1_b'] = conv2d(
            self.target_s_t, 32, [8, 8], [4, 4],
            initializer, activation_fn, self.state_format, name='target_l1')
        self.target_l2, self.t_w['l2_w'], self.t_w['l2_b'] = conv2d(
            self.target_l1, 64, [4, 4], [2, 2],
            initializer, activation_fn, self.state_format, name='target_l2')
        self.target_l3, self.t_w['l3_w'], self.t_w['l3_b'] = conv2d(
            self.target_l2, 64, [3, 3], [1, 1],
            initializer, activation_fn, self.state_format, name='target_l3')

        shape = self.target_l3.get_shape().as_list()
        self.target_l3_flat = tf.reshape(self.target_l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

        self.target_l4, self.t_w['l4_w'], self.t_w['l4_b'] = \
            linear(self.target_l3_flat, 512, activation_fn=activation_fn, name='target_l4')
        self.target_q, self.t_w['q_w'], self.t_w['q_b'] = \
            linear(self.target_l4, self.env.action_size, name='target_q')

    with tf.variable_scope('pred_to_target'):
        self.t_w_input = {}
        self.t_w_assign_op = {}
        for name in self.t_w.keys():
            self.t_w_input[name] = tf.placeholder(
                'float32', self.t_w[name].get_shape().as_list(), name=name)
            self.t_w_assign_op[name] = self.t_w[name].assign(self.t_w_input[name])

    with tf.variable_scope('optimizer'):
        self.target_q_t = tf.placeholder('float32', [None], name='target_q_t')  # target q at time t
        # self.q_action.eval(s_t)
        self.action = tf.placeholder('int64', [None], name='action')

        action_one_hot = tf.one_hot(self.action, self.env.action_size, 1.0, 0.0, name='action_one_hot')
        q_acted = tf.reduce_sum(self.q * action_one_hot, reduction_indices=1, name='q_acted')

        self.delta = self.target_q_t - q_acted  # TD error: target q - predicted q for the action taken

        self.global_step = tf.Variable(0, trainable=False)

        self.loss = tf.reduce_mean(clipped_error(self.delta), name='loss')
        self.learning_rate_step = tf.placeholder('int64', None, name='lr_rate_step')
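# The optimizer block above stops after the learning-rate placeholder; by analogy
# with the other variants in this section it would presumably be finished with an
# exponential-decay learning rate and an RMSProp `self.optim` op minimizing
# self.loss. Assuming that, plus a replay memory with sample(), a self.discount
# attribute, and numpy imported as np (none of which appear in this section), a
# mini-batch step could look like:
def q_learning_mini_batch_sketch(self, step):
    s_t, action, reward, s_t_plus_1, terminal = self.memory.sample()

    # bootstrapped target: r + gamma * max_a' Q_target(s', a'), zeroed on terminal
    q_t_plus_1 = self.target_q.eval({self.target_s_t: s_t_plus_1})
    max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
    terminal = np.array(terminal) + 0.0
    target_q_t = (1.0 - terminal) * self.discount * max_q_t_plus_1 + reward

    # one gradient step on the clipped TD loss defined above
    self.optim.run({self.target_q_t: target_q_t,
                    self.action: action,
                    self.s_t: s_t,
                    self.learning_rate_step: step})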
def build(self):
    self.w = {}
    self.t_w = {}

    initializer = tf.truncated_normal_initializer(0, 0.02)
    activation_fn = tf.nn.relu

    with tf.variable_scope('prediction'):
        if self.cnn_format == 'NHWC':
            self.s_t = tf.placeholder('float32',
                [None, self.screen_height, self.screen_width, self.history_length], name='s_t')
        else:
            self.s_t = tf.placeholder('float32',
                [None, self.history_length, self.screen_height, self.screen_width], name='s_t')

        self.l1, self.w['l1_w'], self.w['l1_b'] = conv2d(self.s_t, 32, [8, 8], [4, 4],
            initializer, activation_fn, self.cnn_format, name='l1')
        self.l2, self.w['l2_w'], self.w['l2_b'] = conv2d(self.l1, 64, [4, 4], [2, 2],
            initializer, activation_fn, self.cnn_format, name='l2')
        self.l3, self.w['l3_w'], self.w['l3_b'] = conv2d(self.l2, 64, [3, 3], [1, 1],
            initializer, activation_fn, self.cnn_format, name='l3')

        shape = self.l3.get_shape().as_list()
        self.l3_flat = tf.reshape(self.l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

        self.l4, self.w['l4_w'], self.w['l4_b'] = linear(
            self.l3_flat, 512, activation_fn=activation_fn, name='l4')
        self.q, self.w['q_w'], self.w['q_b'] = linear(self.l4, self.action_size, name='q')
        self.q_action = tf.argmax(self.q, dimension=1)

    with tf.variable_scope('target'):
        if self.cnn_format == 'NHWC':
            self.target_s_t = tf.placeholder('float32',
                [None, self.screen_height, self.screen_width, self.history_length], name='target_s_t')
        else:
            self.target_s_t = tf.placeholder('float32',
                [None, self.history_length, self.screen_height, self.screen_width], name='target_s_t')

        self.target_l1, self.t_w['l1_w'], self.t_w['l1_b'] = conv2d(
            self.target_s_t, 32, [8, 8], [4, 4],
            initializer, activation_fn, self.cnn_format, name='target_l1')
        self.target_l2, self.t_w['l2_w'], self.t_w['l2_b'] = conv2d(
            self.target_l1, 64, [4, 4], [2, 2],
            initializer, activation_fn, self.cnn_format, name='target_l2')
        self.target_l3, self.t_w['l3_w'], self.t_w['l3_b'] = conv2d(
            self.target_l2, 64, [3, 3], [1, 1],
            initializer, activation_fn, self.cnn_format, name='target_l3')

        shape = self.target_l3.get_shape().as_list()
        self.target_l3_flat = tf.reshape(self.target_l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

        self.target_l4, self.t_w['l4_w'], self.t_w['l4_b'] = \
            linear(self.target_l3_flat, 512, activation_fn=activation_fn, name='target_l4')
        self.target_q, self.t_w['q_w'], self.t_w['q_b'] = \
            linear(self.target_l4, self.action_size, name='target_q')

        self.target_q_idx = tf.placeholder('int32', [None, None], 'outputs_idx')
        self.target_q_with_idx = tf.gather_nd(self.target_q, self.target_q_idx)

    with tf.variable_scope('pred_to_target'):
        self.t_w_input = {}
        self.t_w_assign_op = {}
        for name in self.w.keys():
            self.t_w_input[name] = tf.placeholder(
                'float32', self.t_w[name].get_shape().as_list(), name=name)
            self.t_w_assign_op[name] = self.t_w[name].assign(self.t_w_input[name])

    with tf.variable_scope('optimiser'):
        self.target_q_t = tf.placeholder('float32', [None], name='target_q_t')
        self.action = tf.placeholder('int64', [None], name='action')

        action_one_hot = tf.one_hot(self.action, self.action_size, 1., 0., name='action_one_hot')
        q_acted = tf.reduce_sum(self.q * action_one_hot, reduction_indices=1, name='q_acted')

        self.delta = self.target_q_t - q_acted
        self.global_step = tf.Variable(0, trainable=False)

        self.loss = tf.reduce_mean(clipped_error(self.delta), name='loss')
        self.learning_rate_step = tf.placeholder('int64', None, name='learning_rate_step')
        self.learning_rate_op = tf.maximum(
            self.learning_rate_minimum,
            tf.train.exponential_decay(self.learning_rate,
                                       self.learning_rate_step,
                                       self.learning_rate_decay_step,
                                       self.learning_rate_decay,
                                       staircase=True))
        self.optim = tf.train.RMSPropOptimizer(
            self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize(self.loss)

    self.sess.run(tf.global_variables_initializer())

    # self._saver = tf.train.Saver(self.w.values() + [self.step_op], max_to_keep=30)
    # self.load_model()
    self.update_target_q_network()
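# target_q_idx / target_q_with_idx above are the hooks usually used for a
# Double-DQN target: the online network picks argmax_a' Q(s', a') and the target
# network evaluates that action. A hypothetical sketch under that assumption
# (self.memory, self.discount, and numpy imported as np are not defined in this
# section):
def double_q_mini_batch_sketch(self, step):
    s_t, action, reward, s_t_plus_1, terminal = self.memory.sample()
    terminal = np.array(terminal) + 0.0

    # online network chooses the greedy action in s'
    pred_action = self.q_action.eval({self.s_t: s_t_plus_1})
    # target network evaluates that action via gather_nd
    q_t_plus_1_with_pred_action = self.target_q_with_idx.eval({
        self.target_s_t: s_t_plus_1,
        self.target_q_idx: [[idx, a] for idx, a in enumerate(pred_action)]})

    target_q_t = (1.0 - terminal) * self.discount * q_t_plus_1_with_pred_action + reward
    self.optim.run({self.target_q_t: target_q_t,
                    self.action: action,
                    self.s_t: s_t,
                    self.learning_rate_step: step})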