targets_mb = np.array([each for each in target_Qs_batch])

_, loss, absolute_errors = sess.run(
    [DQNetwork.optimizer, DQNetwork.loss, DQNetwork.absolute_errors],
    feed_dict={DQNetwork.inputs_: states_mb,
               DQNetwork.target_Q: targets_mb,
               DQNetwork.actions_: actions_mb,
               DQNetwork.ISWeights_: ISWeights_mb})

# Update priorities in the replay memory with the new absolute TD errors
memory.batch_update(tree_idx, absolute_errors)

# Write TF summaries
summary = sess.run(write_op,
                   feed_dict={DQNetwork.inputs_: states_mb,
                              DQNetwork.target_Q: targets_mb,
                              DQNetwork.actions_: actions_mb,
                              DQNetwork.ISWeights_: ISWeights_mb})
writer.add_summary(summary, episode)
writer.flush()

if tau > max_tau:
    # Update the parameters of our TargetNetwork with DQN weights
    update_target = update_target_graph()
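Every snippet on this page drives the same prioritized-replay Memory interface: store(transition), sample(n) returning (tree_idx, batch, ISWeights), and batch_update(tree_idx, abs_errors). The implementations themselves are not shown here, so the following is a minimal SumTree-backed sketch of that interface; the proportional-prioritization hyperparameters (epsilon, alpha, beta) follow the PER paper and are assumptions, not the originals. Note that some snippets below index sampled entries as each[0][...], i.e. they expect each batch entry wrapped in a length-1 array; adjust the return shape of sample() accordingly.

import numpy as np

class SumTree:
    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)     # internal nodes + leaves
        self.data = np.zeros(capacity, dtype=object)
        self.data_pointer = 0

    def add(self, priority, data):
        tree_idx = self.data_pointer + self.capacity - 1
        self.data[self.data_pointer] = data
        self.update(tree_idx, priority)
        self.data_pointer = (self.data_pointer + 1) % self.capacity

    def update(self, tree_idx, priority):
        change = priority - self.tree[tree_idx]
        self.tree[tree_idx] = priority
        while tree_idx != 0:                       # propagate the change up
            tree_idx = (tree_idx - 1) // 2
            self.tree[tree_idx] += change

    def get_leaf(self, v):
        parent = 0
        while True:
            left, right = 2 * parent + 1, 2 * parent + 2
            if left >= len(self.tree):             # reached a leaf
                leaf = parent
                break
            if v <= self.tree[left]:
                parent = left
            else:
                v -= self.tree[left]
                parent = right
        return leaf, self.tree[leaf], self.data[leaf - self.capacity + 1]

    @property
    def total_priority(self):
        return self.tree[0]

class Memory:
    epsilon = 0.01        # avoid zero priority
    alpha = 0.6           # how much prioritization is used
    beta = 0.4            # importance-sampling exponent, annealed to 1
    beta_increment = 0.001
    abs_err_upper = 1.0

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def store(self, transition):
        # new transitions get the current max priority so that they are
        # replayed at least once
        max_p = np.max(self.tree.tree[-self.tree.capacity:])
        if max_p == 0:
            max_p = self.abs_err_upper
        self.tree.add(max_p, transition)

    def sample(self, n):
        idxs = np.empty(n, dtype=np.int32)
        weights = np.empty((n, 1))
        batch = []
        segment = self.tree.total_priority / n
        self.beta = min(1.0, self.beta + self.beta_increment)
        leaves = self.tree.tree[-self.tree.capacity:]
        min_prob = np.min(leaves[leaves > 0]) / self.tree.total_priority
        for i in range(n):
            v = np.random.uniform(segment * i, segment * (i + 1))
            idx, p, data = self.tree.get_leaf(v)
            prob = p / self.tree.total_priority
            weights[i, 0] = (prob / min_prob) ** -self.beta  # normalized IS weight
            idxs[i] = idx
            batch.append(data)
        return idxs, batch, weights

    def batch_update(self, tree_idxs, abs_errors):
        ps = np.minimum(np.abs(abs_errors) + self.epsilon,
                        self.abs_err_upper) ** self.alpha
        for idx, p in zip(tree_idxs, ps):
            self.tree.update(idx, p)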
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = Memory(BUFFER_SIZE) self.experience = namedtuple( "Experience", field_names=["state", "action", "reward", "next_state", "done"]) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 # Here we'll deal with the empty memory problem: we pre-populate our memory # by taking random actions and storing the experience. self.tree_idx = None def step(self, state, action, reward, next_state, done): # Save experience in replay memory e = self.experience(state, action, reward, next_state, done) self.memory.store(e) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # Obtain random mini-batch from memory self.tree_idx, batch, ISWeights_mb = self.memory.sample(BATCH_SIZE) states = torch.from_numpy(np.vstack([each[0][0] for each in batch ])).float().to(device) actions = torch.from_numpy( np.vstack([each[0][1] for each in batch])).long().to(device) rewards = torch.from_numpy( np.stack([[each[0][2]] for each in batch])).float().to(device) next_states = torch.from_numpy( np.vstack([each[0][3] for each in batch])).float().to(device) dones = torch.from_numpy( np.stack([[each[0][4]] for each in batch ]).astype(np.uint8)).float().to(device) experiences = (states, actions, rewards, next_states, dones) self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Choose actions according to local network next_actions = self.qnetwork_local(next_states).argmax(dim=1) # Choose values from target network Q_targets_next = self.qnetwork_target(next_states).detach()[ np.arange(BATCH_SIZE), next_actions].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # Update memory having the batch loss as priority value batch_loss = np.ones(BATCH_SIZE) * loss.data.cpu().numpy() self.memory.batch_update(self.tree_idx, batch_loss) # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
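For context, a minimal loop that drives this Agent; the gym environment, the hyperparameter values and the epsilon schedule are illustrative assumptions, not part of the original snippet.

import gym
import numpy as np

env = gym.make("LunarLander-v2")            # assumed environment
agent = Agent(state_size=8, action_size=4, seed=0)

eps = 1.0
for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        # stores the transition and learns every UPDATE_EVERY steps
        agent.step(state, action, reward, next_state, done)
        state = next_state
    eps = max(0.01, eps * 0.995)            # decay exploration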
class DQN(object):
    '''
    DQN whose input is the machine's RAM values at each time step.
    '''

    def __init__(
            self,
            n_actions,                 # number of actions
            n_features,                # number of observations per state
            learning_rate=0.005,
            reward_decay=0.9,          # gamma, the reward discount factor
            e_greedy=0.9,              # greedy threshold: exploit vs. explore
            replace_target_iter=500,   # how often Target_Net is updated
            memory_size=10000,         # replay memory capacity
            batch_size=32,
            e_greedy_increment=None,
            output_graph=False,
            prioritized=True,          # whether to use prioritized replay
            sess=None,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.target_net_update_period = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
        self.prioritized = prioritized  # whether to use prioritized replay
        self.global_step_counter = 0

        self.build_net()
        t_params = tf.get_collection('target_net_params')
        q_params = tf.get_collection('q_net_params')
        self.update_target_net = [
            tf.assign(t, e) for t, e in zip(t_params, q_params)
        ]

        if self.prioritized:
            # Use a SumTree-backed memory with capacity memory_size
            self.memory = Memory(capacity=memory_size)
        else:
            # Plain replay: represent the memory as a numpy array
            self.memory = np.zeros((self.memory_size, n_features * 2 + 2))

        if sess is None:
            self.sess = tf.Session()
            self.sess.run(tf.global_variables_initializer())
        else:
            self.sess = sess
        if output_graph:
            tf.summary.FileWriter("logs/", self.sess.graph)
        self.cost_his = []

    def build_net(self):
        '''Build the two networks (Q_Net and Target_Net).'''
        self.input_state = tf.placeholder(tf.float32, [None, self.n_features],
                                          name='input_state')
        self.output_target = tf.placeholder(tf.float32, [None, self.n_actions],
                                            name='output_target')
        # Per-sample importance-sampling weight used in the loss
        self.input_weights = tf.placeholder(tf.float32, [None, 1],
                                            name='IS_weights')

        # Build Q_Net
        with tf.variable_scope('q_net'):
            c_names = ['q_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            self.q_eval = self.build_layers(self.input_state, c_names, True)

        # Q_Net training loss
        with tf.variable_scope('loss'):
            self.abs_errors = tf.reduce_sum(tf.abs(self.output_target -
                                                   self.q_eval), axis=1)
            self.loss = tf.reduce_mean(
                self.input_weights *
                tf.squared_difference(self.output_target, self.q_eval))
        # Q_Net training op
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(
                self.loss)

        # Build Target_Net
        self.input_state_ = tf.placeholder(tf.float32, [None, self.n_features],
                                           name='s_')
        with tf.variable_scope('target_net'):
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            self.q_next = self.build_layers(self.input_state_, c_names, False)

    def build_layers(self, s, c_names, trainable):
        '''Build a two-hidden-layer feed-forward network.
        (hidden_size is assumed to be a module-level constant.)'''
        w_initializer = tf.random_normal_initializer(0., 0.3)
        b_initializer = tf.constant_initializer(0.1)
        with tf.variable_scope('l1'):
            w1 = tf.get_variable('w1', [self.n_features, hidden_size],
                                 initializer=w_initializer,
                                 collections=c_names, trainable=trainable)
            b1 = tf.get_variable('b1', [1, hidden_size],
                                 initializer=b_initializer,
                                 collections=c_names, trainable=trainable)
            l1 = tf.nn.relu(tf.matmul(s, w1) + b1)
        with tf.variable_scope('l2'):
            w2 = tf.get_variable('w2', [hidden_size, self.n_actions],
                                 initializer=w_initializer,
                                 collections=c_names, trainable=trainable)
            b2 = tf.get_variable('b2', [1, self.n_actions],
                                 initializer=b_initializer,
                                 collections=c_names, trainable=trainable)
            out = tf.matmul(l1, w2) + b2
        return out

    # Store a transition gathered from the environment in the replay memory
    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, [a, r], s_))  # flatten into one array
        if self.prioritized:
            self.memory.store(transition)
        else:
            # plain ring-buffer storage for the non-prioritized memory
            if not hasattr(self, 'memory_counter'):
                self.memory_counter = 0
            index = self.memory_counter % self.memory_size
            self.memory[index, :] = transition
            self.memory_counter += 1

    def choose_action(self, observation):
        '''Pick an action for the given state: greedy with probability
        epsilon (e.g. 90%), random otherwise.'''
        observation = observation[np.newaxis, :]
        if np.random.uniform() < self.epsilon:
            actions_value = self.sess.run(
                self.q_eval, feed_dict={self.input_state: observation})
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def learn(self):
        # (this snippet only implements the prioritized sampling path)
        if self.global_step_counter % self.target_net_update_period == 0:
            self.sess.run(self.update_target_net)

        tree_idx, batch_memory, memory_weights = self.memory.sample(
            self.batch_size)

        feed = {
            self.input_state_: batch_memory[:, -self.n_features:],
            self.input_state: batch_memory[:, :self.n_features]
        }
        # Forward pass through Q_Net and Target_Net
        q_next, q_eval = self.sess.run([self.q_next, self.q_eval],
                                       feed_dict=feed)

        # Only the chosen action contributes to the loss; the loss for all
        # other actions is zero
        output_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)
        reward = batch_memory[:, self.n_features + 1]
        output_target[batch_index, eval_act_index] = \
            reward + self.gamma * np.max(q_next, axis=1)

        # Train, then push the new absolute errors back into the SumTree
        feed = {
            self.input_state: batch_memory[:, :self.n_features],
            self.output_target: output_target,
            self.input_weights: memory_weights
        }
        _, abs_errors, self.cost = self.sess.run(
            [self._train_op, self.abs_errors, self.loss], feed_dict=feed)
        self.memory.batch_update(tree_idx, abs_errors)

        self.cost_his.append(self.cost)
        self.epsilon = (self.epsilon + self.epsilon_increment
                        if self.epsilon < self.epsilon_max
                        else self.epsilon_max)
        self.global_step_counter += 1
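A sketch of the interaction loop this DQN expects; the gym RAM environment and the step bookkeeping are illustrative assumptions.

import gym
import numpy as np

env = gym.make("Breakout-ram-v0")   # 128 RAM bytes -> n_features=128
dqn = DQN(n_actions=4, n_features=128, e_greedy_increment=0.0001)

total_steps = 0
for episode in range(400):
    s = env.reset()
    done = False
    while not done:
        a = dqn.choose_action(np.asarray(s, dtype=np.float32))
        s_, r, done, _ = env.step(a)
        dqn.store_transition(s, a, r, s_)
        if total_steps > 1000:       # let the memory fill up first
            dqn.learn()
        s = s_
        total_steps += 1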
class DQN2(object):
    '''
    DQN variant whose input is a (210, 160, 3) image; uses a CNN.
    '''

    def __init__(
            self,
            n_actions,                 # number of actions, e.g. 9
            image_shape,               # input image shape, e.g. (210, 160, 3)
            learning_rate=0.005,
            reward_decay=0.9,          # gamma, the reward discount factor
            e_greedy=0.9,              # greedy threshold: exploit vs. explore
            replace_target_iter=500,   # how often Target_Net is updated
            memory_size=10000,         # replay memory capacity
            batch_size=32,
            e_greedy_increment=None,
            output_graph=False,
            prioritized=True,          # whether to use prioritized replay
            sess=None,
    ):
        self.n_actions = n_actions
        # list.insert returns None, so build the placeholder shape first and
        # then assign it (prepending the batch dimension)
        image_shape = list(image_shape)
        image_shape.insert(0, None)
        self.image_shape = image_shape
        self.n_pixels = int(np.prod(self.image_shape[1:]))  # flattened image size
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.target_net_update_period = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
        self.prioritized = prioritized  # whether to use prioritized replay
        self.global_step_counter = 0

        self.build_net()
        t_params = tf.get_collection('target_net_params')
        q_params = tf.get_collection('q_net_params')
        self.update_target_net = [
            tf.assign(t, e) for t, e in zip(t_params, q_params)
        ]

        if self.prioritized:
            # Use a SumTree-backed memory with capacity memory_size
            self.memory = Memory(capacity=memory_size)
        else:
            # Plain replay: flattened (s, a, r, s_) rows in a numpy array
            self.memory = np.zeros((self.memory_size, self.n_pixels * 2 + 2))

        if sess is None:
            self.sess = tf.Session()
            self.sess.run(tf.global_variables_initializer())
        else:
            self.sess = sess
        if output_graph:
            tf.summary.FileWriter("logs/", self.sess.graph)
        self.cost_his = []

    def build_net(self):
        '''Build the two networks (Q_Net and Target_Net).'''
        self.input_state = tf.placeholder(tf.float32, self.image_shape,
                                          name='input_state')
        self.output_target = tf.placeholder(tf.float32, [None, self.n_actions],
                                            name='output_target')
        # Per-sample importance-sampling weight used in the loss
        self.input_weights = tf.placeholder(tf.float32, [None, 1],
                                            name='IS_weights')

        # Build Q_Net
        with tf.variable_scope('q_net'):
            c_names = ['q_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            self.q_eval = self.build_layers(self.input_state, c_names, True)

        # Q_Net training loss
        with tf.variable_scope('loss'):
            self.abs_errors = tf.reduce_sum(tf.abs(self.output_target -
                                                   self.q_eval), axis=1)
            self.loss = tf.reduce_mean(
                self.input_weights *
                tf.squared_difference(self.output_target, self.q_eval))
        # Q_Net training op
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(
                self.loss)

        # Build Target_Net
        self.input_state_ = tf.placeholder(tf.float32, self.image_shape,
                                           name='s_')
        with tf.variable_scope('target_net'):
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            self.q_next = self.build_layers(self.input_state_, c_names, False)

    def build_layers(self, s, c_names, trainable):
        '''
        Build a CNN with two conv layers, two max-pooling layers and two
        fully connected layers.
        '''
        # TODO: revisit the size of each weight tensor
        w_initializer = tf.random_normal_initializer(0., 0.3)
        b_initializer = tf.constant_initializer(0.1)
        # flattened conv-output size after two stride-2 SAME poolings:
        # ceil(h/4) * ceil(w/4) * 12 channels
        flat_dim = (-(-self.image_shape[1] // 4)) * \
                   (-(-self.image_shape[2] // 4)) * 12
        weights = {
            'conv1': tf.get_variable('conv_w1', shape=[4, 4, 3, 6],
                                     initializer=w_initializer,
                                     collections=c_names, trainable=trainable),
            'conv2': tf.get_variable('conv_w2', shape=[4, 4, 6, 12],
                                     initializer=w_initializer,
                                     collections=c_names, trainable=trainable),
            'h1': tf.get_variable('h_w1', shape=[flat_dim, hidden_size],
                                  initializer=w_initializer,
                                  collections=c_names, trainable=trainable),
            'h2': tf.get_variable('h_w2', shape=[hidden_size, self.n_actions],
                                  initializer=w_initializer,
                                  collections=c_names, trainable=trainable)
        }
        biases = {
            'conv1': tf.get_variable('conv_b1', shape=[6],
                                     initializer=b_initializer,
                                     collections=c_names, trainable=trainable),
            'conv2': tf.get_variable('conv_b2', shape=[12],
                                     initializer=b_initializer,
                                     collections=c_names, trainable=trainable),
            'h1': tf.get_variable('h_b1', shape=[hidden_size],
                                  initializer=b_initializer,
                                  collections=c_names, trainable=trainable),
            'h2': tf.get_variable('h_b2', shape=[self.n_actions],
                                  initializer=b_initializer,
                                  collections=c_names, trainable=trainable)
        }
        with tf.variable_scope('conv_1'):
            conv1_layer = tf.nn.conv2d(s, weights['conv1'],
                                       strides=[1, 1, 1, 1], padding='SAME')
            pool1_layer = tf.nn.max_pool(conv1_layer, ksize=[1, 2, 2, 1],
                                         strides=[1, 2, 2, 1], padding='SAME')
            relu1_layer = tf.nn.relu(pool1_layer + biases['conv1'])
        with tf.variable_scope('conv_2'):
            conv2_layer = tf.nn.conv2d(relu1_layer, weights['conv2'],
                                       strides=[1, 1, 1, 1], padding='SAME')
            pool2_layer = tf.nn.max_pool(conv2_layer, ksize=[1, 2, 2, 1],
                                         strides=[1, 2, 2, 1], padding='SAME')
            relu2_layer = tf.nn.relu(pool2_layer + biases['conv2'])
        with tf.variable_scope('hidden_1'):
            padding_layer = tf.reshape(relu2_layer,
                                       shape=[self.batch_size, -1])
            h1_layer = tf.matmul(padding_layer, weights['h1']) + biases['h1']
            h1_layer = tf.nn.relu(h1_layer)
        with tf.variable_scope('hidden_2'):
            out = tf.matmul(h1_layer, weights['h2']) + biases['h2']
        return out

    # Store a transition gathered from the environment in the replay memory
    # (prioritized path only; see DQN.store_transition above for the plain
    # ring-buffer variant)
    def store_transition(self, s, a, r, s_):
        # flatten the images so a transition fits in one row
        transition = np.hstack((s.flatten(), [a, r], s_.flatten()))
        self.memory.store(transition)

    def choose_action(self, observation):
        '''Pick an action for the given state: greedy with probability
        epsilon (e.g. 90%), random otherwise.'''
        observation = observation[np.newaxis, :]
        if np.random.uniform() < self.epsilon:
            actions_value = self.sess.run(
                self.q_eval, feed_dict={self.input_state: observation})
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def learn(self):
        if self.global_step_counter % self.target_net_update_period == 0:
            self.sess.run(self.update_target_net)

        tree_idx, batch_memory, memory_weights = self.memory.sample(
            self.batch_size)

        # rows hold flattened images; restore the image dimensions for the CNN
        img_dims = [-1] + self.image_shape[1:]
        feed = {
            self.input_state_:
                batch_memory[:, -self.n_pixels:].reshape(img_dims),
            self.input_state:
                batch_memory[:, :self.n_pixels].reshape(img_dims)
        }
        # Forward pass through Q_Net and Target_Net
        q_next, q_eval = self.sess.run([self.q_next, self.q_eval],
                                       feed_dict=feed)

        # Only the chosen action contributes to the loss; the loss for all
        # other actions is zero
        output_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_pixels].astype(int)
        reward = batch_memory[:, self.n_pixels + 1]
        output_target[batch_index, eval_act_index] = \
            reward + self.gamma * np.max(q_next, axis=1)

        # Train, then push the new absolute errors back into the SumTree
        feed = {
            self.input_state:
                batch_memory[:, :self.n_pixels].reshape(img_dims),
            self.output_target: output_target,
            self.input_weights: memory_weights
        }
        _, abs_errors, self.cost = self.sess.run(
            [self._train_op, self.abs_errors, self.loss], feed_dict=feed)
        self.memory.batch_update(tree_idx, abs_errors)

        self.cost_his.append(self.cost)
        self.epsilon = (self.epsilon + self.epsilon_increment
                        if self.epsilon < self.epsilon_max
                        else self.epsilon_max)
        self.global_step_counter += 1
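To make the flattened-transition bookkeeping above concrete, here is the round-trip that store_transition() and learn() rely on, using the (210, 160, 3) shape from the class docstring; this is purely illustrative.

import numpy as np

h, w, c = 210, 160, 3
n_pixels = h * w * c
s = np.zeros((h, w, c))
s_ = np.ones((h, w, c))
a, r = 3, 1.0

row = np.hstack((s.flatten(), [a, r], s_.flatten()))
assert row.shape == (2 * n_pixels + 2,)

# Recover the pieces exactly as learn() slices a sampled batch row:
state = row[:n_pixels].reshape(h, w, c)
action = int(row[n_pixels])
reward = row[n_pixels + 1]
next_state = row[-n_pixels:].reshape(h, w, c)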
class Agent:
    def __init__(self, input_dim, output_dim, lr, gamma, tau, buffer_size,
                 l1_units, l2_units, l3_units, rnd_seed):
        self.buffer_size = buffer_size
        self.memory = Memory(self.buffer_size)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.actions = range(output_dim)
        self.gamma = gamma
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = lr
        self.tau = tau
        self.l1_units = l1_units
        self.l2_units = l2_units
        self.l3_units = l3_units
        self.epi = 0  # episode counter used by the linear schedule in xplr()
        random.seed(rnd_seed)
        self.model, self.init_weights = self.create_model()
        self.target_model, self.target_init_weights = self.create_model()

    def xplr(self):
        # Linear epsilon schedule: decays from 1.0 down to epsilon_min over
        # self.epsilon_const episodes (epsilon_const must be set externally)
        self.epsilon = (self.epsilon_min - 1) / self.epsilon_const * self.epi + 1
        self.epsilon = max(self.epsilon, self.epsilon_min)
        self.epi += 1

    def create_model(self):
        model = Sequential()
        model.add(Dense(self.l1_units, input_dim=self.input_dim,
                        activation="relu"))
        model.add(Dense(self.l2_units, activation="relu"))
        model.add(Dense(self.l3_units, activation="relu"))
        model.add(Dense(self.output_dim))
        model.compile(loss=huber_loss, optimizer=Adam(lr=self.learning_rate))
        init_weights = model.get_weights()
        return model, init_weights

    def act(self, state):
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return np.random.choice(self.actions)
        # Greedy action: index of the largest predicted Q-value
        return np.argmax(self.model.predict(state)[0])

    def remember(self, state, action, reward, new_state, done):
        experience = state, action, reward, new_state, done
        self.memory.store(experience)

    def replay(self):
        batch_size = 32
        states = []
        targets = []
        TD_errors = []

        tree_idx, batch, ISWeights_mb = self.memory.sample(batch_size)
        states_mb = np.array([each[0][0] for each in batch])
        actions_mb = np.array([each[0][1] for each in batch])
        rewards_mb = np.array([each[0][2] for each in batch])
        next_states_mb = np.array([each[0][3] for each in batch])
        dones_mb = np.array([each[0][4] for each in batch])

        for q in range(batch_size):
            state, action, reward, next_state, done = (
                states_mb[q], actions_mb[q], rewards_mb[q],
                next_states_mb[q], dones_mb[q])
            target = self.target_model.predict(state)
            if done:
                TD_target = reward / 100
            else:
                Q_future = max(self.target_model.predict(next_state)[0])
                TD_target = np.clip(reward / 100 + Q_future * self.gamma,
                                    -1, 0)
            TD_error = TD_target - target[0][action]
            TD_errors.append(TD_error)
            target[0][action] = TD_target
            states.append(state[0])
            targets.append(target[0])

        states = np.array(states)
        targets = np.array(targets)
        self.model.fit(states, targets, epochs=1, verbose=0)
        # New priorities are the absolute TD errors of the trained batch
        self.memory.batch_update(tree_idx, np.abs(TD_errors))

    def target_train(self):
        # Soft update: target <- tau * model + (1 - tau) * target
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = (weights[i] * self.tau +
                                 target_weights[i] * (1 - self.tau))
        self.target_model.set_weights(target_weights)

    def reset_weights(self, reset_weights):
        if reset_weights:
            self.model.set_weights(self.init_weights)
            self.target_model.set_weights(self.target_init_weights)
            self.memory = Memory(self.buffer_size)
            self.epsilon = 1.0
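This Keras model compiles with a huber_loss that is not defined in the snippet; a common Keras-backend formulation its usage is consistent with follows (delta = 1.0 is an assumption).

from keras import backend as K

def huber_loss(y_true, y_pred, delta=1.0):
    # quadratic near zero, linear for |error| > delta
    err = y_true - y_pred
    quadratic = K.minimum(K.abs(err), delta)
    linear = K.abs(err) - quadratic
    return K.mean(0.5 * K.square(quadratic) + delta * linear)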
class Agent():
    def __init__(self, sess, n_features, config, dic_traffic_env_conf,
                 demo=None, lr=0.01):
        self.sess = sess
        self.config = config
        self._activation_fn = tf.nn.leaky_relu
        self.dic_traffic_env_conf = dic_traffic_env_conf

        # replay buffer
        self.replay_memory = Memory(capacity=self.config.replay_buffer_size,
                                    permanent_data=len(demo))
        # self.replay_memory = None
        self.demo_memory = Memory(capacity=self.config.demo_buffer_size,
                                  permanent_data=self.config.demo_buffer_size)
        self.add_demo_to_memory(demo_transitions=demo)

        self.state_dim = 16
        self.action_dim = 8

        self.s = tf.placeholder(tf.float32, [None, n_features], "state")
        self.v_ = tf.placeholder(tf.float32, [None, 1], "v_next")
        self.q_a_ = tf.placeholder(tf.float32, [None, 1], "q_next")
        self.r = tf.placeholder(tf.float32, [None, 1], 'r')
        self.a = tf.placeholder(tf.int32, [None, 1], 'act')
        self.act_probs = tf.placeholder(tf.float32, [None, 8], 'act_probs')
        self.action_batch = tf.placeholder("int32", [None])
        self.y_input = tf.placeholder("float", [None, self.action_dim])
        self.ISWeights = tf.placeholder("float", [None, 1])
        self.n_step_y_input = tf.placeholder(
            "float", [None, self.action_dim])  # for n-step reward
        self.isdemo = tf.placeholder("float", [None])
        self.td = tf.placeholder(tf.float32, [None, 1], "td_error")  # TD error
        self.expert_action = tf.placeholder(tf.float32, [None, 8],
                                            "expert_action")

        self.hidden = self.construct_forward(self.s, True, 'None', True,
                                             "hidden", prefix='fc')
        with tf.variable_scope('Q-Value'):
            self.q = tf.layers.dense(
                inputs=self.hidden,
                units=8,  # number of hidden units
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                bias_initializer=tf.constant_initializer(0.1),
                name='Q')
        with tf.variable_scope('Q-Target'):
            self.q_target = tf.layers.dense(
                inputs=self.hidden,
                units=8,  # number of hidden units
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                bias_initializer=tf.constant_initializer(0.1),
                name='Q-Target')
        with tf.variable_scope('Actor'):
            self.probs = tf.layers.dense(
                inputs=self.hidden,
                units=8,  # output units
                activation=tf.nn.softmax,  # get action probabilities
                kernel_initializer=tf.random_normal_initializer(0., .1),
                bias_initializer=tf.constant_initializer(0.1),
                name='acts_prob')
        # self.v = self.build_net("Value")
        # self.q = self.construct_forward(self.s, True, 'None', True, "Q-Value", prefix='fc')
        # self.q_target = self.construct_forward(self.s, True, 'None', True, "Q-Target", prefix='fc')
        # self.q = self.build_q_net("Q-Value")
        # self.q_target = self.build_q_net("Q-Target")

        self.q_a = tf.batch_gather(self.q, self.a)
        self.v = tf.reduce_sum(self.q * self.act_probs, axis=1, keepdims=True)
        # self.v_target = self.build_net("Target")

        self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='Q-Target')
        self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope='Q-Value')
        self.replace_target_op = [
            tf.assign(t, p) for t, p in zip(self.t_params, self.params)
        ]

        # Touch each lazy property once so the graph is fully built
        self.loss
        self.optimize
        self.update_target_net
        self.abs_errors

        self.time_step = 0

        with tf.variable_scope('squared_TD_error'):
            # self.td_error = self.r + 0.8 * self.v_ - self.v
            self.td_error = self.q_a - self.v
            q_loss = tf.reduce_mean(
                tf.squared_difference(self.q_a, self.r + 0.8 * self.q_a_))
            # v_loss = tf.reduce_mean(tf.squared_difference(self.v, self.r + 0.8 * self.v_))
            # self.loss = tf.square(self.td_error)  # TD_error = (r+gamma*V_next) - V_eval
            # self.loss = q_loss + v_loss
            self.critic_loss = q_loss
        with tf.variable_scope('train-c'):
            self.train_op_critic = tf.train.AdamOptimizer(lr).minimize(
                self.critic_loss)

        with tf.variable_scope('exp_v'):
            # log_prob = tf.log(self.acts_prob[0, self.a])
            log_prob = tf.log(tf.batch_gather(self.probs, self.a))
            self.exp_v = tf.reduce_mean(
                log_prob * self.td)  # advantage (TD error) guided loss

        self.action = gumbel_softmax(logits=self.probs, temperature=1,
                                     hard=False)
        with tf.variable_scope('train-a'):
            self.train_op_actor = tf.train.AdamOptimizer(lr).minimize(
                -self.exp_v)  # minimize(-exp_v) = maximize(exp_v)

        self.pretrain_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.action, labels=self.expert_action)
        self.pretrain_op = tf.train.AdamOptimizer(lr).minimize(
            self.pretrain_loss)

    def add_demo_to_memory(self, demo_transitions):
        # add demo data to both demo_memory & replay_memory
        for t in demo_transitions:
            self.demo_memory.store(np.array(t, dtype=object))
            self.replay_memory.store(np.array(t, dtype=object))
            assert len(t) == 10

    # use the expert demo data to pretrain
    def pre_train(self):
        print('Pre-training ...')
        for i in range(self.config.PRETRAIN_STEPS):
            self.train_Q_network(pre_train=True)
            if i % 200 == 0 and i > 0:
                print('{} th step of pre-train finish ...'.format(i))
        self.time_step = 0
        print('All pre-train finish.')

    @lazy_property
    def abs_errors(self):
        # only use the 1-step return to compute abs_errors
        return tf.reduce_sum(tf.abs(self.y_input - self.q), axis=1)

    @lazy_property
    def optimize(self):
        optimizer = tf.train.AdamOptimizer(self.config.LEARNING_RATE)
        return optimizer.minimize(self.loss)

    @lazy_property
    def update_target_net(self):
        select_params = tf.get_collection('Q-Value')
        eval_params = tf.get_collection('Q-Target')
        return [tf.assign(e, s) for e, s in zip(eval_params, select_params)]

    def learn_critic(self, s, r, s_, a, next_a, probs):
        s, s_, r = s[np.newaxis, :], s_[np.newaxis, :], r[np.newaxis, :]
        a, next_a = a[np.newaxis, :], next_a[np.newaxis, :]
        # v_ = self.sess.run(self.v, {self.s: s_})
        q_a_ = self.sess.run(self.q_a, {self.s: s_, self.a: next_a})
        td_error, _ = self.sess.run(
            [self.td_error, self.train_op_critic],
            {self.s: s, self.r: r, self.act_probs: probs,
             self.q_a_: q_a_, self.a: a})
        return td_error

    def loss_l(self, ae, a):
        return 0.0 if ae == a else 0.8

    def loss_jeq(self, q):
        jeq = 0.0
        for i in range(self.config.BATCH_SIZE):
            ae = self.action_batch[i]
            max_value = float("-inf")
            for a in range(self.action_dim):
                max_value = tf.maximum(q[i][a] + self.loss_l(ae, a),
                                       max_value)
            jeq += self.isdemo[i] * (max_value - q[i][ae])
        return jeq

    @lazy_property
    def loss(self):
        l_dq = tf.reduce_mean(tf.squared_difference(self.q, self.y_input))
        l_n_dq = tf.reduce_mean(
            tf.squared_difference(self.q, self.n_step_y_input))
        l_jeq = self.loss_jeq(self.q)
        l_l2 = tf.reduce_sum([
            tf.reduce_mean(reg_l) for reg_l in
            tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        ])
        return self.ISWeights * tf.reduce_sum([
            l * λ for l, λ in
            zip([l_dq, l_n_dq, l_jeq, l_l2], self.config.LAMBDA)
        ])

    def train_Q_network(self, pre_train=False, update=True):
        """
        :param pre_train: True means sampling from demo_buffer instead of
            replay_buffer
        :param update: True means "update_target_net" is executed outside,
            so it can be ignored in this function
        """
        if not pre_train and not self.replay_memory.full():
            # sampling should be executed AFTER replay_memory is filled
            return
        self.time_step += 1
        assert self.replay_memory.full() or pre_train
        actual_memory = self.demo_memory if pre_train else self.replay_memory
        tree_idxes, minibatch, ISWeights = actual_memory.sample(
            self.config.BATCH_SIZE)
        np.random.shuffle(minibatch)

        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]
        done_batch = [data[4] for data in minibatch]
        demo_data = [data[5] for data in minibatch]
        n_step_reward_batch = [data[6] for data in minibatch]
        n_step_state_batch = [data[7] for data in minibatch]
        n_step_done_batch = [data[8] for data in minibatch]
        actual_n = [data[9] for data in minibatch]

        # compute the values fed to the placeholders first
        q_next = self.q.eval(feed_dict={self.s: next_state_batch})
        q_target_next = self.q_target.eval(
            feed_dict={self.s: next_state_batch})
        n_step_q_next = self.q.eval(feed_dict={self.s: n_step_state_batch})
        n_step_q_target_next = self.q_target.eval(
            feed_dict={self.s: n_step_state_batch})

        y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim))
        n_step_y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim))
        # td_error_batch = np.zeros((self.config.BATCH_SIZE, 1))
        for i in range(self.config.BATCH_SIZE):
            # state, action, reward, next_state, done, demo_data,
            # n_step_reward, n_step_state, n_step_done = t
            temp = self.q.eval(
                feed_dict={
                    self.s: state_batch[i].reshape((-1, self.state_dim))
                })[0]
            # v = np.sum(temp, action_prob_batch[i])
            # td_error_batch[i] = temp[action_batch[i]] - v
            temp_0 = np.copy(temp)
            # add 1-step reward
            action = np.argmax(q_next[i])
            # action = next_action_batch[i]
            temp[action_batch[i]] = reward_batch[i] + (1 - int(
                done_batch[i])) * self.config.GAMMA * q_target_next[i][action]
            y_batch[i] = temp
            # add n-step reward
            action = np.argmax(n_step_q_next[i])
            q_n_step = (1 - int(n_step_done_batch[i])) * \
                self.config.GAMMA ** actual_n[i] * \
                n_step_q_target_next[i][action]
            temp_0[action_batch[i]] = n_step_reward_batch[i] + q_n_step
            n_step_y_batch[i] = temp_0

        _, abs_errors = self.sess.run(
            [self.optimize, self.abs_errors],
            feed_dict={
                self.y_input: y_batch,
                self.n_step_y_input: n_step_y_batch,
                self.s: state_batch,
                self.action_batch: action_batch,
                self.isdemo: demo_data,
                self.ISWeights: ISWeights
            })
        # update priorities for the sampled data in memory
        self.replay_memory.batch_update(tree_idxes, abs_errors)

        # In this example an episode has a bounded number of steps, so the
        # target-net update can also be driven externally once an episode
        # ends; when update is False it is controlled outside.
        if update and self.time_step % self.config.UPDATE_TARGET_NET == 0:
            self.sess.run(self.update_target_net)
        return state_batch, action_batch

    def learn_actor(self, s, a, td):
        s = s[np.newaxis, :]
        a = a[np.newaxis, :]
        # td = td[np.newaxis, :]
        feed_dict = {self.s: s, self.a: a, self.td: td}
        _, exp_v = self.sess.run([self.train_op_actor, self.exp_v], feed_dict)
        return exp_v

    def choose_action(self, s):
        s = s[np.newaxis, :]
        probs = self.sess.run(self.probs, {self.s: s})  # probabilities of all actions
        # returns an int action sampled from the action probabilities, plus
        # the probabilities themselves
        return np.random.choice(np.arange(probs.shape[1]),
                                p=probs.ravel()), probs

    def pretrain(self, state, action):
        print("Pre-training for Actor!")
        # batch size is the number of expert actions passed in
        expert_action_batch = np.zeros((len(action), 8))
        for i, a in enumerate(action):
            expert_action_batch[i, a] = 1
        self.sess.run(self.pretrain_op, {
            self.s: state,
            self.expert_action: expert_action_batch
        })

    def construct_layer(self, inp, activation_fn, reuse, norm, is_train,
                        scope):
        if norm == 'batch_norm':
            out = tf.contrib.layers.batch_norm(inp,
                                               activation_fn=activation_fn,
                                               reuse=reuse,
                                               is_training=is_train,
                                               scope=scope)
        elif norm == 'None':
            out = activation_fn(inp)
        else:
            raise ValueError('Can\'t recognize {}'.format(norm))
        return out

    def construct_weights(self):
        weights = {}
        weights['embed_w1'] = tf.Variable(
            tf.glorot_uniform_initializer()([1, 4]), name='embed_w1')
        weights['embed_b1'] = tf.Variable(tf.zeros([4]), name='embed_b1')

        # for phase, one-hot
        weights['embed_w2'] = tf.Variable(tf.random_uniform_initializer(
            minval=-0.05, maxval=0.05)([2, 4]), name='embed_w2')
        # weights['embed_b2'] = tf.Variable(tf.zeros([4]), name='embed_b2')

        # lane embedding
        weights['lane_embed_w3'] = tf.Variable(
            tf.glorot_uniform_initializer()([8, 16]), name='lane_embed_w3')
        weights['lane_embed_b3'] = tf.Variable(tf.zeros([16]),
                                               name='lane_embed_b3')

        # relation embedding, one-hot
        weights['relation_embed_w4'] = tf.Variable(
            tf.random_uniform_initializer(minval=-0.05, maxval=0.05)([2, 4]),
            name='relation_embed_w4')
        # weights['relation_embed_b4'] = tf.Variable(tf.zeros([4]), name='relation_embed_b4')

        weights['feature_conv_w1'] = tf.Variable(
            tf.glorot_uniform_initializer()([1, 1, 32, 20]),
            name='feature_conv_w1')
        weights['feature_conv_b1'] = tf.Variable(tf.zeros([20]),
                                                 name='feature_conv_b1')
        weights['phase_conv_w1'] = tf.Variable(
            tf.glorot_uniform_initializer()([1, 1, 4, 20]),
            name='phase_conv_w1')
        weights['phase_conv_b1'] = tf.Variable(tf.zeros([20]),
                                               name='phase_conv_b1')
        weights['combine_conv_w1'] = tf.Variable(
            tf.glorot_uniform_initializer()([1, 1, 20, 20]),
            name='combine_conv_w1')
        weights['combine_conv_b1'] = tf.Variable(tf.zeros([20]),
                                                 name='combine_conv_b1')
        weights['final_conv_w1'] = tf.Variable(
            tf.glorot_uniform_initializer()([1, 1, 20, 1]),
            name='final_conv_w1')
        weights['final_conv_b1'] = tf.Variable(tf.zeros([1]),
                                               name='final_conv_b1')
        return weights

    def construct_forward(self, inp, reuse, norm, is_train, scope,
                          prefix='fc'):
        # embedding, only for 4 or 8 phases; hard-coded for
        # lane_num_vehicle + cur_phase
        with tf.variable_scope(scope):
            weights = self.construct_weights()
            dim = int(inp.shape[1].value / 2)
            num_veh = inp[:, :dim]
            num_veh = tf.reshape(num_veh, [-1, 1])
            phase = inp[:, dim:]
            phase = tf.cast(phase, tf.int32)
            phase = tf.one_hot(phase, 2)
            phase = tf.reshape(phase, [-1, 2])

            embed_num_veh = self.construct_layer(
                tf.matmul(num_veh, weights['embed_w1']) + weights['embed_b1'],
                activation_fn=tf.nn.sigmoid, reuse=reuse, is_train=is_train,
                norm=norm, scope='num_veh_embed.' + prefix)
            embed_num_veh = tf.reshape(embed_num_veh, [-1, dim, 4])
            embed_phase = self.construct_layer(
                tf.matmul(phase, weights['embed_w2']),
                activation_fn=tf.nn.sigmoid, reuse=reuse, is_train=is_train,
                norm=norm, scope='phase_embed.' + prefix)
            embed_phase = tf.reshape(embed_phase, [-1, dim, 4])

            dic_lane = {}
            for i, m in enumerate(self.dic_traffic_env_conf
                                  ["LANE_PHASE_INFO"]["start_lane"]):
                dic_lane[m] = tf.concat(
                    [embed_num_veh[:, i, :], embed_phase[:, i, :]], axis=-1)

            list_phase_pressure = []
            phase_startLane_mapping = self.dic_traffic_env_conf[
                "LANE_PHASE_INFO"]["phase_sameStartLane_mapping"]
            for phase in self.dic_traffic_env_conf["LANE_PHASE_INFO"]["phase"]:
                t1 = tf.Variable(tf.zeros(1))
                t2 = tf.Variable(tf.zeros(1))
                for lane in phase_startLane_mapping[phase][0]:
                    t1 += self.construct_layer(
                        tf.matmul(dic_lane[lane], weights['lane_embed_w3'])
                        + weights['lane_embed_b3'],
                        activation_fn=self._activation_fn, reuse=reuse,
                        is_train=is_train, norm=norm,
                        scope='lane_embed.' + prefix)
                t1 /= len(phase_startLane_mapping[phase][0])
                if len(phase_startLane_mapping[phase]) >= 2:
                    for lane in phase_startLane_mapping[phase][1]:
                        t2 += self.construct_layer(
                            tf.matmul(dic_lane[lane], weights['lane_embed_w3'])
                            + weights['lane_embed_b3'],
                            activation_fn=self._activation_fn, reuse=reuse,
                            is_train=is_train, norm=norm,
                            scope='lane_embed.' + prefix)
                    t2 /= len(phase_startLane_mapping[phase][1])
                list_phase_pressure.append(t1 + t2)
                # TODO check batch_size here

            constant = relation(self.dic_traffic_env_conf["LANE_PHASE_INFO"])
            constant = tf.one_hot(constant, 2)
            s1, s2 = constant.shape[1:3]
            constant = tf.reshape(constant, (-1, 2))
            relation_embedding = tf.matmul(constant,
                                           weights['relation_embed_w4'])
            relation_embedding = tf.reshape(relation_embedding,
                                            (-1, s1, s2, 4))

            list_phase_pressure_recomb = []
            num_phase = len(list_phase_pressure)
            for i in range(num_phase):
                for j in range(num_phase):
                    if i != j:
                        list_phase_pressure_recomb.append(
                            tf.concat(
                                [list_phase_pressure[i],
                                 list_phase_pressure[j]],
                                axis=-1,
                                name="concat_compete_phase_%d_%d" % (i, j)))
            list_phase_pressure_recomb = tf.concat(list_phase_pressure_recomb,
                                                   axis=-1, name="concat_all")
            feature_map = tf.reshape(list_phase_pressure_recomb,
                                     (-1, num_phase, num_phase - 1, 32))
            # if num_phase == 8:
            #     feature_map = tf.reshape(list_phase_pressure_recomb, (-1, 8, 7, 32))
            # else:
            #     feature_map = tf.reshape(list_phase_pressure_recomb, (-1, 4, 3, 32))

            lane_conv = tf.nn.conv2d(
                feature_map, weights['feature_conv_w1'], [1, 1, 1, 1],
                'VALID', name='feature_conv') + weights['feature_conv_b1']
            lane_conv = tf.nn.leaky_relu(lane_conv, name='feature_activation')

            # relation conv layer
            relation_conv = tf.nn.conv2d(
                relation_embedding, weights['phase_conv_w1'], [1, 1, 1, 1],
                'VALID', name='phase_conv') + weights['phase_conv_b1']
            relation_conv = tf.nn.leaky_relu(relation_conv,
                                             name='phase_activation')
            combine_feature = tf.multiply(lane_conv, relation_conv,
                                          name="combine_feature")

            # second conv layer
            hidden_layer = tf.nn.conv2d(
                combine_feature, weights['combine_conv_w1'], [1, 1, 1, 1],
                'VALID', name='combine_conv') + weights['combine_conv_b1']
            hidden_layer = tf.nn.leaky_relu(hidden_layer,
                                            name='combine_activation')

            before_merge = tf.nn.conv2d(
                hidden_layer, weights['final_conv_w1'], [1, 1, 1, 1],
                'VALID', name='final_conv') + weights['final_conv_b1']
            before_merge = tf.nn.leaky_relu(before_merge,
                                            name='final_activation')

            # if self.num_actions == 8:
            #     _shape = (-1, 8, 7)
            # else:
            #     _shape = (-1, 4, 3)
            _shape = (-1, 8, 7)
            before_merge = tf.reshape(before_merge, _shape)
            out = tf.reduce_sum(before_merge, axis=2)
            return out
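Both the Agent above and the DQfD class below decorate graph-building methods with lazy_property, which is not defined in these snippets. A common implementation their usage is consistent with is shown here: the wrapped method runs once, on first access, and its result is cached as an attribute thereafter. This also explains the bare self.loss / self.optimize statements in the constructors: touching each property once builds that part of the graph.

import functools

def lazy_property(func):
    attribute = '_cache_' + func.__name__

    @property
    @functools.wraps(func)
    def wrapper(self):
        # build on first access, then reuse the cached result
        if not hasattr(self, attribute):
            setattr(self, attribute, func(self))
        return getattr(self, attribute)
    return wrapper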
class DQfD:
    def __init__(self, env, config, demo_transitions=None):
        self.sess = tf.InteractiveSession()
        self.config = config
        # replay_memory stores both demo data and generated data;
        # demo_memory only stores demo data
        self.replay_memory = Memory(capacity=self.config.replay_buffer_size,
                                    permanent_data=len(demo_transitions))
        self.demo_memory = Memory(capacity=self.config.demo_buffer_size,
                                  permanent_data=self.config.demo_buffer_size)
        # add demo data to both demo_memory & replay_memory
        self.add_demo_to_memory(demo_transitions=demo_transitions)
        self.time_step = 0
        self.epsilon = self.config.INITIAL_EPSILON
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.action_batch = tf.placeholder("int32", [None])
        self.y_input = tf.placeholder("float", [None, self.action_dim])
        self.ISWeights = tf.placeholder("float", [None, 1])
        self.n_step_y_input = tf.placeholder(
            "float", [None, self.action_dim])  # for n-step reward
        self.isdemo = tf.placeholder("float", [None])
        self.eval_input = tf.placeholder("float", [None, self.state_dim])
        self.select_input = tf.placeholder("float", [None, self.state_dim])

        # Touch each lazy property once so the graph is fully built
        self.Q_eval
        self.Q_select
        self.loss
        self.optimize
        self.update_target_net
        self.abs_errors

        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
        self.save_model()
        self.restore_model()

    def add_demo_to_memory(self, demo_transitions):
        # add demo data to both demo_memory & replay_memory
        for t in demo_transitions:
            self.demo_memory.store(np.array(t, dtype=object))
            self.replay_memory.store(np.array(t, dtype=object))
            assert len(t) == 10

    # use the expert demo data to pretrain
    def pre_train(self):
        print('Pre-training ...')
        for i in range(self.config.PRETRAIN_STEPS):
            self.train_Q_network(pre_train=True)
            if i % 200 == 0 and i > 0:
                print('{} th step of pre-train finish ...'.format(i))
        self.time_step = 0
        print('All pre-train finish.')

    # TODO: How to add the variables created by tf.layers.dense to a custom
    # collection?
    # def build_layers(self, state, collections, units_1, units_2, w_i, b_i,
    #                  regularizer=None):
    #     with tf.variable_scope('dense1'):
    #         dense1 = tf.layers.dense(tf.contrib.layers.flatten(state),
    #                                  activation=tf.nn.relu, units=units_1,
    #                                  kernel_initializer=w_i,
    #                                  bias_initializer=b_i,
    #                                  kernel_regularizer=regularizer,
    #                                  bias_regularizer=regularizer)
    #     with tf.variable_scope('dense2'):
    #         dense2 = tf.layers.dense(dense1, activation=tf.nn.relu,
    #                                  units=units_2,
    #                                  kernel_initializer=w_i,
    #                                  bias_initializer=b_i,
    #                                  kernel_regularizer=regularizer,
    #                                  bias_regularizer=regularizer)
    #     with tf.variable_scope('dense3'):
    #         dense3 = tf.layers.dense(dense2, activation=tf.nn.relu,
    #                                  units=self.action_dim,
    #                                  kernel_initializer=w_i,
    #                                  bias_initializer=b_i,
    #                                  kernel_regularizer=regularizer,
    #                                  bias_regularizer=regularizer)
    #     return dense3

    def build_layers(self, state, c_names, units_1, units_2, w_i, b_i,
                     reg=None):
        a_d = self.action_dim
        with tf.variable_scope('l1'):
            w1 = tf.get_variable('w1', [self.state_dim, units_1],
                                 initializer=w_i, collections=c_names,
                                 regularizer=reg)
            b1 = tf.get_variable('b1', [1, units_1], initializer=b_i,
                                 collections=c_names, regularizer=reg)
            dense1 = tf.nn.relu(tf.matmul(state, w1) + b1)
        with tf.variable_scope('l2'):
            w2 = tf.get_variable('w2', [units_1, units_2], initializer=w_i,
                                 collections=c_names, regularizer=reg)
            b2 = tf.get_variable('b2', [1, units_2], initializer=b_i,
                                 collections=c_names, regularizer=reg)
            dense2 = tf.nn.relu(tf.matmul(dense1, w2) + b2)
        with tf.variable_scope('l3'):
            w3 = tf.get_variable('w3', [units_2, a_d], initializer=w_i,
                                 collections=c_names, regularizer=reg)
            b3 = tf.get_variable('b3', [1, a_d], initializer=b_i,
                                 collections=c_names, regularizer=reg)
            dense3 = tf.matmul(dense2, w3) + b3
        return dense3

    @lazy_property
    def Q_select(self):
        with tf.variable_scope('select_net') as scope:
            c_names = ['select_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            w_i = tf.random_uniform_initializer(-0.1, 0.1)
            b_i = tf.constant_initializer(0.1)
            # Note: only parameters in select-net need L2 regularization
            reg = tf.contrib.layers.l2_regularizer(scale=0.2)
            return self.build_layers(self.select_input, c_names, 24, 24,
                                     w_i, b_i, reg)

    @lazy_property
    def Q_eval(self):
        with tf.variable_scope('eval_net') as scope:
            c_names = ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            w_i = tf.random_uniform_initializer(-0.1, 0.1)
            b_i = tf.constant_initializer(0.1)
            return self.build_layers(self.eval_input, c_names, 24, 24,
                                     w_i, b_i)

    def loss_l(self, ae, a):
        return 0.0 if ae == a else 0.8

    def loss_jeq(self, Q_select):
        jeq = 0.0
        for i in range(self.config.BATCH_SIZE):
            ae = self.action_batch[i]
            max_value = float("-inf")
            for a in range(self.action_dim):
                max_value = tf.maximum(Q_select[i][a] + self.loss_l(ae, a),
                                       max_value)
            jeq += self.isdemo[i] * (max_value - Q_select[i][ae])
        return jeq

    @lazy_property
    def loss(self):
        l_dq = tf.reduce_mean(
            tf.squared_difference(self.Q_select, self.y_input))
        l_n_dq = tf.reduce_mean(
            tf.squared_difference(self.Q_select, self.n_step_y_input))
        # l_n_step_dq = self.loss_n_step_dq(self.Q_select, self.n_step_y_input)
        l_jeq = self.loss_jeq(self.Q_select)
        l_l2 = tf.reduce_sum([
            tf.reduce_mean(reg_l) for reg_l in
            tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        ])
        return self.ISWeights * tf.reduce_sum([
            l * λ for l, λ in
            zip([l_dq, l_n_dq, l_jeq, l_l2], self.config.LAMBDA)
        ])

    @lazy_property
    def abs_errors(self):
        # only use the 1-step return to compute abs_errors
        return tf.reduce_sum(tf.abs(self.y_input - self.Q_select), axis=1)

    @lazy_property
    def optimize(self):
        optimizer = tf.train.AdamOptimizer(self.config.LEARNING_RATE)
        # only parameters in select-net are optimized here
        return optimizer.minimize(self.loss)

    @lazy_property
    def update_target_net(self):
        select_params = tf.get_collection('select_net_params')
        eval_params = tf.get_collection('eval_net_params')
        return [tf.assign(e, s) for e, s in zip(eval_params, select_params)]

    def save_model(self):
        print("Model saved in : {}".format(
            self.saver.save(self.sess, self.config.MODEL_PATH)))

    def restore_model(self):
        self.saver.restore(self.sess, self.config.MODEL_PATH)
        print("Model restored.")

    def perceive(self, transition):
        self.replay_memory.store(np.array(transition, dtype=object))
        # epsilon -> FINAL_EPSILON (min_epsilon)
        if self.replay_memory.full():
            self.epsilon = max(self.config.FINAL_EPSILON,
                               self.epsilon * self.config.EPSILIN_DECAY)

    def train_Q_network(self, pre_train=False, update=True):
        """
        :param pre_train: True means sampling from demo_buffer instead of
            replay_buffer
        :param update: True means "update_target_net" is executed outside,
            so it can be ignored in this function
        """
        if not pre_train and not self.replay_memory.full():
            # sampling should be executed AFTER replay_memory is filled
            return
        self.time_step += 1
        assert self.replay_memory.full() or pre_train
        actual_memory = self.demo_memory if pre_train else self.replay_memory
        tree_idxes, minibatch, ISWeights = actual_memory.sample(
            self.config.BATCH_SIZE)
        np.random.shuffle(minibatch)

        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]
        done_batch = [data[4] for data in minibatch]
        demo_data = [data[5] for data in minibatch]
        n_step_reward_batch = [data[6] for data in minibatch]
        n_step_state_batch = [data[7] for data in minibatch]
        n_step_done_batch = [data[8] for data in minibatch]
        actual_n = [data[9] for data in minibatch]

        # compute the values fed to the placeholders first
        Q_select = self.Q_select.eval(
            feed_dict={self.select_input: next_state_batch})
        Q_eval = self.Q_eval.eval(
            feed_dict={self.eval_input: next_state_batch})
        n_step_Q_select = self.Q_select.eval(
            feed_dict={self.select_input: n_step_state_batch})
        n_step_Q_eval = self.Q_eval.eval(
            feed_dict={self.eval_input: n_step_state_batch})

        y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim))
        n_step_y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim))
        for i in range(self.config.BATCH_SIZE):
            # state, action, reward, next_state, done, demo_data,
            # n_step_reward, n_step_state, n_step_done = t
            temp = self.Q_select.eval(
                feed_dict={
                    self.select_input:
                        state_batch[i].reshape((-1, self.state_dim))
                })[0]
            temp_0 = np.copy(temp)
            # add 1-step reward
            action = np.argmax(Q_select[i])
            temp[action_batch[i]] = reward_batch[i] + (1 - int(
                done_batch[i])) * self.config.GAMMA * Q_eval[i][action]
            y_batch[i] = temp
            # add n-step reward
            action = np.argmax(n_step_Q_select[i])
            q_n_step = (1 - int(n_step_done_batch[i])) * \
                self.config.GAMMA ** actual_n[i] * n_step_Q_eval[i][action]
            temp_0[action_batch[i]] = n_step_reward_batch[i] + q_n_step
            n_step_y_batch[i] = temp_0

        _, abs_errors = self.sess.run(
            [self.optimize, self.abs_errors],
            feed_dict={
                self.y_input: y_batch,
                self.n_step_y_input: n_step_y_batch,
                self.select_input: state_batch,
                self.action_batch: action_batch,
                self.isdemo: demo_data,
                self.ISWeights: ISWeights
            })
        # update priorities for the sampled data in memory
        self.replay_memory.batch_update(tree_idxes, abs_errors)

        # In this example an episode has a bounded number of steps, so the
        # target-net update can also be driven externally once an episode
        # ends; when update is False it is controlled outside.
        if update and self.time_step % self.config.UPDATE_TARGET_NET == 0:
            self.sess.run(self.update_target_net)

    def egreedy_action(self, state):
        if random.random() <= self.epsilon:
            return random.randint(0, self.action_dim - 1)
        return np.argmax(
            self.Q_select.eval(feed_dict={self.select_input: [state]})[0])
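A sketch of how this DQfD class is typically driven: demonstration pre-training first, then interleaved interaction and training. The gym environment, the config object, demo_transitions and the degenerate 1-step n-step fields are assumptions for illustration; real code would accumulate an n-step return (e.g. in a deque) before calling perceive().

import gym

# env, config and demo_transitions are assumed to exist as in the class above
agent = DQfD(env, config, demo_transitions=demo_transitions)
agent.pre_train()  # phase 1: learn from demonstrations only

for episode in range(300):
    state = env.reset()
    done = False
    while not done:
        action = agent.egreedy_action(state)
        next_state, reward, done, _ = env.step(action)
        # 10-field transition matching the assert in add_demo_to_memory:
        # (state, action, reward, next_state, done, is_demo,
        #  n_step_reward, n_step_state, n_step_done, actual_n);
        # here the n-step fields degenerate to 1-step
        t = (state, action, reward, next_state, float(done), 0.0,
             reward, next_state, float(done), 1)
        agent.perceive(t)
        agent.train_Q_network(update=True)
        state = next_state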