class ThreadReplay(Thread):
    def __init__(self, server):
        super(ThreadReplay, self).__init__()
        self.setDaemon(True)
        self.server = server
        self.exit_flag = False
        self.replay_buffer = ReplayBuffer(buffer_size=Config.REPLAY_BUFFER_SIZE,
                                          random_seed=Config.REPLAY_BUFFER_RANDOM_SEED)

    def update_stats(self):
        self.server.stats.replay_memory_size.value = self.replay_buffer.size()

    def run(self):
        # print("thread started: " + str(self.id))
        while not self.exit_flag:
            # If the replay queue is running low, push a sampled batch onto it.
            if self.server.replay_q.qsize() < Config.REPLAY_MIN_QUEUE_SIZE:
                if self.replay_buffer.size() > Config.TRAINING_MIN_BATCH_SIZE:
                    x__, r__, a__, x2__, done__ = \
                        self.replay_buffer.sample_batch(Config.TRAINING_MIN_BATCH_SIZE)
                    self.server.replay_q.put((x__, r__, a__, x2__, done__))

            # The replay memory stores experiences individually.
            x_, r_, a_, x2_, done_ = self.server.training_q.get()
            for i in range(x_.shape[0]):
                self.replay_buffer.add(x_[i], a_[i], r_[i], done_[i], x2_[i])
            self.update_stats()

        # Cleaning up once the thread is told to exit.
        self.replay_buffer.clear()
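# Sketch (not the project's implementation): the ReplayBuffer interface assumed by the
# snippets in this section -- add / sample_batch / size / clear. The argument order
# follows the (state, action, reward, terminal, next_state) convention used by most
# call sites; the ThreadReplay snippet above unpacks sample_batch in a different order,
# so treat the return layout here as an assumption.
import random
from collections import deque

import numpy as np


class ReplayBufferSketch(object):
    def __init__(self, buffer_size, random_seed=123):
        self.buffer = deque(maxlen=buffer_size)
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        self.buffer.append((s, a, r, t, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        batch = random.sample(list(self.buffer), min(batch_size, len(self.buffer)))
        s, a, r, t, s2 = map(np.array, zip(*batch))
        return s, a, r, t, s2

    def clear(self):
        self.buffer.clear()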
def train(sess, env, args, actor, critic):
    summary_ops, summary_vars = build_summaries()
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(args['summary_dir'] + " actor_lr" + str(args['actor_lr']) +
                                   " critic_lr" + str(args["critic_lr"]), sess.graph)

    actor.update_target_network()
    critic.update_target_network()

    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    for i in range(int(args['max_episodes'])):
        state = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):
            action = actor.predict([state])[0]
            state2, reward, done, info = env.step(action)
            reward = np.sum(reward) / NUM_AGENTS
            replay_buffer.add(state, action, reward, done, state2)

            if replay_buffer.size() > int(args['minibatch_size']):
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                # TODO: Calculate targets
                # target_q = critic.predict_target(
                #     s2_batch, actor.predict_target(s2_batch))
                target_q = tf.zeros((1))

                # Update the critic given the targets
                predicted_q_value, _, loss = critic.train(
                    s_batch, a_batch, np.reshape(r_batch, (int(args['minibatch_size']), 1)))
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                actor.update_target_network()
                critic.update_target_network()

                replay_buffer.clear()

                # Log
                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: np.mean(r_batch),
                    summary_vars[1]: ep_ave_max_q / float(j + 1),
                    summary_vars[2]: loss
                })
                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: {:.4f} | Episode: {:d} | Qmax: {:.4f}'.format(
                    np.mean(r_batch), i, (ep_ave_max_q / float(j + 1))))

            state = state2
            ep_reward += reward

            if done:
                break
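# Sketch of the critic target computation left as a TODO in the train() function above.
# In standard DDPG the targets are y = r + gamma * (1 - done) * Q'(s2, mu'(s2)), using the
# target networks already referenced in the commented-out lines. The 'gamma' default and
# the target shape expected by critic.train depend on the surrounding project; this is an
# illustration, not the original code.
import numpy as np


def ddpg_targets(r_batch, t_batch, target_q, gamma=0.99):
    """Bellman targets for the DDPG critic update."""
    r = np.reshape(r_batch, (-1, 1)).astype(np.float32)
    done = np.reshape(t_batch, (-1, 1)).astype(np.float32)
    q = np.reshape(target_q, (-1, 1)).astype(np.float32)
    return r + gamma * (1.0 - done) * q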
class DDPG(object):
    def __init__(self, state_dim, action_dim, action_bounds, gamma=0.99, sess=None):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.action_mean = (action_bounds[0] + action_bounds[1]) * 0.5
        self.action_scale = (action_bounds[1] - action_bounds[0]) * 0.5
        self.batch_size = 5
        self.replay_buffer = ReplayBuffer(1000000, state_dim=state_dim, action_dim=action_dim)

        if sess is None:
            self.sess = tf.InteractiveSession()
        else:
            self.sess = sess

        self.actor = ActorModel(state_dim, action_dim, self.action_mean,
                                self.action_scale, self.sess)
        self.critic = CriticModel(state_dim, action_dim, self.sess)

        self.reset_policy()

        writer = tf.summary.FileWriter('logs', self.sess.graph)
        writer.close()

    def reset_policy(self):
        tf.global_variables_initializer().run()
        self.actor.reset_target_model()
        self.critic.reset_target_model()
        self.train_idx = 0
        self.replay_buffer.clear()

    def curr_policy(self):
        return self.actor.get_action

    def save_model(self, filename='/tmp/model.ckpt'):
        saver = tf.train.Saver()
        save_path = saver.save(self.sess, filename)
        print("Model saved in file: %s" % filename)

    def load_model(self, filename='/tmp/model.ckpt'):
        saver = tf.train.Saver()
        saver.restore(self.sess, filename)
        print("Model loaded from file: %s" % filename)

    def update(self, env, get_state, max_iter=1000):
        state = env.reset()
        total_reward = 0
        rand_process = OrnsteinUhlenbeckProcess(dt=1.0, theta=0.15, sigma=0.2,
                                                mu=np.zeros(self.action_dim),
                                                x0=np.zeros(self.action_dim))

        for i in range(max_iter):
            # Get the action and perturb it with exploration noise.
            action = self.actor.get_action(state)
            action_noise = rand_process.get_next()
            action += action_noise
            action = np.clip(action, self.action_mean - self.action_scale,
                             self.action_mean + self.action_scale)
            # action = np.array([action.squeeze()])

            [new_state, reward, done, _] = env.step(action)
            new_state = np.reshape(new_state, (1, self.state_dim))
            self.replay_buffer.insert(state, action, reward, new_state, done)
            total_reward += reward
            state = new_state

            if self.train_idx >= (self.batch_size * 3):
                sample = self.replay_buffer.sample(self.batch_size)

                # Get target actions and Q-values from the target networks.
                target_actions = self.actor.get_target_action(sample['next_state'])
                target_q_vals = self.critic.get_target_q_val(sample['next_state'],
                                                             target_actions)
                disc_return = sample['reward'] + \
                    self.gamma * target_q_vals.squeeze() * (1.0 - sample['terminal'])

                # Update the critic network.
                loss = self.critic.train(sample['state'], sample['action'], disc_return)

                # Get action gradients from the critic network and update the actor.
                action_grads = self.critic.get_action_grads(sample['state'],
                                                            sample['action'])[0]
                self.actor.train(sample['state'], action_grads)

                # Update the target networks.
                self.actor.update_target_model()
                self.critic.update_target_model()

            if done:
                break
            self.train_idx += 1

        return total_reward
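# Sketch of the OrnsteinUhlenbeckProcess helper assumed by DDPG.update() above
# (constructor arguments dt, theta, sigma, mu, x0 and a get_next() method). This is an
# assumed implementation of the usual OU exploration noise, not the project's own class.
import numpy as np


class OrnsteinUhlenbeckProcessSketch(object):
    def __init__(self, dt=1.0, theta=0.15, sigma=0.2, mu=None, x0=None):
        self.dt = dt
        self.theta = theta
        self.sigma = sigma
        self.mu = np.zeros(1) if mu is None else np.asarray(mu, dtype=np.float64)
        self.x = np.copy(self.mu) if x0 is None else np.array(x0, dtype=np.float64)

    def get_next(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I)
        dx = self.theta * (self.mu - self.x) * self.dt + \
             self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x = self.x + dx
        return self.x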
class ActorCritic(object):
    def __init__(self, env, device):
        self.s_dim = env.s_dim
        self.a_dim = env.a_dim
        self.env_epi_length = env.nT
        self.device = device

        self.replay_buffer = ReplayBuffer(env, device,
                                          buffer_size=self.env_epi_length,
                                          batch_size=self.env_epi_length)
        self.initial_ctrl = InitialControl(env, device)

        self.actor_rbfnet = rbf.RBF(self.s_dim, ACTOR_BASIS_NUMBERS, BASIS_FCNS)    # (T, S) --> (T, F)
        self.actor_f_dim = ACTOR_BASIS_NUMBERS
        # Use CRITIC_BASIS_NUMBERS so the critic feature size matches self.critic_f_dim.
        self.critic_rbfnet = rbf.RBF(self.s_dim, CRITIC_BASIS_NUMBERS, BASIS_FCNS)  # (T, S) --> (T, F)
        self.critic_f_dim = CRITIC_BASIS_NUMBERS

        self.actor_mu = np.zeros([self.actor_f_dim, self.a_dim])     # (F, A)
        self.actor_sigma = np.zeros([self.actor_f_dim, self.a_dim])  # (F, A)
        self.critic_theta = np.zeros([self.critic_f_dim, 1])         # (F, 1)

    def ctrl(self, epi, step, x, u):
        if epi < INITIAL_POLICY_INDEX:
            a_val = self.initial_ctrl.controller(step, x, u)
        else:
            a_val = self.choose_action(epi, step, x)
        a_val = np.clip(a_val, -2, 2)
        return a_val

    def choose_action(self, epi, step, x):
        if step == 0:
            actor_phi = self.actor_rbfnet.eval_basis(x)  # (1, F)
            actor_mean = self.compute_actor_mean(actor_phi)
            actor_var = self.compute_actor_var(actor_phi)
            self.action_traj = np.random.multivariate_normal(
                actor_mean[0], actor_var,
                [self.env_epi_length]).reshape([-1, self.a_dim])  # (T, A)
        action = self.action_traj[step, :].reshape(1, -1)  # (1, A)
        return action

    def add_experience(self, epi, *single_expr):
        x, u, r, x2, term = single_expr
        self.replay_buffer.add(*[x, u, r, x2, term])
        if term:
            # On-policy method: train, then clear the buffer when the episode ends.
            self.train(epi)
            self.replay_buffer.clear()

    def learning_rate_schedule(self, epi):
        self.alpha_amu = LEARNING_RATE / (1 + epi**0.5)
        self.alpha_asig = LEARNING_RATE / (1 + epi**0.5)
        self.alpha_c = LEARNING_RATE / (1 + epi**0.5)

    def compute_actor_mean(self, actor_phi):
        actor_mean = actor_phi @ self.actor_mu  # (1, F) @ (F, A)
        return actor_mean
        # return np.clip(actor_mean, -1, 1)

    def compute_actor_var(self, actor_phi):
        actor_var = SIGMA * np.diag((np.exp(actor_phi @ self.actor_sigma)**2 + 1E-4)[0])  # (1, F) @ (F, A) --> (A, A)
        return actor_var
        # return np.clip(actor_var, -1, 1)

    def train(self, epi):
        self.learning_rate_schedule(epi)
        s_traj, a_traj, r_traj, s2_traj, term_traj = self.replay_buffer.sample_sequence()  # T-step sequence
        traj_data = list(zip(s_traj, a_traj, r_traj, s2_traj))

        del_actor_mu_sum = 0.
        del_actor_sigma_sum = 0.
        del_critic_weight_sum = 0.
        epi_cost = 0.
        for single_data in reversed(traj_data):
            del_critic_weight, td, mc, epi_cost = self.compute_critic_grad(single_data, epi_cost)
            del_actor_mu, del_actor_sigma = self.compute_actor_grad(single_data)
            del_actor_mu_sum += del_actor_mu
            del_actor_sigma_sum += del_actor_sigma
            del_critic_weight_sum += del_critic_weight

        del_actor_weight_sum = np.concatenate([del_actor_mu_sum, del_actor_sigma_sum], axis=0)

        # Critic update
        self.critic_theta -= self.alpha_c * del_critic_weight_sum

        # Actor update - Natural policy gradient
        # fisher = del_actor_weight_sum @ del_actor_weight_sum.T
        # try:
        #     fisher_chol = sp.linalg.cholesky(fisher + 1E-4 * np.eye(2 * self.actor_f_dim))
        #     del_actor_weight = sp.linalg.solve_triangular(
        #         fisher_chol,
        #         sp.linalg.solve_triangular(fisher_chol.T, del_actor_weight_sum, lower=True))  # [2F, A]
        # except np.linalg.LinAlgError:
        #     del_actor_weight = np.linalg.inv(fisher + 1E-2 * np.eye(2 * self.actor_f_dim)) @ del_actor_weight_sum
        #
        # self.actor_mu -= self.alpha_amu * del_actor_weight[:self.actor_f_dim] * td
        # self.actor_sigma -= self.alpha_asig * del_actor_weight[self.actor_f_dim:] * td

        # Actor update - Advantage actor-critic, infinite horizon
        self.actor_mu -= self.alpha_amu * del_actor_mu * td
        self.actor_sigma -= self.alpha_asig * del_actor_sigma * td

        # Actor update - REINFORCE
        # self.actor_mu -= self.alpha_amu * del_actor_mu_sum * mc
        # self.actor_sigma -= self.alpha_asig * del_actor_sigma_sum * mc

        self.actor_mu = np.clip(self.actor_mu, -10, 10)
        self.actor_sigma = np.clip(self.actor_sigma, -10, 10)
        self.critic_theta = np.clip(self.critic_theta, -10, 10)

        print(np.linalg.norm(self.actor_mu), np.linalg.norm(self.actor_sigma),
              np.linalg.norm(self.critic_theta))

    def compute_critic_grad(self, single_data, epi_cost):
        x, u, r, x2 = [_.reshape([1, -1]) for _ in single_data]
        critic_phi = self.critic_rbfnet.eval_basis(x)        # (1, F)
        critic_phi_next = self.critic_rbfnet.eval_basis(x2)  # (1, F)

        V_curr = np.clip(critic_phi @ self.critic_theta, 0., 5.)
        V_next = np.clip(critic_phi_next @ self.critic_theta, 0., 5.)
        td = r + GAMMA * V_next - V_curr                     # (1, 1)
        del_critic_weight = (-critic_phi).T @ td             # (F, 1)

        epi_cost = GAMMA * epi_cost + r
        mc = epi_cost - V_curr
        return del_critic_weight, td, mc, epi_cost

    def compute_actor_grad(self, single_data):
        x, u, r, x2 = [_.reshape([1, -1]) for _ in single_data]
        actor_phi = self.actor_rbfnet.eval_basis(x)                        # (1, F)
        eps = u - self.compute_actor_mean(actor_phi)                       # (1, A)
        actor_var_inv = np.linalg.inv(self.compute_actor_var(actor_phi))   # (A, A)

        dlogpi_dmu = actor_phi.T @ eps @ actor_var_inv  # (F, 1) @ (1, A) @ (A, A)
        dlogpi_dsigma = SIGMA * np.repeat(actor_phi, self.a_dim, axis=0).T @ (
            eps.T @ eps @ actor_var_inv - np.eye(self.a_dim))  # (F, A) @ (A, A)
        return dlogpi_dmu, dlogpi_dsigma
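# Sketch of the rbf.RBF feature map assumed by the ActorCritic class above:
# eval_basis(x) maps a (1, S) state to a (1, F) feature row. The Gaussian basis,
# center placement, and width used here are assumptions for illustration, not the
# project's actual rbf module (whose BASIS_FCNS argument is unknown here).
import numpy as np


class RBFSketch(object):
    def __init__(self, s_dim, n_basis, basis_fcn='gaussian', seed=0):
        rng = np.random.RandomState(seed)
        self.centers = rng.uniform(-1.0, 1.0, size=(n_basis, s_dim))  # (F, S)
        self.width = 1.0
        self.basis_fcn = basis_fcn

    def eval_basis(self, x):
        x = np.reshape(x, (1, -1))                                              # (1, S)
        sq_dist = np.sum((x[:, None, :] - self.centers[None, :, :])**2, axis=-1)  # (1, F)
        return np.exp(-sq_dist / (2.0 * self.width**2))                         # (1, F)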
class DQN(object):
    def __init__(self, state_dim, num_actions, eps_anneal, gamma=0.99,
                 update_freq=100, sess=None):
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.gamma = gamma
        self.eps_anneal = eps_anneal
        self.update_freq = update_freq
        self.batch_size = 64
        self.replay_buffer = ReplayBuffer(3000, state_dim=state_dim, action_dim=1)

        self.__build_model()

        if sess is None:
            self.sess = tf.InteractiveSession()
        else:
            self.sess = sess

        self.reset_policy()

        writer = tf.summary.FileWriter('logs', self.sess.graph)
        writer.close()

    def reset_policy(self):
        tf.global_variables_initializer().run()
        self.train_idx = 0
        self.replay_buffer.clear()
        self.eps_anneal.reset()

    def __build_q_func(self, input_var, name, reuse=False):
        with tf.variable_scope(name, reuse=reuse) as scope:
            layer1 = tf.contrib.layers.fully_connected(input_var, 32,
                                                       activation_fn=tf.nn.relu,
                                                       scope='layer1')
            layer2 = tf.contrib.layers.fully_connected(layer1, 16,
                                                       activation_fn=tf.nn.relu,
                                                       scope='layer2')
            q_vals = tf.contrib.layers.fully_connected(layer2, self.num_actions,
                                                       activation_fn=None,
                                                       scope='q_vals')
        return q_vals

    def __build_model(self):
        # Forward model
        self.states = tf.placeholder(tf.float32, [None, self.state_dim], name='states')
        self.actions = tf.placeholder(tf.int32, [None], name='actions')
        self.action_q_vals = self.__build_q_func(self.states, name='action_q_func')
        self.output_actions = tf.argmax(self.action_q_vals, axis=1, name='output_actions')
        self.sampled_q_vals = tf.reduce_sum(
            tf.multiply(self.action_q_vals, tf.one_hot(self.actions, self.num_actions)),
            1, name='sampled_q_vals')
        self.target_q_vals = self.__build_q_func(self.states, name='target_q_func')
        self.max_q_vals = tf.reduce_max(self.target_q_vals, axis=1, name='max_q_vals')

        # Loss
        self.rewards = tf.placeholder(tf.float32, [None], name='rewards')
        self.terminal = tf.placeholder(tf.float32, [None], name='terminal')
        self.q_vals_next_state = tf.placeholder(tf.float32, [None], name='q_vals_next_state')
        self.terminal_mask = tf.subtract(1.0, self.terminal)
        self.disc_return = tf.add(
            self.rewards,
            tf.multiply(self.terminal_mask, tf.multiply(self.gamma, self.q_vals_next_state)),
            name='disc_return')
        self.td_error = tf.subtract(self.disc_return, self.sampled_q_vals, name='td_error')
        self.loss = tf.reduce_mean(tf.square(self.td_error), name='loss')
        self.optimizer = tf.train.RMSPropOptimizer(0.00025).minimize(self.loss)

        # Updating the target network
        var_sort_lambd = lambda x: x.name
        self.action_q_vars = sorted(
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='action_q_func'),
            key=var_sort_lambd)
        self.target_q_vars = sorted(
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target_q_func'),
            key=var_sort_lambd)
        update_target_ops = []
        for action_q, target_q in zip(self.action_q_vars, self.target_q_vars):
            update_target_ops.append(target_q.assign(action_q))
        self.update_target_ops = tf.group(*update_target_ops, name='update_target_ops')

    def __update_target_network(self):
        self.sess.run(self.update_target_ops)

    def get_action(self, state):
        # Epsilon-greedy action selection.
        sample = np.random.random_sample()
        if sample > self.eps_anneal.eps:
            fd = {self.states: np.array([state])}
            output_action = self.sess.run(self.output_actions, feed_dict=fd)
            action = np.asscalar(output_action)
        else:
            action = np.random.randint(self.num_actions)
        return action

    def curr_policy(self):
        return partial(DQN.get_action, self)

    def save_model(self, filename='/tmp/model.ckpt'):
        saver = tf.train.Saver()
        save_path = saver.save(self.sess, filename)
        print("Model saved in file: %s" % filename)

    def load_model(self, filename='/tmp/model.ckpt'):
        saver = tf.train.Saver()
        saver.restore(self.sess, filename)
        print("Model loaded from file: %s" % filename)

    def update(self, env, get_state, max_iter=1000):
        state = env.reset()
        action = self.get_action(state)
        total_reward = 0

        for i in range(max_iter):
            [new_state, reward, done, _] = env.step(action)
            total_reward += reward
            self.replay_buffer.insert(state, action, reward, new_state, done)
            state = new_state

            if self.train_idx >= self.batch_size:
                sample = self.replay_buffer.sample(self.batch_size)

                # Max Q-values of the next states from the target network.
                fd = {self.states: sample['next_state']}
                max_q_vals = self.sess.run(self.max_q_vals, feed_dict=fd)

                fd = {
                    self.states: sample['state'],
                    self.actions: sample['action'].squeeze(),
                    self.rewards: sample['reward'],
                    self.terminal: sample['terminal'],
                    self.q_vals_next_state: max_q_vals
                }
                loss, _ = self.sess.run([self.loss, self.optimizer], feed_dict=fd)

                if self.train_idx % self.update_freq == 0:
                    self.__update_target_network()

            if done:
                break
            action = self.get_action(state)
            self.train_idx += 1
            self.eps_anneal.update()

        return total_reward
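# Sketch of the eps_anneal helper assumed by the DQN class above (attributes/methods
# used: .eps, .update(), .reset()). A simple linear annealing schedule is assumed here;
# the project's own schedule and parameter names may differ.
class LinearEpsilonAnnealSketch(object):
    def __init__(self, eps_start=1.0, eps_end=0.05, anneal_steps=10000):
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.step_size = (eps_start - eps_end) / float(anneal_steps)
        self.eps = eps_start

    def update(self):
        # Decay epsilon linearly until it reaches eps_end.
        self.eps = max(self.eps_end, self.eps - self.step_size)

    def reset(self):
        self.eps = self.eps_start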
def train(sess, args, actor, critic):
    plt.ion()  # enable interactive mode
    speedmode = 6
    madr = 1.4
    gapvector = [0] * 16
    totalreward = []
    le = 10000

    options = get_options()
    if options.nogui:
        sumoBinary = checkBinary('sumo')
    else:
        sumoBinary = checkBinary('sumo-gui')

    leading = []

    summary_ops, summary_vars = build_summaries()
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(
        args['summary_dir'] + " actor_lr" + str(args['actor_lr']) +
        " critic_lr" + str(args["critic_lr"]), sess.graph)

    actor.update_target_network()
    critic.update_target_network()

    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    for i in range(1200):
        # print(i)
        zongreward = 0
        locationplot = []
        speedplot = []
        timeplot = []

        traci.start([sumoBinary, "-c", "hello.sumocfg"])
        # print('shenme')
        locationplot = []
        speedplot = []
        timeplot = []
        done = 0
        chusudu = 14

        # Build the leader speed-change profile; use k so the episode index i is not overwritten.
        for k in range(0, 40):
            leading.append(0)
        for k in range(40, 70):
            leading.append(-1)
        for k in range(70, 200):
            leading.append(1)

        for step in range(100):
            exist_list = traci.vehicle.getIDList()
            if len(exist_list) > 0:
                traci.vehicle.setSpeed(exist_list[0], chusudu)
            traci.simulationStep()

        gapvector = [2 * chusudu] * 16
        # print(gapvector)
        traci.vehicle.moveTo('a', 'L4_0', le)
        traci.vehicle.moveTo('b.0', 'L4_0', le - gapvector[0])
        traci.vehicle.moveTo('b.1', 'L4_0', le - sum(gapvector[:2]))
        traci.vehicle.moveTo('b.2', 'L4_0', le - sum(gapvector[:3]))
        traci.vehicle.moveTo('b.3', 'L4_0', le - sum(gapvector[:4]))
        traci.vehicle.moveTo('b.4', 'L4_0', le - sum(gapvector[:5]))
        traci.vehicle.moveTo('b.5', 'L4_0', le - sum(gapvector[:6]))
        traci.vehicle.moveTo('b.6', 'L4_0', le - sum(gapvector[:7]))
        traci.vehicle.moveTo('b.7', 'L4_0', le - sum(gapvector[:8]))
        traci.vehicle.moveTo('c.0', 'L4_0', le - sum(gapvector[:9]))
        traci.vehicle.moveTo('c.1', 'L4_0', le - sum(gapvector[:10]))
        traci.vehicle.moveTo('c.2', 'L4_0', le - sum(gapvector[:11]))
        traci.vehicle.moveTo('c.3', 'L4_0', le - sum(gapvector[:12]))
        traci.vehicle.moveTo('c.4', 'L4_0', le - sum(gapvector[:13]))
        traci.vehicle.moveTo('c.5', 'L4_0', le - sum(gapvector[:14]))
        traci.vehicle.moveTo('c.6', 'L4_0', le - sum(gapvector[:15]))
        traci.vehicle.moveTo('c.7', 'L4_0', le - sum(gapvector[:16]))
        traci.simulationStep()

        chushiweizhi = []
        exist_list = traci.vehicle.getIDList()
        for xx in exist_list:
            chushiweizhi.append(traci.vehicle.getPosition(xx)[0])

        touche = leading
        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):
            # pjz = 0
            initialsp = []
            state2 = []
            state = []
            reward = []
            # print()
            xiayimiaosudu = np.clip(
                traci.vehicle.getSpeed(exist_list[0]) + touche[j], 0, chusudu)
            traci.vehicle.setSpeed(exist_list[0], xiayimiaosudu)

            for xx in exist_list:
                traci.vehicle.setSpeedMode(xx, speedmode)
                initialsp.append(traci.vehicle.getSpeed(xx))
                locationplot.append(traci.vehicle.getPosition(xx)[0] / 1000)
                speedplot.append(traci.vehicle.getSpeed(xx))
                timeplot.append(j)

            for mm in range(1, NUM_AGENTS + 1):
                # touchea = exist_list[0]
                ziji = exist_list[mm]
                qianche = exist_list[mm - 1]
                gap = traci.vehicle.getLeader(ziji)[1]
                zhuangtai1 = (traci.vehicle.getSpeed(qianche) - traci.vehicle.getSpeed(ziji)) / 10
                zhuangtai2 = (traci.vehicle.getSpeed(ziji) - 16) / 16
                zhuangtai3 = (math.sqrt(max(gap, 0)) - 20) / 20
                state.append([zhuangtai1, zhuangtai2, zhuangtai3])

            action = actor.predict([state])[0]

            chaoguo = [0] * NUM_AGENTS
            for mm in range(1, NUM_AGENTS + 1):
                ziji = exist_list[mm]
                qianche = exist_list[mm - 1]
                zijisudu = traci.vehicle.getSpeed(ziji)
                qianchesudu = traci.vehicle.getSpeed(qianche)
                gapa = traci.vehicle.getLeader(ziji)[1]
                if qianchesudu - 3 < zijisudu:
                    gap = gapa - 5 - zijisudu + max(qianchesudu - 3, 0)
                    if gap < 0:
                        amax = -3
                        # print(gap)
                    else:
                        # amax = math.sqrt(madr*gap) + sp[i] - sp[i+1] - 3
                        amax = min(gap / 3, math.sqrt(madr * gap)) + qianchesudu - zijisudu - 3
                        amax = np.clip(amax, -3, 3)
                else:
                    amax = 3
                # ac = np.clip(action[mm-1][0]/10, -3, 3)
                # if pjz == 0:
                #     ave = sum(action)/NUM_AGENTS
                #     pjz = 1
                ac = np.clip(action[mm - 1][0] / 10, -3, 3)
                # print(j, ave, action, ac)
                if ac > amax:
                    chaoguo[mm - 1] = 1
                    # print(action[mm-1][0])
                    # print(j, mm, ac, amax)
                nextspeed = traci.vehicle.getSpeed(exist_list[mm]) + min(amax, ac)
                # nextspeed = traci.vehicle.getSpeed(exist_list[mm]) + ac
                # print(action[mm-1][0])
                traci.vehicle.setSpeed(exist_list[mm], nextspeed)

            traci.simulationStep()

            # for k in range(NUM_AGENTS + 1):
            #     if k > 0 and (po[k] > po[k-1] - 5 or po[k] < -10000):
            #         chongtu[k-1] = 1
            chongtu = [0] * NUM_AGENTS
            # print(j)
            for mm in range(1, NUM_AGENTS + 1):
                ziji = exist_list[mm]
                qianche = exist_list[mm - 1]
                # print(traci.vehicle.getPosition(ziji)[0])
                if traci.vehicle.getPosition(ziji)[0] < -10000:
                    chongtu[mm - 1] = 1
                re = min((traci.vehicle.getAcceleration(ziji))**2 / 9, 1)
                # print(mm-1, traci.vehicle.getAcceleration(ziji), re)
                if chongtu[mm - 1] == 0:
                    gap = traci.vehicle.getLeader(ziji)[1]
                else:
                    gap = 0
                if gap > 100:
                    re += gap / 100
                # print(mm-1, gap, re)
                if chaoguo[mm - 1] == 1:
                    re += 1
                if chongtu[mm - 1] == 1:
                    re += 5
                # print('chaoguo')
                # print(mm-1, chaoguo[mm-1], re)
                reward.append([1 - re])

            done = True
            state2 = None
            replay_buffer.add(state, action, reward, done, state2)
            # print(reward)

            if replay_buffer.size() > int(args['minibatch_size']) or sum(chongtu) > 0:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))
                # print(j)
                # print(chongtu)
                if j % 33 == 32:
                    predicted_q_value, _, loss = critic.train(
                        s_batch, a_batch, np.reshape(r_batch, (32, NUM_AGENTS, 1)))
                else:
                    predicted_q_value, _, loss = critic.train(
                        s_batch, a_batch, np.reshape(r_batch, (j % 33 + 1, NUM_AGENTS, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads)

                actor.update_target_network()
                critic.update_target_network()
                # print('xunlianle')
                replay_buffer.clear()

                # Log
                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: np.mean(r_batch),
                    summary_vars[1]: ep_ave_max_q / float(j + 1),
                    summary_vars[2]: loss
                })
                writer.add_summary(summary_str, i)
                writer.flush()
                # print(j, reward, r_batch, np.mean(r_batch))

                state = []
                reward = []
                # print('| Reward: {:.4f} | Episode: {:d} | Qmax: {:.4f}'.format(np.mean(r_batch),
                #       i, (ep_ave_max_q / float(j + 1))))
                zongreward += np.mean(r_batch)
                print(j, action, chaoguo)

            if sum(chongtu) > 0:
                print(traci.vehicle.getIDCount())
                print('zhuangle22222222222222222222222222')
                replay_buffer.clear()
                traci.close()
                sys.stdout.flush()
                # bre = 1
                break

        replay_buffer.clear()
        traci.close()
        sys.stdout.flush()

        # print(ave)
        # if state2 != None:
        #     print(state, action, reward, state2)
        # print(totalreward, zongreward)
        print(j, zongreward / 9 - 1)
        if j > 180:
            totalreward.append(zongreward / 9 - 1)

        plt.ion()
        plt.figure(i * 2 - 1)
        plt.plot(np.arange(len(totalreward)), totalreward)
        plt.xlabel('Episode')
        plt.ylabel('Episode reward')
        plt.draw()
        plt.pause(1)
        plt.close()  # higher is better

        plt.ion()
        plt.figure(i * 2)
        plt.scatter(timeplot, locationplot, c=speedplot, s=10, alpha=0.3)
        plt.colorbar()
        plt.xlabel('Time (s)')
        plt.ylabel('Location (km)')
        plt.grid(True)
        plt.show()
    M8 = np.mat(totalreward)
    np.savetxt("M8.csv", M8, delimiter=',')
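# Sketch (not from the original code): the per-follower state features computed inside
# the SUMO loop above, factored into a hypothetical helper for readability. Speeds and
# the gap to the leading vehicle come from traci; the normalization constants (10, 16,
# 20) mirror the values used in the loop.
import math


def follower_state(leader_speed, own_speed, gap):
    """Normalized [relative speed, own speed, sqrt-gap] feature triple."""
    return [(leader_speed - own_speed) / 10.0,
            (own_speed - 16.0) / 16.0,
            (math.sqrt(max(gap, 0.0)) - 20.0) / 20.0]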