def main(): # env = envstandalone.GhostEvade() env = envstandalone.BallCatch() max_timesteps = 40000 learning_starts = 1000 buffer_size = 50000 # exploration_fraction=0.2 exploration_fraction = 0.4 exploration_final_eps = 0.02 print_freq = 10 gamma = .98 # target_network_update_freq=500 # target_network_update_freq=100 # target_network_update_freq=10 target_network_update_freq = 1 learning_alpha = 0.2 batch_size = 32 train_freq = 1 obsShape = (8, 8, 1) # deicticShape = (3,3,2) # deicticShape = (3,3,4) deicticShape = (4, 4, 2) # deicticShape = (4,4,4) # deicticShape = (8,8,2) # num_deictic_patches = 36 num_deictic_patches = 25 # num_deictic_patches = 1 # num_actions = 4 # num_actions = 3 num_actions = env.action_space.n episode_rewards = [0.0] num_cpu = 16 num_cascade = 5 # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # CNN version # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp( # convs=[(16,4,1)], convs=[(16, 3, 1)], # convs=[(16,2,1)], hiddens=[16], dueling=True) # MLP version # model = models.mlp([8, 16]) # model = models.mlp([16, 16]) # model = models.mlp([16, 32]) # model = models.mlp([16, 16]) # model = models.mlp([32, 32]) q_func = model lr = 0.001 def make_obs_ph(name): return U.BatchInput(obsShape, name=name) def make_obsDeic_ph(name): # CNN version return U.BatchInput(deicticShape, name=name) # # MLP version # return U.BatchInput([deicticShape[0]*deicticShape[1]*deicticShape[2]], name=name) def make_target_ph(name): # return U.BatchInput([num_actions], name=name) return U.BatchInput([num_cascade, num_actions], name=name) sess = U.make_session(num_cpu) sess.__enter__() getq = build_getq(make_obsDeic_ph=make_obsDeic_ph, q_func=q_func, num_actions=num_actions, num_cascade=num_cascade, scope="deepq", qscope="q_func") getqTarget = build_getq(make_obsDeic_ph=make_obsDeic_ph, q_func=q_func, num_actions=num_actions, num_cascade=num_cascade, scope="deepq", qscope="q_func_target") update_target = build_update_target(scope="deepq", qscope="q_func", qscopeTarget="q_func_target") targetTrain = build_targetTrain( make_obsDeic_ph=make_obsDeic_ph, make_target_ph=make_target_ph, q_func=q_func, num_actions=env.action_space.n, num_cascade=num_cascade, optimizer=tf.train.AdamOptimizer(learning_rate=lr), scope="deepq", qscope="q_func") getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph, deicticShape=deicticShape) # getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape) # Initialize the parameters and copy them to the target network. 
U.initialize() update_target() replay_buffer = ReplayBuffer(buffer_size) obs = env.reset() timerStart = time.time() for t in range(max_timesteps): obsDeictic = getDeic([obs]) # obsDeictic = getDeic([obs])[:,:,:,0:2] # CNN version qCurr = getq(np.array(obsDeictic)) # # MLP version # qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])) # select action qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly action = np.argmax(np.max(qCurrNoise[:, -1, :], 0)) # USE CASCADE # action = np.argmax(np.max(qCurrNoise[:,0,:],0)) # DO NOT USE CASCADE if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) # sample from replay buffer and train if t > learning_starts and t % train_freq == 0: # Sample from replay buffer obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) # Put observations in deictic form obses_t_deic = getDeic(obses_t) obses_tp1_deic = getDeic(obses_tp1) # obses_t_deic = getDeic(obses_t)[:,:,:,0:2] # obses_tp1_deic = getDeic(obses_tp1)[:,:,:,0:2] # Reshape everything to (1152,) form donesTiled = np.repeat(dones, num_deictic_patches) rewardsTiled = np.repeat(rewards, num_deictic_patches) actionsTiled = np.repeat(actions, num_deictic_patches) # Get curr, next values: CNN version qNextTarget = getqTarget(obses_tp1_deic) qNext = getq(obses_tp1_deic) qCurr = getq(obses_t_deic) # # Get curr, next values: MLP version # qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])) # qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])) # This version pairs a glimpse with the same glimpse on the next time step qNextmax = np.max(qNext[:, -1, :], 1) # standard # actionsNext = np.argmax(qNextTarget[:,-1,:],1) # double-q # qNextmax = qNext[range(num_deictic_patches*batch_size),-1,actionsNext] # # This version takes the max over all glimpses # qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions]) # qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches) # Compute Bellman estimate targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax # # Take min over targets in same group # obses_t_deic_reshape = np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]) # unique_deic, uniqueIdx, uniqueCounts= np.unique(obses_t_deic_reshape,return_inverse=True,return_counts=True,axis=0) # for i in range(np.shape(uniqueCounts)[0]): # targets[uniqueIdx==i] = np.min(targets[uniqueIdx==i]) qCurrTargets = np.copy(qCurr) # Copy into cascade with pruning. qCurrTargets[range(batch_size * num_deictic_patches), 0, actionsTiled] = targets for i in range(num_cascade - 1): mask = targets < qCurrTargets[range(batch_size * num_deictic_patches), i, actionsTiled] qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \ mask*targets + \ (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] # CNN version td_error_out, obses_deic_out, targets_out = targetTrain( obses_t_deic, qCurrTargets) # # MLP version # td_error_out, obses_deic_out, targets_out = targetTrain( # np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]), # qCurrTargets # ) # Update target network periodically. 
if t > learning_starts and t % target_network_update_freq == 0: update_target() # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs
def train(sess, env, args, actor, critic, actor_noise): def eval_reward(env, actor, max_episode_len, episode_i): #evaluate actor network without noise ep_num = 10 ep_reward = 0 done_count = 0 for i in range(ep_num): # s=env.reset_to_value(rad_unit*i) s = env.reset() for k in range(max_episode_len): a = actor.predict_target(np.reshape(s, (1, actor.s_dim))) s2, r, terminal = env.step(a[0]) ep_reward += r if terminal: done_count += 1 break s = s2 ep_reward //= ep_num done_rate = done_count / ep_num # print('Episodic Reward: %d, Elapsed time: %.4f' % (int(ep_reward),elapsed)) print('[eval]episode: %d,Episodic Reward: %d, done rate: %.2f' % (episode_i, ep_reward, done_rate)) return ep_reward, done_rate def save_reward(lst, done_lst, args): base_dir = args['rewards_dir'] time_stamp = time.strftime('%m%d-%H%M%S') base_dir = os.path.join(base_dir, time_stamp) if not os.path.exists(base_dir): os.makedirs(base_dir) save_file_name = os.path.join(base_dir, 'rwd.dat') file = open(save_file_name, 'wb') pickle.dump(lst, file, 1) save_file_name = os.path.join(base_dir, 'done.dat') file = open(save_file_name, 'wb') pickle.dump(max(done_lst), file, 1) # plt.plot(lst) # plt.title(time_stamp) # plt.xlabel('Episodes') # plt.ylabel('Average Reward') # plt.ylim([-300,0]) fig_name = os.path.join(base_dir, 'reward_fig.png') # plt.savefig(fig_name) print('Rewards sucessfully writed!') sess.run(tf.global_variables_initializer()) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed'])) reward_list = [] done_list = [] saver = tf.train.Saver() max_eval_rwd = -10000 for i in range(int(args['max_episodes'])): s = env.reset() ep_reward = 0 for j in range(int(args['max_episode_len'])): if args['render_env']: env.render() # Added exploration noise #a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i)) a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise() s2, r, terminal = env.step(a[0]) replay_buffer.add(np.reshape(s, (actor.s_dim, )), np.reshape(a, (actor.a_dim, )), r, terminal, np.reshape(s2, (actor.s_dim, ))) # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > int(args['minibatch_size']): s_batch, a_batch, r_batch, t_batch, s2_batch = \ replay_buffer.sample_batch(int(args['minibatch_size'])) # Calculate targets target_q = critic.predict_target( s2_batch, actor.predict_target(s2_batch)) y_i = [] for k in range(int(args['minibatch_size'])): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + critic.gamma * target_q[k]) # Update the critic given the targets predicted_q_value, _ = critic.train( s_batch, a_batch, np.reshape(y_i, (int(args['minibatch_size']), 1))) # Update the actor policy using the sampled gradient a_outs = actor.predict(s_batch) grads = critic.action_gradients(s_batch, a_outs) actor.train(s_batch, grads[0]) # Update target networks actor.update_target_network() critic.update_target_network() s = s2 ep_reward += r if terminal: break print('[train]episode %d, reward %d' % (i, ep_reward)) if (i + 1) % 10 == 0: eval_r, done_rate = eval_reward(env, actor, int(args['max_episode_len']), i) reward_list.append(eval_r), done_list.append(done_rate) if args['save_model']: if eval_r > max_eval_rwd and eval_r > 1000: actor.save_weights() critic.save_weights() save_reward(reward_list, done_list, args)
def train(sess, env, args, actor, critic, actor_noise, reward_result, agent): # Set up summary Ops summary_ops, summary_vars = build_summaries() sess.run(tf.global_variables_initializer()) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed'])) # Needed to enable BatchNorm. # This hurts the performance on Pendulum but could be useful # in other environments. tflearn.is_training(True) paths = list() for i in range(int(args['max_episodes'])): #Utilize GP from previous iteration while training current iteration if (agent.firstIter == 1): pass else: agent.GP_model_prev = agent.GP_model.copy() dynamics_gp.build_GP_model(agent) for el in range(5): obs, action, rewards, action_bar, action_BAR = [], [], [], [], [] s1 = env.reset() s = np.copy(s1) ep_reward = 0 ep_ave_max_q = 0 for j in range(int(args['max_episode_len'])): #env.render() # Added exploration noise #a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i)) a = actor.predict(np.reshape( s, (1, actor.s_dim))) + actor_noise() #Incorporate barrier function action_rl = a[0] #Utilize compensation barrier function if (agent.firstIter == 1): u_BAR_ = [0] #u_BAR_ = agent.bar_comp.get_action(s)[0] else: u_BAR_ = [0] #u_BAR_ = agent.bar_comp.get_action(s)[0] action_RL = action_rl + u_BAR_ t = 0.05 * j #Utilize safety barrier function if (agent.firstIter == 1): [f, g, x, std ] = dynamics_gp.get_GP_dynamics(agent, s, action_RL, t) else: [f, g, x, std] = dynamics_gp.get_GP_dynamics_prev( agent, s, action_RL, t) u_bar_ = cbf.control_barrier(agent, np.squeeze(s), action_RL, f, g, x, std) action_ = action_RL + u_bar_ s2, r, terminal = env.step(action_) #replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r, # terminal, np.reshape(s2, (actor.s_dim,))) replay_buffer.add(np.reshape(s, (actor.s_dim, )), np.reshape(action_, (actor.a_dim, )), r, terminal, np.reshape(s2, (actor.s_dim, ))) # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > int(args['minibatch_size']): s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch( int(args['minibatch_size'])) # Calculate targets target_q = critic.predict_target( s2_batch, actor.predict_target(s2_batch)) y_i = [] for k in range(int(args['minibatch_size'])): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + critic.gamma * target_q[k]) # Update the critic given the targets predicted_q_value, _ = critic.train( s_batch, a_batch, np.reshape(y_i, (int(args['minibatch_size']), 1))) ep_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient a_outs = actor.predict(s_batch) grads = critic.action_gradients(s_batch, a_outs) actor.train(s_batch, grads[0]) # Update target networks actor.update_target_network() critic.update_target_network() obs.append(s) rewards.append(r) action_bar.append(u_bar_) action_BAR.append(u_BAR_) action.append(action_) s = np.copy(s2) ep_reward += r if j == 80 - 1: #writer.add_summary(summary_str, i) #writer.flush() print( '| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format( int(ep_reward), i, (ep_ave_max_q / float(j)))) reward_result[i] = ep_reward path = { "Observation": np.concatenate(obs).reshape((80, 15)), "Action": np.concatenate(action), "Action_bar": np.concatenate(action_bar), "Action_BAR": np.concatenate(action_BAR), "Reward": np.asarray(rewards) } paths.append(path) break if el <= 
3: dynamics_gp.update_GP_dynamics(agent, path) agent.bar_comp.get_training_rollouts(paths) barr_loss = agent.bar_comp.train() agent.firstIter = 0 return [summary_ops, summary_vars, paths]
class Seq2Seq(object): def calc_running_avg_loss(self, loss, running_avg_loss, step, decay=0.99): """Calculate the running average loss via exponential decay. This is used to implement early stopping w.r.t. a more smooth loss curve than the raw loss curve. Args: loss: loss on the most recent eval step running_avg_loss: running_avg_loss so far summary_writer: FileWriter object to write for tensorboard step: training iteration step decay: rate of exponential decay, a float between 0 and 1. Larger is smoother. Returns: running_avg_loss: new running average loss """ if running_avg_loss == 0: # on the first iteration just take the loss running_avg_loss = loss else: running_avg_loss = running_avg_loss * decay + (1 - decay) * loss running_avg_loss = min(running_avg_loss, 12) # clip loss_sum = tf.Summary() tag_name = 'running_avg_loss/decay=%f' % (decay) loss_sum.value.add(tag=tag_name, simple_value=running_avg_loss) self.summary_writer.add_summary(loss_sum, step) tf.logging.info('running_avg_loss: %f', running_avg_loss) return running_avg_loss def restore_best_model(self): """Load bestmodel file from eval directory, add variables for adagrad, and save to train directory""" tf.logging.info("Restoring bestmodel for training...") # Initialize all vars in the model sess = tf.Session(config=util.get_config()) print("Initializing all variables...") sess.run(tf.initialize_all_variables()) # Restore the best model from eval dir saver = tf.train.Saver([v for v in tf.all_variables() if "Adagrad" not in v.name]) print("Restoring all non-adagrad variables from best model in eval dir...") curr_ckpt = util.load_ckpt(saver, sess, "eval") print("Restored %s." % curr_ckpt) # Save this model to train dir and quit new_model_name = curr_ckpt.split("/")[-1].replace("bestmodel", "model") new_fname = os.path.join(FLAGS.log_root, "train", new_model_name) print("Saving model to %s..." % (new_fname)) new_saver = tf.train.Saver() # this saver saves all variables that now exist, including Adagrad variables new_saver.save(sess, new_fname) print("Saved.") exit() def restore_best_eval_model(self): # load best evaluation loss so far best_loss = None best_step = None # goes through all event files and select the best loss achieved and return it event_files = sorted(glob('{}/eval/events*'.format(FLAGS.log_root))) for ef in event_files: try: for e in tf.train.summary_iterator(ef): for v in e.summary.value: step = e.step if 'running_avg_loss/decay' in v.tag: running_avg_loss = v.simple_value if best_loss is None or running_avg_loss < best_loss: best_loss = running_avg_loss best_step = step except: continue tf.logging.info('resotring best loss from the current logs: {}\tstep: {}'.format(best_loss, best_step)) return best_loss def convert_to_coverage_model(self): """Load non-coverage checkpoint, add initialized extra variables for coverage, and save as new checkpoint""" tf.logging.info("converting non-coverage model to coverage model..") # initialize an entire coverage model from scratch sess = tf.Session(config=util.get_config()) print("initializing everything...") sess.run(tf.global_variables_initializer()) # load all non-coverage weights from checkpoint saver = tf.train.Saver([v for v in tf.global_variables() if "coverage" not in v.name and "Adagrad" not in v.name]) print("restoring non-coverage variables...") curr_ckpt = util.load_ckpt(saver, sess) print("restored.") # save this model and quit new_fname = curr_ckpt + '_cov_init' print("saving model to %s..." 
% (new_fname)) new_saver = tf.train.Saver() # this one will save all variables that now exist new_saver.save(sess, new_fname) print("saved.") exit() def convert_to_reinforce_model(self): """Load non-reinforce checkpoint, add initialized extra variables for reinforce, and save as new checkpoint""" tf.logging.info("converting non-reinforce model to reinforce model..") # initialize an entire reinforce model from scratch sess = tf.Session(config=util.get_config()) print("initializing everything...") sess.run(tf.global_variables_initializer()) # load all non-reinforce weights from checkpoint saver = tf.train.Saver([v for v in tf.global_variables() if "reinforce" not in v.name and "Adagrad" not in v.name]) print("restoring non-reinforce variables...") curr_ckpt = util.load_ckpt(saver, sess) print("restored.") # save this model and quit new_fname = curr_ckpt + '_rl_init' print("saving model to %s..." % (new_fname)) new_saver = tf.train.Saver() # this one will save all variables that now exist new_saver.save(sess, new_fname) print("saved.") exit() def setup_training(self): """Does setup before starting training (run_training)""" train_dir = os.path.join(FLAGS.log_root, "train") if not os.path.exists(train_dir): os.makedirs(train_dir) if FLAGS.ac_training: dqn_train_dir = os.path.join(FLAGS.log_root, "dqn", "train") if not os.path.exists(dqn_train_dir): os.makedirs(dqn_train_dir) #replaybuffer_pcl_path = os.path.join(FLAGS.log_root, "replaybuffer.pcl") #if not os.path.exists(dqn_target_train_dir): os.makedirs(dqn_target_train_dir) self.model.build_graph() # build the graph if FLAGS.convert_to_reinforce_model: assert (FLAGS.rl_training or FLAGS.ac_training), "To convert your pointer model to a reinforce model, run with convert_to_reinforce_model=True and either rl_training=True or ac_training=True" self.convert_to_reinforce_model() if FLAGS.convert_to_coverage_model: assert FLAGS.coverage, "To convert your non-coverage model to a coverage model, run with convert_to_coverage_model=True and coverage=True" self.convert_to_coverage_model() if FLAGS.restore_best_model: self.restore_best_model() saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time # Loads pre-trained word-embedding. By default the model learns the embedding. 
if FLAGS.embedding: self.vocab.LoadWordEmbedding(FLAGS.embedding, FLAGS.emb_dim) word_vector = self.vocab.getWordEmbedding() self.sv = tf.train.Supervisor(logdir=train_dir, is_chief=True, saver=saver, summary_op=None, save_summaries_secs=60, # save summaries for tensorboard every 60 secs save_model_secs=60, # checkpoint every 60 secs global_step=self.model.global_step, init_feed_dict= {self.model.embedding_place:word_vector} if FLAGS.embedding else None ) self.summary_writer = self.sv.summary_writer self.sess = self.sv.prepare_or_wait_for_session(config=util.get_config()) if FLAGS.ac_training: tf.logging.info('DDQN building graph') t1 = time.time() # We create a separate graph for DDQN self.dqn_graph = tf.Graph() with self.dqn_graph.as_default(): self.dqn.build_graph() # build dqn graph tf.logging.info('building current network took {} seconds'.format(time.time()-t1)) self.dqn_target.build_graph() # build dqn target graph tf.logging.info('building target network took {} seconds'.format(time.time()-t1)) dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time self.dqn_sv = tf.train.Supervisor(logdir=dqn_train_dir, is_chief=True, saver=dqn_saver, summary_op=None, save_summaries_secs=60, # save summaries for tensorboard every 60 secs save_model_secs=60, # checkpoint every 60 secs global_step=self.dqn.global_step, ) self.dqn_summary_writer = self.dqn_sv.summary_writer self.dqn_sess = self.dqn_sv.prepare_or_wait_for_session(config=util.get_config()) ''' #### TODO: try loading a previously saved replay buffer # right now this doesn't work due to running DQN on a thread if os.path.exists(replaybuffer_pcl_path): tf.logging.info('Loading Replay Buffer...') try: self.replay_buffer = pickle.load(open(replaybuffer_pcl_path, "rb")) tf.logging.info('Replay Buffer loaded...') except: tf.logging.info('Couldn\'t load Replay Buffer file...') self.replay_buffer = ReplayBuffer(self.dqn_hps) else: self.replay_buffer = ReplayBuffer(self.dqn_hps) tf.logging.info("Building DDQN took {} seconds".format(time.time()-t1)) ''' self.replay_buffer = ReplayBuffer(self.dqn_hps) tf.logging.info("Preparing or waiting for session...") tf.logging.info("Created session.") try: self.run_training() # this is an infinite loop until interrupted except (KeyboardInterrupt, SystemExit): tf.logging.info("Caught keyboard interrupt on worker. 
Stopping supervisor...") self.sv.stop() if FLAGS.ac_training: self.dqn_sv.stop() def run_training(self): """Repeatedly runs training iterations, logging loss to screen and writing summaries""" tf.logging.info("Starting run_training") if FLAGS.debug: # start the tensorflow debugger self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess) self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan) self.train_step = 0 if FLAGS.ac_training: # DDQN training is done asynchronously along with model training tf.logging.info('Starting DQN training thread...') self.dqn_train_step = 0 self.thrd_dqn_training = Thread(target=self.dqn_training) self.thrd_dqn_training.daemon = True self.thrd_dqn_training.start() watcher = Thread(target=self.watch_threads) watcher.daemon = True watcher.start() # starting the main thread tf.logging.info('Starting Seq2Seq training...') while True: # repeats until interrupted batch = self.batcher.next_batch() t0=time.time() if FLAGS.ac_training: # For DDQN, we first collect the model output to calculate the reward and Q-estimates # Then we fix the estimation either using our target network or using the true Q-values # This process will usually take time and we are working on improving it. transitions = self.model.collect_dqn_transitions(self.sess, batch, self.train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps) tf.logging.info('Q-values collection time: {}'.format(time.time()-t0)) # whenever we are working with the DDQN, we switch using DDQN graph rather than default graph with self.dqn_graph.as_default(): batch_len = len(transitions) # we use current decoder state to predict q_estimates, use_state_prime = False b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = False, max_art_oovs = batch.max_art_oovs) # we also get the next decoder state to correct the estimation, use_state_prime = True b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs) # use current DQN to estimate values from current decoder state dqn_results = self.dqn.run_test_steps(sess=self.dqn_sess, x= b._x, return_best_action=True) q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size) dqn_best_action = dqn_results['best_action'] #dqn_q_estimate_loss = dqn_results['loss'] # use target DQN to estimate values for the next decoder state dqn_target_results = self.dqn_target.run_test_steps(self.dqn_sess, x= b_prime._x) q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size) # we need to expand the q_estimates to match the input batch max_art_oov # we use the q_estimate of UNK token for all the OOV tokens q_estimates = np.concatenate([q_estimates, np.reshape(q_estimates[:,0],[-1,1])*np.ones((len(transitions),batch.max_art_oovs))],axis=-1) # modify Q-estimates using the result collected from current and target DQN. 
# check algorithm 5 in the paper for more info: https://arxiv.org/pdf/1805.09461.pdf for i, tr in enumerate(transitions): if tr.done: q_estimates[i][tr.action] = tr.reward else: q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]] # use scheduled sampling to whether use true Q-values or DDQN estimation if FLAGS.dqn_scheduled_sampling: q_estimates = self.scheduled_sampling(batch_len, FLAGS.sampling_probability, b._y_extended, q_estimates) if not FLAGS.calculate_true_q: # when we are not training DDQN based on true Q-values, # we need to update Q-values in our transitions based on the q_estimates we collected from DQN current network. for trans, q_val in zip(transitions,q_estimates): trans.q_values = q_val # each have the size vocab_extended q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended) # Once we are done with modifying Q-values, we can use them to train the DDQN model. # In this paper, we use a priority experience buffer which always selects states with higher quality # to train the DDQN. The following line will add batch_size * max_dec_steps experiences to the replay buffer. # As mentioned before, the DDQN training is asynchronous. Therefore, once the related queues for DDQN training # are full, the DDQN will start the training. self.replay_buffer.add(transitions) # If dqn_pretrain flag is on, it means that we use a fixed Actor to only collect experiences for # DDQN pre-training if FLAGS.dqn_pretrain: tf.logging.info('RUNNNING DQN PRETRAIN: Adding data to relplay buffer only...') continue # if not, use the q_estimation to update the loss. results = self.model.run_train_steps(self.sess, batch, self.train_step, q_estimates) else: results = self.model.run_train_steps(self.sess, batch, self.train_step) t1=time.time() # get the summaries and iteration number so we can write summaries to tensorboard summaries = results['summaries'] # we will write these summaries to tensorboard using summary_writer self.train_step = results['global_step'] # we need this to update our running average loss tf.logging.info('seconds for training step {}: {}'.format(self.train_step, t1-t0)) printer_helper = {} printer_helper['pgen_loss']= results['pgen_loss'] if FLAGS.coverage: printer_helper['coverage_loss'] = results['coverage_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss'] else: printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['shared_loss'] = results['shared_loss'] printer_helper['rl_loss'] = results['rl_loss'] printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs'] if FLAGS.rl_training: printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values']) printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values']) printer_helper['r_diff'] = printer_helper['sampled_r'] - printer_helper['greedy_r'] if FLAGS.ac_training: printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss)>0 else 0 for (k,v) in printer_helper.items(): if not np.isfinite(v): raise Exception("{} is not finite. 
Stopping.".format(k)) tf.logging.info('{}: {}\t'.format(k,v)) tf.logging.info('-------------------------------------------') self.summary_writer.add_summary(summaries, self.train_step) # write the summaries if self.train_step % 100 == 0: # flush the summary writer every so often self.summary_writer.flush() if FLAGS.ac_training: self.dqn_summary_writer.flush() if self.train_step > FLAGS.max_iter: break def dqn_training(self): """ training the DDQN network.""" try: while True: if self.dqn_train_step == FLAGS.dqn_pretrain_steps: raise SystemExit() _t = time.time() self.avg_dqn_loss = [] avg_dqn_target_loss = [] # Get a batch of size dqn_batch_size from replay buffer to train the model dqn_batch = self.replay_buffer.next_batch() if dqn_batch is None: tf.logging.info('replay buffer not loaded enough yet...') time.sleep(60) continue # Run train step for Current DQN model and collect the results dqn_results = self.dqn.run_train_steps(self.dqn_sess, dqn_batch) # Run test step for Target DQN model and collect the results and monitor the difference in loss between the two dqn_target_results = self.dqn_target.run_test_steps(self.dqn_sess, x=dqn_batch._x, y=dqn_batch._y, return_loss=True) self.dqn_train_step = dqn_results['global_step'] self.dqn_summary_writer.add_summary(dqn_results['summaries'], self.dqn_train_step) # write the summaries self.avg_dqn_loss.append(dqn_results['loss']) avg_dqn_target_loss.append(dqn_target_results['loss']) self.dqn_train_step = self.dqn_train_step + 1 tf.logging.info('seconds for training dqn model: {}'.format(time.time()-_t)) # UPDATING TARGET DDQN NETWORK WITH CURRENT MODEL with self.dqn_graph.as_default(): current_model_weights = self.dqn_sess.run([self.dqn.model_trainables])[0] # get weights of current model self.dqn_target.run_update_weights(self.dqn_sess, self.dqn_train_step, current_model_weights) # update target model weights with current model weights tf.logging.info('DQN loss at step {}: {}'.format(self.dqn_train_step, np.mean(self.avg_dqn_loss))) tf.logging.info('DQN Target loss at step {}: {}'.format(self.dqn_train_step, np.mean(avg_dqn_target_loss))) # sleeping is required if you want the keyboard interuption to work time.sleep(FLAGS.dqn_sleep_time) except (KeyboardInterrupt, SystemExit): tf.logging.info("Caught keyboard interrupt on worker. Stopping supervisor...") self.sv.stop() self.dqn_sv.stop() def watch_threads(self): """Watch example queue and batch queue threads and restart if dead.""" while True: time.sleep(60) if not self.thrd_dqn_training.is_alive(): # if the thread is dead tf.logging.error('Found DQN Learning thread dead. Restarting.') self.thrd_dqn_training = Thread(target=self.dqn_training) self.thrd_dqn_training.daemon = True self.thrd_dqn_training.start() def run_eval(self): """Repeatedly runs eval iterations, logging to screen and writing summaries. 
Saves the model with the best loss seen so far.""" self.model.build_graph() # build the graph saver = tf.train.Saver(max_to_keep=3) # we will keep 3 best checkpoints at a time sess = tf.Session(config=util.get_config()) if FLAGS.embedding: sess.run(tf.global_variables_initializer(),feed_dict={self.model.embedding_place:self.word_vector}) eval_dir = os.path.join(FLAGS.log_root, "eval") # make a subdir of the root dir for eval data bestmodel_save_path = os.path.join(eval_dir, 'bestmodel') # this is where checkpoints of best models are saved self.summary_writer = tf.summary.FileWriter(eval_dir) if FLAGS.ac_training: tf.logging.info('DDQN building graph') t1 = time.time() dqn_graph = tf.Graph() with dqn_graph.as_default(): self.dqn.build_graph() # build dqn graph tf.logging.info('building current network took {} seconds'.format(time.time()-t1)) self.dqn_target.build_graph() # build dqn target graph tf.logging.info('building target network took {} seconds'.format(time.time()-t1)) dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time dqn_sess = tf.Session(config=util.get_config()) dqn_train_step = 0 replay_buffer = ReplayBuffer(self.dqn_hps) running_avg_loss = 0 # the eval job keeps a smoother, running average loss to tell it when to implement early stopping best_loss = self.restore_best_eval_model() # will hold the best loss achieved so far train_step = 0 while True: _ = util.load_ckpt(saver, sess) # load a new checkpoint if FLAGS.ac_training: _ = util.load_dqn_ckpt(dqn_saver, dqn_sess) # load a new checkpoint processed_batch = 0 avg_losses = [] # evaluate for 100 * batch_size before comparing the loss # we do this due to memory constraint, best to run eval on different machines with large batch size while processed_batch < 100*FLAGS.batch_size: processed_batch += FLAGS.batch_size batch = self.batcher.next_batch() # get the next batch if FLAGS.ac_training: t0 = time.time() transitions = self.model.collect_dqn_transitions(sess, batch, train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps) tf.logging.info('Q values collection time: {}'.format(time.time()-t0)) with dqn_graph.as_default(): # if using true Q-value to train DQN network, # we do this as the pre-training for the DQN network to get better estimates batch_len = len(transitions) b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs) b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs) dqn_results = self.dqn.run_test_steps(sess=dqn_sess, x= b._x, return_best_action=True) q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size) dqn_best_action = dqn_results['best_action'] tf.logging.info('running test step on dqn_target') dqn_target_results = self.dqn_target.run_test_steps(dqn_sess, x= b_prime._x) q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size) # we need to expand the q_estimates to match the input batch max_art_oov q_estimates = np.concatenate([q_estimates,np.zeros((len(transitions),batch.max_art_oovs))],axis=-1) tf.logging.info('fixing the action q-estimates') for i, tr in enumerate(transitions): if tr.done: q_estimates[i][tr.action] = tr.reward else: q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]] if FLAGS.dqn_scheduled_sampling: tf.logging.info('scheduled sampling on q-estimates') q_estimates = self.scheduled_sampling(batch_len, 
FLAGS.sampling_probability, b._y_extended, q_estimates) if not FLAGS.calculate_true_q: # when we are not training DQN based on true Q-values # we need to update Q-values in our transitions based on this q_estimates we collected from DQN current network. for trans, q_val in zip(transitions,q_estimates): trans.q_values = q_val # each have the size vocab_extended q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended) tf.logging.info('run eval step on seq2seq model.') t0=time.time() results = self.model.run_eval_step(sess, batch, train_step, q_estimates) t1=time.time() else: tf.logging.info('run eval step on seq2seq model.') t0=time.time() results = self.model.run_eval_step(sess, batch, train_step) t1=time.time() tf.logging.info('experiment: {}'.format(FLAGS.exp_name)) tf.logging.info('processed_batch: {}, seconds for batch: {}'.format(processed_batch, t1-t0)) printer_helper = {} loss = printer_helper['pgen_loss']= results['pgen_loss'] if FLAGS.coverage: printer_helper['coverage_loss'] = results['coverage_loss'] if FLAGS.rl_training or FLAGS.ac_training: loss = printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss'] else: loss = printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['shared_loss'] = results['shared_loss'] printer_helper['rl_loss'] = results['rl_loss'] printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs'] if FLAGS.rl_training: printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values']) printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values']) printer_helper['r_diff'] = printer_helper['sampled_r'] - printer_helper['greedy_r'] if FLAGS.ac_training: printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss) > 0 else 0 for (k,v) in printer_helper.items(): if not np.isfinite(v): raise Exception("{} is not finite. Stopping.".format(k)) tf.logging.info('{}: {}\t'.format(k,v)) # add summaries summaries = results['summaries'] train_step = results['global_step'] self.summary_writer.add_summary(summaries, train_step) # calculate running avg loss avg_losses.append(self.calc_running_avg_loss(np.asscalar(loss), running_avg_loss, train_step)) tf.logging.info('-------------------------------------------') running_avg_loss = np.mean(avg_losses) tf.logging.info('==========================================') tf.logging.info('best_loss: {}\trunning_avg_loss: {}\t'.format(best_loss, running_avg_loss)) tf.logging.info('==========================================') # If running_avg_loss is best so far, save this checkpoint (early stopping). # These checkpoints will appear as bestmodel-<iteration_number> in the eval dir if best_loss is None or running_avg_loss < best_loss: tf.logging.info('Found new best model with %.3f running_avg_loss. 
Saving to %s', running_avg_loss, bestmodel_save_path) saver.save(sess, bestmodel_save_path, global_step=train_step, latest_filename='checkpoint_best') best_loss = running_avg_loss # flush the summary writer every so often if train_step % 100 == 0: self.summary_writer.flush() #time.sleep(600) # run eval every 10 minute def main(self, unused_argv): if len(unused_argv) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) tf.logging.set_verbosity(tf.logging.INFO) # choose what level of logging you want tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary flags = getattr(FLAGS,"__flags") if not os.path.exists(FLAGS.log_root): if FLAGS.mode=="train": os.makedirs(FLAGS.log_root) fw = open('{}/config.txt'.format(FLAGS.log_root),'w') for k,v in flags.iteritems(): fw.write('{}\t{}\n'.format(k,v)) fw.close() else: raise Exception("Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root)) self.vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary # If in decode mode, set batch_size = beam_size # Reason: in decode mode, we decode one example at a time. # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses. if FLAGS.mode == 'decode': FLAGS.batch_size = FLAGS.beam_size # If single_pass=True, check we're in decode mode if FLAGS.single_pass and FLAGS.mode!='decode': raise Exception("The single_pass flag should only be True in decode mode") # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hparam_list = ['mode', 'lr', 'gpu_num', #'sampled_greedy_flag', 'gamma', 'eta', 'fixed_eta', 'reward_function', 'intradecoder', 'use_temporal_attention', 'ac_training','rl_training', 'matrix_attention', 'calculate_true_q', 'enc_hidden_dim', 'dec_hidden_dim', 'k', 'scheduled_sampling', 'sampling_probability','fixed_sampling_probability', 'alpha', 'hard_argmax', 'greedy_scheduled_sampling', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'dqn_scheduled_sampling', 'dqn_sleep_time', 'E2EBackProp', 'coverage', 'cov_loss_wt', 'pointer_gen'] hps_dict = {} for key,val in flags.iteritems(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val # add it to the dict if FLAGS.ac_training: hps_dict.update({'dqn_input_feature_len':(FLAGS.dec_hidden_dim)}) self.hps = namedtuple("HParams", hps_dict.keys())(**hps_dict) # creating all the required parameters for DDQN model. 
if FLAGS.ac_training: hparam_list = ['lr', 'dqn_gpu_num', 'dqn_layers', 'dqn_replay_buffer_size', 'dqn_batch_size', 'dqn_target_update', 'dueling_net', 'dqn_polyak_averaging', 'dqn_sleep_time', 'dqn_scheduled_sampling', 'max_grad_norm'] hps_dict = {} for key,val in flags.iteritems(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val # add it to the dict hps_dict.update({'dqn_input_feature_len':(FLAGS.dec_hidden_dim)}) hps_dict.update({'vocab_size':self.vocab.size()}) self.dqn_hps = namedtuple("HParams", hps_dict.keys())(**hps_dict) # Create a batcher object that will create minibatches of data self.batcher = Batcher(FLAGS.data_path, self.vocab, self.hps, single_pass=FLAGS.single_pass, decode_after=FLAGS.decode_after) tf.set_random_seed(111) # a seed value for randomness if self.hps.mode == 'train': print("creating model...") self.model = SummarizationModel(self.hps, self.vocab) if FLAGS.ac_training: # current DQN with paramters \Psi self.dqn = DQN(self.dqn_hps,'current') # target DQN with paramters \Psi^{\prime} self.dqn_target = DQN(self.dqn_hps,'target') self.setup_training() elif self.hps.mode == 'eval': self.model = SummarizationModel(self.hps, self.vocab) if FLAGS.ac_training: self.dqn = DQN(self.dqn_hps,'current') self.dqn_target = DQN(self.dqn_hps,'target') self.run_eval() elif self.hps.mode == 'decode': decode_model_hps = self.hps # This will be the hyperparameters for the decoder model decode_model_hps = self.hps._replace(max_dec_steps=1) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries model = SummarizationModel(decode_model_hps, self.vocab) if FLAGS.ac_training: # We need our target DDQN network for collecting Q-estimation at each decoder step. dqn_target = DQN(self.dqn_hps,'target') else: dqn_target = None decoder = BeamSearchDecoder(model, self.batcher, self.vocab, dqn = dqn_target) decoder.decode() # decode indefinitely (unless single_pass=True, in which case deocde the dataset exactly once) else: raise ValueError("The 'mode' flag must be one of train/eval/decode") # Scheduled sampling used for either selecting true Q-estimates or the DDQN estimation # based on https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/ScheduledEmbeddingTrainingHelper def scheduled_sampling(self, batch_size, sampling_probability, true, estimate): with variable_scope.variable_scope("ScheduledEmbedding"): # Return -1s where we do not sample, and sample_ids elsewhere select_sampler = bernoulli.Bernoulli(probs=sampling_probability, dtype=tf.bool) select_sample = select_sampler.sample(sample_shape=batch_size) sample_ids = array_ops.where( select_sample, tf.range(batch_size), gen_array_ops.fill([batch_size], -1)) where_sampling = math_ops.cast( array_ops.where(sample_ids > -1), tf.int32) where_not_sampling = math_ops.cast( array_ops.where(sample_ids <= -1), tf.int32) _estimate = array_ops.gather_nd(estimate, where_sampling) _true = array_ops.gather_nd(true, where_not_sampling) base_shape = array_ops.shape(true) result1 = array_ops.scatter_nd(indices=where_sampling, updates=_estimate, shape=base_shape) result2 = array_ops.scatter_nd(indices=where_not_sampling, updates=_true, shape=base_shape) result = result1 + result2 return result1 + result2
def train(sess, env, actor, global_step): # Set up summary Ops summary_ops, summary_vars = build_summaries() sess.run(tf.global_variables_initializer()) # load model if have saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(SUMMARY_DIR) if checkpoint and checkpoint.model_checkpoint_path: saver.restore(sess, checkpoint.model_checkpoint_path) print("Successfully loaded:", checkpoint.model_checkpoint_path) print("global step: ", global_step.eval()) else: print("Could not find old network weights") writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph) # Initialize replay memory replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED) i = global_step.eval() eval_acc_reward = 0 tic = time.time() eps = 1 while True: i += 1 s = env.reset() ep_ave_max_q = 0 eps *= EPS_DECAY_RATE eps = max(eps, EPS_MIN) episode_s, episode_acts, episode_rewards = [], [], [] if i % SAVE_STEP == 0: # save check point every 1000 episode sess.run(global_step.assign(i)) save_path = saver.save(sess, SUMMARY_DIR + "model.ckpt", global_step=global_step) print("Model saved in file: %s" % save_path) print("Successfully saved global step: ", global_step.eval()) for j in xrange(MAX_EP_STEPS): # print(s.shape) # Added exploration noise action = actor.predict(np.reshape(s, np.hstack((1, actor.s_dim)))) # print action s2, r, terminal, info = env.step(action) # plt.imshow(s2, interpolation='none') # plt.show() episode_s.append(s) episode_acts.append(action) episode_rewards.append(r) s = s2 eval_acc_reward += r if terminal: # stack together all inputs, hidden states, action gradients, and rewards for this episode episode_rewards = np.asarray(episode_rewards) # print('episode_rewards', episode_rewards) episode_rewards = discount_rewards(episode_rewards) # print('after', episode_rewards) # update buffer for n in range(len(episode_rewards)): replay_buffer.add(np.reshape(episode_s[n], (actor.s_dim)), episode_acts[n], episode_rewards[n], terminal, np.reshape(episode_s[n], (actor.s_dim))) # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > MINIBATCH_SIZE: s_batch, a_batch, r_batch, t_batch, _ = replay_buffer.sample_batch( MINIBATCH_SIZE) # Update the actor policy using the sampled gradient actor.train(s_batch, a_batch, r_batch) # print '| Reward: %.2i' % int(ep_reward), " | Episode", i, \ # '| Qmax: %.4f' % (ep_ave_max_q / float(j+1)) if i % EVAL_EPISODES == 0: # summary time_gap = time.time() - tic summary_str = sess.run( summary_ops, feed_dict={ summary_vars[0]: (eval_acc_reward + EVAL_EPISODES) / 2, }) writer.add_summary(summary_str, i) writer.flush() print ('| Success: %i %%' % ((eval_acc_reward+EVAL_EPISODES)/2), "| Episode", i, \ ' | Time: %.2f' %(time_gap), ' | Eps: %.2f' %(eps)) tic = time.time() # print(' 100 round reward: ', eval_acc_reward) eval_acc_reward = 0 break
def train(sess, args, actor, critic): plt.ion() #开启interactive mode speedmode = 6 madr = 1.4 gapvector = [0] * 16 totalreward = [] le = 10000 options = get_options() if options.nogui: sumoBinary = checkBinary('sumo') else: sumoBinary = checkBinary('sumo-gui') leading = [] summary_ops, summary_vars = build_summaries() sess.run(tf.global_variables_initializer()) writer = tf.summary.FileWriter( args['summary_dir'] + " actor_lr" + str(args['actor_lr']) + " critic_lr" + str(args["critic_lr"]), sess.graph) actor.update_target_network() critic.update_target_network() replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed'])) for i in range(1200): # print(i) zongreward = 0 locationplot = [] speedplot = [] timeplot = [] traci.start([sumoBinary, "-c", "hello.sumocfg"]) # print('shenme') locationplot = [] speedplot = [] timeplot = [] done = 0 chusudu = 14 for i in range(0, 40): leading.append(0) for i in range(40, 70): leading.append(-1) for i in range(70, 200): leading.append(1) for step in range(100): exist_list = traci.vehicle.getIDList() if len(exist_list) > 0: traci.vehicle.setSpeed(exist_list[0], chusudu) traci.simulationStep() gapvector = [2 * chusudu] * 16 # print(gapvector) traci.vehicle.moveTo('a', 'L4_0', le) traci.vehicle.moveTo('b.0', 'L4_0', le - gapvector[0]) traci.vehicle.moveTo('b.1', 'L4_0', le - sum(gapvector[:2])) traci.vehicle.moveTo('b.2', 'L4_0', le - sum(gapvector[:3])) traci.vehicle.moveTo('b.3', 'L4_0', le - sum(gapvector[:4])) traci.vehicle.moveTo('b.4', 'L4_0', le - sum(gapvector[:5])) traci.vehicle.moveTo('b.5', 'L4_0', le - sum(gapvector[:6])) traci.vehicle.moveTo('b.6', 'L4_0', le - sum(gapvector[:7])) traci.vehicle.moveTo('b.7', 'L4_0', le - sum(gapvector[:8])) traci.vehicle.moveTo('c.0', 'L4_0', le - sum(gapvector[:9])) traci.vehicle.moveTo('c.1', 'L4_0', le - sum(gapvector[:10])) traci.vehicle.moveTo('c.2', 'L4_0', le - sum(gapvector[:11])) traci.vehicle.moveTo('c.3', 'L4_0', le - sum(gapvector[:12])) traci.vehicle.moveTo('c.4', 'L4_0', le - sum(gapvector[:13])) traci.vehicle.moveTo('c.5', 'L4_0', le - sum(gapvector[:14])) traci.vehicle.moveTo('c.6', 'L4_0', le - sum(gapvector[:15])) traci.vehicle.moveTo('c.7', 'L4_0', le - sum(gapvector[:16])) traci.simulationStep() chushiweizhi = [] exist_list = traci.vehicle.getIDList() for xx in exist_list: chushiweizhi.append(traci.vehicle.getPosition(xx)[0]) touche = leading ep_ave_max_q = 0 for j in range(int(args['max_episode_len'])): # pjz=0 initialsp = [] state2 = [] state = [] reward = [] # print() xiayimiaosudu = np.clip( traci.vehicle.getSpeed(exist_list[0]) + touche[j], 0, chusudu) traci.vehicle.setSpeed(exist_list[0], xiayimiaosudu) for xx in exist_list: traci.vehicle.setSpeedMode(xx, speedmode) initialsp.append(traci.vehicle.getSpeed(xx)) locationplot.append(traci.vehicle.getPosition(xx)[0] / 1000) speedplot.append(traci.vehicle.getSpeed(xx)) timeplot.append(j) for mm in range(1, NUM_AGENTS + 1): # touchea=exist_list[0] ziji = exist_list[mm] qianche = exist_list[mm - 1] gap = traci.vehicle.getLeader(ziji)[1] zhuangtai1 = (traci.vehicle.getSpeed(qianche) - traci.vehicle.getSpeed(ziji)) / 10 zhuangtai2 = (traci.vehicle.getSpeed(ziji) - 16) / 16 zhuangtai3 = (math.sqrt(max(gap, 0)) - 20) / 20 state.append([zhuangtai1, zhuangtai2, zhuangtai3]) action = actor.predict([state])[0] chaoguo = [0] * NUM_AGENTS for mm in range(1, NUM_AGENTS + 1): ziji = exist_list[mm] qianche = exist_list[mm - 1] zijisudu = traci.vehicle.getSpeed(ziji) qianchesudu = traci.vehicle.getSpeed(qianche) gapa = 
traci.vehicle.getLeader(ziji)[1] if qianchesudu - 3 < zijisudu: gap = gapa - 5 - zijisudu + max(qianchesudu - 3, 0) if gap < 0: amax = -3 # print(gap) else: # amax=math.sqrt(madr*gap)+sp[i]-sp[i+1]-3 amax = min(gap / 3, math.sqrt( madr * gap)) + qianchesudu - zijisudu - 3 amax = np.clip(amax, -3, 3) else: amax = 3 # ac=np.clip(action[mm-1][0]/10,-3,3) # if pjz==0: # ave=sum(action)/NUM_AGENTS # pjz=1 ac = np.clip(action[mm - 1][0] / 10, -3, 3) # print(j,ave,action,ac) if ac > amax: chaoguo[mm - 1] = 1 # print(action[mm-1][0]) # print(j,mm,ac,amax) nextspeed = traci.vehicle.getSpeed(exist_list[mm]) + min( amax, ac) # nextspeed=traci.vehicle.getSpeed(exist_list[mm])+ac # print(action[mm-1][0]) traci.vehicle.setSpeed(exist_list[mm], nextspeed) traci.simulationStep() # for i in NUM_AGENTS+1): # if i>0 and (po[i]>po[i-1]-5 or po[i]<-10000): # chongtu[i-1]=1 chongtu = [0] * NUM_AGENTS # print(j) for mm in range(1, NUM_AGENTS + 1): ziji = exist_list[mm] qianche = exist_list[mm - 1] # print(traci.vehicle.getPosition(ziji)[0]) if traci.vehicle.getPosition(ziji)[0] < -10000: chongtu[mm - 1] = 1 re = min((traci.vehicle.getAcceleration(ziji))**2 / 9, 1) # print(mm-1,traci.vehicle.getAcceleration(ziji),re) if chongtu[mm - 1] == 0: gap = traci.vehicle.getLeader(ziji)[1] else: gap = 0 if gap > 100: re += gap / 100 # print(mm-1,gap,re) if chaoguo[mm - 1] == 1: re += 1 if chongtu[mm - 1] == 1: re += 5 # print('chaoguo'W) # print(mm-1,chaoguo[mm-1],re) reward.append([1 - re]) done = True state2 = None replay_buffer.add(state, action, reward, done, state2) # print(reward) if replay_buffer.size() > int( args['minibatch_size']) or sum(chongtu) > 0: s_batch, a_batch, r_batch, t_batch, s2_batch = \ replay_buffer.sample_batch(int(args['minibatch_size'])) # print(j) # print(chongtu) if j % 33 == 32: predicted_q_value, _, loss = critic.train( s_batch, a_batch, np.reshape(r_batch, (32, NUM_AGENTS, 1))) else: predicted_q_value, _, loss = critic.train( s_batch, a_batch, np.reshape(r_batch, (j % 33 + 1, NUM_AGENTS, 1))) ep_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient a_outs = actor.predict(s_batch) grads = critic.action_gradients(s_batch, a_outs) actor.train(s_batch, grads) actor.update_target_network() critic.update_target_network() # print('xunlianle') replay_buffer.clear() # Log summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: np.mean(r_batch), summary_vars[1]: ep_ave_max_q / float(j + 1), summary_vars[2]: loss }) writer.add_summary(summary_str, i) writer.flush() # print(j,reward,r_batch,np.mean(r_batch)) state = [] reward = [] # print('| Reward: {:.4f} | Episode: {:d} | Qmax: {:.4f}'.format(np.mean(r_batch), # i, (ep_ave_max_q / float(j + 1)))) zongreward += np.mean(r_batch) print(j, action, chaoguo) if sum(chongtu) > 0: print(traci.vehicle.getIDCount()) print('zhuangle22222222222222222222222222') replay_buffer.clear() traci.close() sys.stdout.flush() # bre=1 break replay_buffer.clear() traci.close() sys.stdout.flush() # print(ave) # if state2!=None: # print(state,action,reward,state2) # print(totalreward,zongreward) print(j, zongreward / 9 - 1) if j > 180: totalreward.append(zongreward / 9 - 1) plt.ion() plt.figure(i * 2 - 1) plt.plot(np.arange(len(totalreward)), totalreward) plt.xlabel('Episode') plt.ylabel('Episode reward') plt.draw() plt.pause(1) plt.close() #越大越好 plt.ion() plt.figure(i * 2) plt.scatter(timeplot, locationplot, c=speedplot, s=10, alpha=0.3) plt.colorbar() plt.xlabel('Time (s)') plt.ylabel('Location (km)') plt.grid(True) plt.show() 
M8 = np.mat(totalreward) np.savetxt("M8.csv", M8, delimiter=',')
class Workspace(object): def __init__(self, cfg): self.work_dir = '/media/trevor/mariadb/thesis/' print(f'workspace: {self.work_dir}') self.cfg = cfg self.logger = Logger(self.work_dir, save_tb=cfg.log_save_tb, log_frequency=cfg.log_frequency_step, agent=cfg.agent.name, action_repeat=cfg.action_repeat) utils.set_seed_everywhere(cfg.seed) self.device = torch.device(cfg.device) self.env = make_env(cfg) cfg.agent.params.obs_shape = self.env.observation_space.shape cfg.agent.params.action_shape = self.env.action_space.shape cfg.agent.params.action_range = [ float(self.env.action_space.low.min()), float(self.env.action_space.high.max()) ] self.agent = hydra.utils.instantiate(cfg.agent) self.replay_buffer = ReplayBuffer(self.env.observation_space.shape, self.env.action_space.shape, cfg.replay_buffer_capacity, self.cfg.image_pad, self.device, self.cfg.env) # obs_shape = (3 * 3, 84, 84) # pre_aug_obs_shape = (3 * 3, 100, 100) # # self.replay_buffer = ReplayBuffer( # obs_shape=pre_aug_obs_shape, # action_shape=self.env.action_space.shape, # capacity=cfg.replay_buffer_capacity, # batch_size=cfg.batch_size, # device=self.device, # image_size=84, # pre_image_size=100, # ) self.video_recorder = VideoRecorder( self.work_dir if cfg.save_video else None) self.step = 0 def evaluate(self): average_episode_reward = 0 eps_reward = [] eps_done = 0 # while eps_done < self.cfg.num_eval_episodes: for episode in range(self.cfg.num_eval_episodes): obs = self.env.reset() # self.video_recorder.init(enabled=(episode == 0)) done = False episode_reward = 0 episode_step = 0 while not done: with utils.eval_mode(self.agent): action = self.agent.act(obs, sample=False) # This is unnecessary here... self.agent.osl.train(True) obs, reward, done, info = self.env.step(action) # self.video_recorder.record(self.env) episode_reward += reward episode_step += 1 # if episode_reward > 0: # eps_reward.append(episode_reward) # average_episode_reward += episode_reward # eps_done += 1 # else: # continue average_episode_reward += episode_reward # self.video_recorder.save(f'{self.step}.mp4') average_episode_reward /= self.cfg.num_eval_episodes sd_episode_reward = np.std(eps_reward) self.logger.log('eval/episode_reward', average_episode_reward, self.step) self.logger.dump(self.step) return average_episode_reward, sd_episode_reward def run(self): print(f'Eval freq: {self.cfg.eval_frequency}') print(f'k: {self.agent.k}') print(f'lr: {self.cfg.lr}') episode, episode_reward, episode_step, done = 0, 0, 1, True start_time = time.time() if self.cfg.p: print('collecting...') for _ in tqdm(range(10000)): if done: obs = self.env.reset() done = False episode_step = 0 action = self.env.action_space.sample() next_obs, reward, done, info = self.env.step(action) done = float(done) done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done if done: eeo = 1 else: eeo = 0 episode_reward += reward self.replay_buffer.add(obs, action, reward, next_obs, done, done_no_max, eeo) obs = next_obs episode_step += 1 print('pre-training...') for i in tqdm(range(25000)): self.agent.pretrain(self.replay_buffer, i) # reset replay buffer? 
self.replay_buffer = ReplayBuffer(self.env.observation_space.shape, self.env.action_space.shape, 100000, self.cfg.image_pad, self.device, self.cfg.env) eval_mean = [] eval_sd = [] while self.step < (self.cfg.num_train_steps // self.cfg.action_repeat): if done: if self.step > 0: self.logger.log('train/duration', time.time() - start_time, self.step) start_time = time.time() self.logger.dump( self.step, save=(self.step > self.cfg.num_seed_steps)) # evaluate agent periodically if self.step % self.cfg.eval_frequency == 0: self.logger.log('eval/episode', episode, self.step) means, sds = self.evaluate() eval_mean.append(means) eval_sd.append(sds) print(f'OSL: {np.mean(self.agent.osl_loss_hist[-20000:])}') # torch.save( # self.agent.critic.encoder.state_dict(), # f'/media/trevor/mariadb/thesis/msl_cartpole_encoder_{self.step * self.cfg.action_repeat}.pt' # ) self.logger.log('train/episode_reward', episode_reward, self.step) obs = self.env.reset() done = False episode_reward = 0 # TODO: at the very top, episode_step is init to 1 but here it is 0... episode_step = 0 episode += 1 self.logger.log('train/episode', episode, self.step) # sample action for data collection if self.step < self.cfg.num_seed_steps: action = self.env.action_space.sample() else: with utils.eval_mode(self.agent): action = self.agent.act(obs, sample=True) self.agent.osl.train(True) # run training update if self.step >= self.cfg.num_seed_steps: for _ in range(self.cfg.num_train_iters): self.agent.update(self.replay_buffer, self.logger, self.step) next_obs, reward, done, info = self.env.step(action) # allow infinite bootstrap # TODO: shouldn't DONE always be 0? replay buffer is NOT DONE when adding... done = float(done) done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done episode_reward += reward if done: eeo = 1 else: eeo = 0 # done_no_max should always be 0, right? self.replay_buffer.add(obs, action, reward, next_obs, done, done_no_max, eeo) obs = next_obs episode_step += 1 self.step += 1 with open( f'/media/trevor/mariadb/thesis/ksl-r-{self.cfg.env}-s{self.cfg.seed}-b{self.cfg.batch_size}-k{self.cfg.agent.params.k}-p{self.cfg.p}-mean.data', 'wb') as f: pickle.dump(eval_mean, f)
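# Hedged sketch (assumption, not from the original class): why `done` and `done_no_max`
# are stored separately in the loop above. When an episode ends only because the
# environment's step limit was reached, the Q target should still bootstrap from the
# next state; `done_no_max` masks the bootstrap only for genuine terminations.
import torch


def td_target(reward, next_q, done_no_max, discount=0.99):
    # done_no_max == 1 only for true terminal states, so time-limit endings
    # keep the discounted next-state value in the target.
    return reward + (1.0 - done_no_max) * discount * next_q


reward = torch.tensor([1.0])
next_q = torch.tensor([10.0])
print(td_target(reward, next_q, torch.tensor([0.0])))  # time-limit ending: bootstraps
print(td_target(reward, next_q, torch.tensor([1.0])))  # true terminal: reward only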
class DDPG: def __init__(self, state_dim, state_channel, action_dim): self.state_dim = state_dim self.state_channel = state_channel self.action_dim = action_dim self.sess = tf.InteractiveSession() self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel]) self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel]) self.action_input = tf.placeholder('float', [None, action_dim]) self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim) # create network self.actor_network.create_network(self.state_input) self.critic_network.create_q_network(self.state_input, self.actor_network.action_output) # create target network self.actor_network.create_target_network(self.target_state_input) self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output) # create training method self.actor_network.create_training_method(self.critic_network.q_value_output) self.critic_network.create_training_method() self.sess.run(tf.initialize_all_variables()) self.actor_network.update_target() self.critic_network.update_target() self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) self.exploration_noise = OUNoise(self.action_dim) self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg' if not os.path.exists(self.dir_path): os.mkdir(self.dir_path) # for log self.reward_input = tf.placeholder(tf.float32) tf.scalar_summary('reward', self.reward_input) self.time_input = tf.placeholder(tf.float32) tf.scalar_summary('living_time', self.time_input) self.summary_op = tf.merge_all_summaries() self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph) self.episode_reward = 0.0 self.episode_start_time = 0.0 self.time_step = 1 self.saver = tf.train.Saver(tf.all_variables()) self.load_time_step() self.load_network() return def train(self): action_dim = self.action_dim minibatch = self.replay_buffer.get_batch(BATCH_SIZE) # sample BATCH_SIZE from replay_buffer state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # if action_dim = 1, it's a number not a array action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim]) # calculate y_batch via target network next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q_value(next_state_batch, next_action_batch) y_batch = [] for i in range(BATCH_SIZE): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # print np.shape(reward_batch), np.shape(y_batch) # train actor network self.actor_network.train(state_batch) # train critic network self.critic_network.train(y_batch, state_batch, action_batch) # update target network self.actor_network.update_target() self.critic_network.update_target() return def noise_action(self, state): action = self.actor_network.action(state) return action + self.exploration_noise.noise() def action(self, state): action = self.actor_network.action(state) return action def _record_log(self, reward, living_time): summary_str = self.sess.run(self.summary_op, feed_dict={ 
self.reward_input: reward, self.time_input: living_time }) self.summary_writer.add_summary(summary_str, self.time_step) return def perceive(self, state, action, reward, next_state, done): self.replay_buffer.add(state, action, reward, next_state, done) if self.episode_start_time == 0.0: self.episode_start_time = time.time() # for testing # self.time_step += 1 # if self.time_step == 100: # print '--------------------------------' # self.replay_buffer.save_to_pickle() # return self.episode_reward += reward living_time = time.time() - self.episode_start_time if self.time_step % 1000 == 0 or done: self._record_log(self.episode_reward, living_time) if self.replay_buffer.size() > REPLAY_START_SIZE: self.train() if self.time_step % 100000 == 0: self.save_network() if done: print '===============reset noise=========================' self.exploration_noise.reset() self.episode_reward = 0.0 self.episode_start_time = time.time() self.time_step += 1 return def load_time_step(self): if not os.path.exists(self.dir_path): return files = os.listdir(self.dir_path) step_list = [] for filename in files: if ('meta' in filename) or ('-' not in filename): continue step_list.append(int(filename.split('-')[-1])) step_list = sorted(step_list) if len(step_list) == 0: return self.time_step = step_list[-1] + 1 return def load_network(self): checkpoint = tf.train.get_checkpoint_state(self.dir_path) if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print 'Successfully loaded:', checkpoint.model_checkpoint_path else: print 'Could not find old network weights' return def save_network(self): print 'save actor-critic network...', self.time_step self.saver.save(self.sess, self.dir_path + '/ddpg', global_step=self.time_step) return
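# Hedged usage sketch (not from the original file): how the DDPG class above is
# typically driven. `env` is assumed to follow the classic gym API
# (reset() -> state, step(a) -> (next_state, reward, done, info)); the loop simply
# alternates noise_action()/perceive(), and perceive() handles logging, training,
# checkpointing and target updates internally.
def run_episodes(agent, env, num_episodes=100, max_steps=1000):
    for episode in range(num_episodes):
        state = env.reset()
        for _ in range(max_steps):
            action = agent.noise_action(state)  # exploratory action with OU noise
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break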
def train(sess, args, actor, critic, actor_noise): summary_ops, summary_vars = build_summaries() sess.run(tf.global_variables_initializer()) #writer = tf.summary.FileWriter(args['summary_dir'], sess.graph) #Initialize target network weights actor.update_target_network() critic.update_target_network() #Initialize replay memory replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed'])) for i in range(int(args['max_episodes'])): #Initialize the start states of the subject vehicle and the CIPV #The state: CIPV_speed, CIPV_acceleration, distance, subject_speed #The control variable: subject_acceleration CIPV_speed = 10 CIPV_acceleration = 0 #store the CIPV acceleration at each time subject_speed = 12 distance = 20 s = [CIPV_speed, CIPV_acceleration, subject_speed, distance] ep_reward = 0 ep_ave_max_q = 0 terminal = False CIPV_speed_list = [10] CIPV_acceleration_list = [0] subject_speed_list = [12] distance_list = [20] desired_headway_list = [1.5] headway_list = [1.667] action_list = [0] for j in range(int(args['max_episode_len'])): # Add exploration noise a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise() if i == 0 and (j == 0 or j == 1): print s #sample_time = 0.02s sample_time = 0.02 if j >= 0 and j < 800: CIPV_acceleration = 0 if j >= 801 and j < 1600: CIPV_acceleration = 0.2 if j >= 1601: CIPV_acceleration = 0 CIPV_speed_ = CIPV_speed + CIPV_acceleration * sample_time subject_speed_ = subject_speed + a * sample_time distance_ = distance + CIPV_speed * sample_time + 0.5 * CIPV_acceleration * sample_time * sample_time - \ subject_speed * sample_time + 0.5 * a * sample_time * sample_time headway = distance_ / subject_speed_ #desired headway = 1.5s, threshold = 0.3s desired_headway = 1.5 if headway >= desired_headway and headway < desired_headway + 0.3: r = 4 * (desired_headway + 0.3 - headway) if headway > desired_headway - 0.3 and headway < desired_headway: r = 3 * (headway - desired_headway + 0.3) if headway >= desired_headway + 0.3 and headway <= 5: r = -2 * (headway - desired_headway - 0.3) if headway <= desired_headway - 0.3 and headway >= 0.2: r = -1 * (desired_headway - 0.3 - headway) #Check for collision; if so, terminal = True if distance_ <= 0 or subject_speed < 0 or headway < 0 or headway > 5 or subject_speed > 33.33: terminal = True else: terminal = False #The next environment state s2 = [CIPV_speed_, CIPV_acceleration, subject_speed_, distance_] CIPV_speed_list.append(CIPV_speed_) CIPV_acceleration_list.append(CIPV_acceleration) subject_speed_list.append(subject_speed_) distance_list.append(distance_) desired_headway_list.append(desired_headway) headway_list.append(headway) action_list.append(a) #add to buffer replay_buffer.add(np.reshape(s, (actor.s_dim, )), np.reshape(a, (actor.a_dim, )), r, terminal, np.reshape(s2, (actor.s_dim, ))) if replay_buffer.size() > int(args['minibatch_size']): s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch( int(args['minibatch_size'])) #calculate targets target_q = critic.predict_target( s2_batch, actor.predict_target(s2_batch)) y_i = [] for k in range(int(args['minibatch_size'])): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + critic.gamma * target_q[k]) #Update the critic given the targets predict_q_value, _ = critic.train( s_batch, a_batch, np.reshape(y_i, (int(args['minibatch_size']), 1))) ep_ave_max_q += np.amax(predict_q_value) print('Action: {:.4f} | Reward: {:d} | Episode: {:d} | Qmax: {:.4f} | Headway: {:.4f} | Distance: {:.4f}'.format(float(a),
int(ep_reward), \ i, (ep_ave_max_q / float(j)), float(headway), float(distance))) #Update the actor policy using the sampled gradient a_outs = actor.predict(s_batch) grads = critic.action_gradients(s_batch, a_outs) actor.train(s_batch, grads[0]) #Update target networks actor.update_target_network() critic.update_target_network() s = s2 CIPV_speed = CIPV_speed_ subject_speed = subject_speed_ distance = distance_ ep_reward += r if terminal: print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f} | Headway: {:.4f} | Distance: {:.4f}'.format(int(ep_reward), \ i, (ep_ave_max_q / float(j)), float(headway), float(distance))) break if i % 200 == 0: data = [] data.append(CIPV_speed_list) data.append(subject_speed_list) data.append(CIPV_acceleration_list) data.append(action_list) data.append(desired_headway_list) data.append(headway_list) data.append(distance_list) data_array = np.array(data) filename = 'data' + str(i) + '.csv' np.savetxt(filename, data_array, delimiter=',')
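# Hedged sketch (not part of the original training loop): the piecewise headway reward
# used above, pulled into one function. The desired headway (1.5 s) and the 0.3 s
# tolerance band come from the code; returning 0.0 outside the 0.2-5 s range is an
# assumption made here (the original leaves r unset in that case).
def headway_reward(headway, desired=1.5, band=0.3):
    if desired <= headway < desired + band:
        return 4 * (desired + band - headway)   # slightly long headway: small bonus
    if desired - band < headway < desired:
        return 3 * (headway - desired + band)   # slightly short headway: small bonus
    if desired + band <= headway <= 5:
        return -2 * (headway - desired - band)  # falling too far behind the CIPV
    if 0.2 <= headway <= desired - band:
        return -1 * (desired - band - headway)  # dangerously close
    return 0.0  # outside the modelled range (near-collision or lost target)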
class Agent: def __init__(self, device, state_size, action_size, buffer_size=10, batch_size=10, actor_learning_rate=1e-4, critic_learning_rate=1e-3, discount_rate=0.99, tau=0.1, steps_per_update=4, action_range=None, dropout_p=0.0, weight_decay=0.0001, noise_max=0.2, noise_decay=1.0, n_agents=1 ): self.device: torch.device = device self.state_size = state_size self.action_size = action_size self.critic_control = Critic(state_size, action_size).to(device) self.critic_control.dropout.p = dropout_p self.critic_target = Critic(state_size, action_size).to(device) self.critic_target.eval() self.critic_optimizer = torch.optim.Adam( self.critic_control.parameters(), weight_decay=weight_decay, lr=critic_learning_rate) self.actor_control = Actor(state_size, action_size, action_range).to( device) self.actor_control.dropout.p = dropout_p self.actor_target = Actor(state_size, action_size, action_range).to( device) self.actor_target.eval() self.actor_optimizer = torch.optim.Adam( self.actor_control.parameters(), weight_decay=weight_decay, lr=actor_learning_rate) self.batch_size = batch_size self.min_buffer_size = batch_size self.replay_buffer = ReplayBuffer(device, state_size, action_size, buffer_size) self.discount_rate = discount_rate self.tau = tau self.step_count = 0 self.steps_per_update = steps_per_update self.noise_max = noise_max self.noise = OUNoise([n_agents, action_size], 15071988, sigma=self.noise_max) self.noise_decay = noise_decay self.last_score = float('-inf') def policy(self, state, add_noise=True): state = torch.from_numpy(state).float().to(self.device) self.actor_control.eval() with torch.no_grad(): action = self.actor_control(state).cpu().numpy() self.actor_control.train() if add_noise: noise = self.noise.sample() action += noise return action def step(self, state, action, reward, next_state, done): p = self.calculate_p(state, action, reward, next_state, done) for i in range(state.shape[0]): self.replay_buffer.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i], p[i]) if self.step_count % self.steps_per_update == 0: self.learn() self.step_count += 1 def learn(self): if len(self.replay_buffer) < self.min_buffer_size: return indicies, (states, actions, rewards, next_states, dones, p) = \ self.replay_buffer.sample(self.batch_size) self.actor_control.eval() error = self.bellman_eqn_error( states, actions, rewards, next_states, dones) self.actor_control.train() importance_scaling = (self.replay_buffer.buffer_size * p) ** -1 importance_scaling /= importance_scaling.max() self.critic_optimizer.zero_grad() loss = (importance_scaling * (error ** 2)).sum() / self.batch_size loss.backward() self.critic_optimizer.step() self.actor_optimizer.zero_grad() expected_actions = self.actor_control(states) critic_score = self.critic_control(states, expected_actions) loss = -1 * (importance_scaling * critic_score).sum() / self.batch_size loss.backward() self.actor_optimizer.step() self.update_target(self.critic_control, self.critic_target) self.update_target(self.actor_control, self.actor_target) self.replay_buffer.update(indicies, error.detach().abs().cpu() + 1e-3) def bellman_eqn_error(self, states, actions, rewards, next_states, dones): """Double DQN error - use the control network to get the best action and apply the target network to it to get the target reward which is used for the bellman eqn error. 
""" next_actions = self.actor_control(next_states) target_action_values = self.critic_target(next_states, next_actions) target_rewards = ( rewards + self.discount_rate * (1 - dones) * target_action_values ) current_rewards = self.critic_control(states, actions) error = current_rewards - target_rewards return error def calculate_p(self, state, action, reward, next_state, done): next_state = torch.from_numpy(next_state).float().to( self.device) state = torch.from_numpy(state).float().to(self.device) action = torch.from_numpy(action).float().to(self.device) reward = torch.from_numpy(reward).float().to(self.device) done = torch.from_numpy(done).float().to( self.device) done = done.unsqueeze(1) reward = reward.unsqueeze(1) self.actor_control.eval() self.critic_control.eval() with torch.no_grad(): retval = abs( self.bellman_eqn_error(state, action, reward, next_state, done)) + 1e-3 self.critic_control.train() self.actor_control.train() return retval def update_target(self, control, target): for target_param, control_param in zip( target.parameters(), control.parameters()): target_param.data.copy_( self.tau * control_param.data + (1.0 - self.tau) * target_param.data) def end_of_episode(self, final_score): self.step_count = 0 self.noise.sigma *= self.noise_decay self.last_score = final_score self.noise.reset() def save(self, path): torch.save(self.critic_control.state_dict(), path + '-critic.p') torch.save(self.actor_control.state_dict(), path + '-actor.p') def restore(self, path): self.critic_control.load_state_dict( torch.load(path + '-critic.p', map_location='cpu')) self.actor_control.load_state_dict( torch.load(path + '-actor.p', map_location='cpu'))
class Agent(): """This is the Agent class, implementing an agent that interacts with and learns from the environment.""" def __init__( self, state_size=None, # state space size action_size=None, # action size buffer_size=int(1e6), # replay buffer size batch_size=128, # minibatch size gamma=0.99, # discount factor tau=1e-3, # for soft update of target parameters lr_actor=1e-4, # learning rate of the actor lr_critic=1e-3, # learning rate of the critic weight_decay=0, # L2 weight decay random_seed=0 ): self.state_size = state_size self.action_size = action_size self.buffer_size = buffer_size # replay buffer size self.batch_size = batch_size # minibatch size self.gamma = gamma # discount factor self.tau = tau # for soft update of target parameters self.lr_actor = lr_actor # learning rate of the actor self.lr_critic = lr_critic # learning rate of the critic self.weight_decay = weight_decay # L2 weight decay self.seed = random.seed(random_seed) # Actor networks (local and target) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Critic networks (local and target) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=self.weight_decay) # Noise self.noise = OUNoise(action_size, random_seed) # Replay buffer self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples.
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ############################### # update critic # ############################## # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() ############################### # # update actor network # ############################## # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() ############################### # update target network # ############################## self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
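# Hedged sketch (illustration only): the soft (Polyak) update applied in soft_update()
# above, run on two small throwaway networks so the formula
# theta_target = tau * theta_local + (1 - tau) * theta_target can be checked in isolation.
import torch
import torch.nn as nn

local_net = nn.Linear(4, 2)
target_net = nn.Linear(4, 2)
tau = 1e-3

with torch.no_grad():
    for target_param, local_param in zip(target_net.parameters(), local_net.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
# after many such calls the target network slowly tracks the local network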
class Workspace(object): def __init__(self, cfg): self.work_dir = os.getcwd() print(f'workspace: {self.work_dir}') self.cfg = cfg self.observation_space_shape = (16, 16) self.device = device self.logger = Logger(self.work_dir, save_tb=cfg.log_save_tb, log_frequency=cfg.log_frequency, agent=cfg.agent.name) utils.set_seed_everywhere(cfg.seed) self.env = make_env(cfg.env) self.max_episode_steps = cfg.max_episode_steps cfg.agent.params.obs_dim = self.observation_space_shape # SET action_dim = env.action_space.n cfg.agent.params.action_dim = (self.env.action_space.n) cfg.agent.params.action_range = [ float(0), float(self.env.action_space.n) ] self.agent = hydra.utils.instantiate(cfg.agent) self.replay_buffer = ReplayBuffer(self.observation_space_shape, (self.env.action_space.n), int(cfg.replay_buffer_capacity), self.device) ''' self.video_recorder = VideoRecorder( self.work_dir if cfg.save_video else None) ''' self.step = 0 def evaluate(self): print("evaluate") average_episode_reward = 0 for episode in range(self.cfg.num_eval_episodes): self.env.reset() obs = get_grid_state(self.env) self.agent.reset() # self.video_recorder.init(enabled=(episode == 0)) done = False episode_reward = 0 step_count = 0 while not done and step_count < self.max_episode_steps: with utils.eval_mode(self.agent): action_vec = self.agent.act(obs, sample=False) # TRANSFORM action_vec to action action = self.cont_to_disc(action_vec) step_count += 1 _, reward, done, _ = self.env.step(action) obs = get_grid_state(self.env) # self.video_recorder.record(self.env) episode_reward += reward average_episode_reward += episode_reward # self.video_recorder.save(f'{self.step}.mp4') average_episode_reward /= self.cfg.num_eval_episodes self.logger.log('eval/episode_reward', average_episode_reward, self.step) self.logger.dump(self.step) def cont_to_disc(self, action_vec): # action_vec shape 1 x k, where k == env.action_space.n # print(action_vec.shape) # print(type(action_vec)) action_vec_softmax = softmax(action_vec) disc_action = list( np.random.multinomial(1, action_vec_softmax, size=1)[0]).index(1) return disc_action def run(self): episode, episode_reward, done = 0, 0, True start_time = time.time() rewards = [] while self.step < self.cfg.num_train_steps: if done: if self.step > 0: self.logger.log('train/duration', time.time() - start_time, self.step) start_time = time.time() self.logger.dump( self.step, save=(self.step > self.cfg.num_seed_steps)) # evaluate agent periodically if self.step > 0 and self.step % self.cfg.eval_frequency == 0: self.logger.log('eval/episode', episode, self.step) self.evaluate() rewards.append(episode_reward) self.logger.log('train/episode_reward', episode_reward, self.step) self.env.reset() obs = get_grid_state(self.env) self.agent.reset() done = False episode_reward = 0 episode_step = 0 episode += 1 # print("episode", episode) self.logger.log('train/episode', episode, self.step) # sample action for data collection if self.step < self.cfg.num_seed_steps: action_vec = torch.from_numpy( np.random.normal(0, 1, self.env.action_space.n)) else: with utils.eval_mode(self.agent): action_vec = self.agent.act(obs, sample=True) # TODO: transform action_vec into action action = self.cont_to_disc(action_vec) # print("before update") # run training update if self.step >= self.cfg.num_seed_steps: self.agent.update(self.replay_buffer, self.logger, self.step) # print("after update") # print(action_vec.shape, type(action_vec), action_vec) _, reward, done, _ = self.env.step(action) # if done: # print("done") next_obs = 
get_grid_state(self.env) # allow infinite bootstrap done = float(done) or episode_step + 1 == self.max_episode_steps done_no_max = 0 if episode_step + 1 == self.max_episode_steps else done episode_reward += reward self.replay_buffer.add(obs, action_vec, action, reward, next_obs, done, done_no_max) obs = next_obs episode_step += 1 self.step += 1 if self.step % 100 == 0: print("----- Mean Ep Reward ----- ", sum(rewards) / 100) rewards = []
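# Hedged sketch (standalone version of cont_to_disc above): the continuous k-dimensional
# action vector is squashed with a softmax and a single multinomial draw picks the
# discrete action index. The softmax is written out with NumPy here so the snippet is
# self-contained (the class above relies on an imported softmax).
import numpy as np


def continuous_to_discrete(action_vec, rng=np.random):
    logits = np.asarray(action_vec, dtype=np.float64)
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    one_hot = rng.multinomial(1, probs)  # single draw over the k discrete actions
    return int(one_hot.argmax())


print(continuous_to_discrete([0.1, 2.0, -1.0]))  # usually returns 1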
def train(sess, env, args, actor, critic, actor_noise): # Set up summary operations summary_ops, summary_vars = build_summaries() sess.run(tf.global_variables_initializer()) writer = tf.summary.FileWriter(args['summary_dir'], sess.graph) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed'])) # Mini-batch size multipler for annealing mini_batch_multiplier = 1 last_evaluate = 999 # Needed to enable BatchNorm. # This hurts the performance on Pendulum but could be useful # in other environments. # tflearn.is_training(True) a_list = [] for i in range(int(args['max_episodes'])): # Reset the environment, initial action 0, and initialize the action list for observability during analysis s = env.reset(random_init=False) actor_noise.reset() a = 0 # Evaluation Period eval_time = 999 # Episode reward and episode average max Q initializations ep_reward = 0 ep_ave_max_q = 0 # Initialize zero mean and st_dev. Will be corrected before mean = env.xs st_dev = [1] if i % 50 == 0 and i != 0: print("Evaluation Episode") # Loop for max_episode_len for j in range(1, int(args['max_episode_len']) + 1): # Take action every "sampling time" time steps to ensure steady state is reached if j % int(args['sampling_time']) == 0: # Correct for the initial state bug if j == int(args['sampling_time']): s = deepcopy(env.x[j - 1, :]) # Normalize the states by subtracting the mean and dividing by the variance s -= mean s /= st_dev # Every 50th episode, the action will have no noise to evaluate performance. if i % 50 == 0 and i != 0: a = actor.predict(np.reshape(s, (1, actor.s_dim))) # Add Ornstein-Ulhenbeck exploration noise to the action else: noise = actor_noise() a = actor.predict(np.reshape(s, (1, actor.s_dim))) + noise # Decay the actor noise, complete decay once ~95% of the episodes are finished actor_noise.noise_decay( int(args['max_episodes']) * int(args['max_episode_len'])) # Take the action env.u[j, :] = env.u[j - 1, 0] + a[0] # Define evaluation time for feedback eval_time = j + int(args['sampling_time']) - 1 else: # If it is not the sampling time, keep input constant env.u[j, :] = env.u[j - 1, :] """ Next step simulation """ # Simulate the next step env.x[j, :] = odeint(env.ode, env.x[j - 1, :], [env.t[j - 1], env.t[j]], args=([env.u[j, 0]], ))[-1] # Disturbances # if j % 20 == 0: # env.x[j, 1] -= 5 # Determines if its the end of the current episode. 
Also used for soft constraints if j == env.Nsim: terminal = True else: terminal = False # Feedback for RL if j == eval_time: # Ensure feedback is evaluated correctly assert ((j + 1) % int(args['sampling_time']) == 0) # Reward for RL r = env.reward_function(j) # print(r) # Next state for RL s2 = deepcopy(env.x[j, :]) # Add the latest states, action, reward, terminal, and new state to the replay memory replay_buffer.add(np.reshape(s, (actor.s_dim, )), np.reshape(a, (actor.a_dim, )), r, terminal, np.reshape(s2, (actor.s_dim, ))) # Update the new state to be the current state s = s2 # Add the step's reward towards the whole episodes' reward ep_reward += r # Keep adding experience to the memory until there are at least mini-batch size samples # Batch Training area if replay_buffer.size() > int(args['minibatch_size'] * 5): # mini-batch size, also must update actor batch size if i % 50 == 0 and i != 0 and last_evaluate != i: last_evaluate = i mini_batch_multiplier += 1 mini_batch_size = mini_batch_multiplier * int( args['minibatch_size']) actor.batch_size = mini_batch_size # Obtain a batch of data from replay buffer s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch( int(mini_batch_size)) # Calculate critic target Q-value, feeding in the actor target action # States is the s2 from the replay buffer target_q = critic.predict_target( s2_batch, actor.predict_target(s2_batch)) # Calculate the Q values y_i = [] for k in range(int(mini_batch_size)): # Terminal state, Q = r because there is no additional trajectory beyond this point if t_batch[k]: y_i.append(r_batch[k]) # If state is not terminal, Q = r + gamma * argmax-a * Q(s', a) else: y_i.append(r_batch[k] + critic.gamma * target_q[k]) """ Update the critic given the targets Exact algorithm: critic.train() returns predicted_q_value, optimize. Optimize takes MSE of y_i and predicted q value out. Then does Adam Gradient Descent updating the critic network. """ predicted_q_value, _ = critic.train( s_batch, a_batch, np.reshape(y_i, (int(mini_batch_size), 1))) # Output is 64 dimen predicted_q_value, then find the max of them. ep_ave_max_q += np.amax(predicted_q_value) """ Update the actor policy using the sampled gradient """ # Scaled output action given the s_batch states. a_outs = actor.predict(s_batch) # Inputs the states, and the actions given those states. # Forms symbolic function of the gradients as a function of the action grads = critic.action_gradients(s_batch, a_outs) # Updates actors given the gradients actor.train(s_batch, grads[0]) # Update target networks by tau actor.update_target_network() critic.update_target_network() if terminal: # Update the summary ops summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: ep_reward, summary_vars[1]: ep_ave_max_q / float(j) }) writer.add_summary(summary_str, i) writer.flush() print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format( int(ep_reward), i, (ep_ave_max_q / float(j)))) # Decaying learning rate # if i > 100: # actor.learning_rate = actor.learning_rate * 0.9 # critic.learning_rate = critic.learning_rate * 0.9 break return replay_buffer, a_list
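# Hedged sketch (not from the original file): the critic target assembled in the
# batch-training block above, written as one vectorised NumPy expression:
# y = r for terminal transitions, otherwise y = r + gamma * Q'(s2, mu'(s2)).
import numpy as np


def critic_targets(rewards, terminals, target_q, gamma=0.99):
    rewards = np.asarray(rewards, dtype=np.float64)
    terminals = np.asarray(terminals, dtype=np.float64)
    target_q = np.asarray(target_q, dtype=np.float64).reshape(-1)
    return (rewards + (1.0 - terminals) * gamma * target_q).reshape(-1, 1)


print(critic_targets([1.0, 0.5], [0, 1], [10.0, 10.0]))  # approx [[10.9], [0.5]]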
def train(sess, env, actor, critic): env_left = gym.make(ENV_LEFT) env_middle = gym.make(ENV_MIDDLE) env_right = gym.make(ENV_RIGHT) L = Logger() log_not_empty = L.Load(LOG_FILE) if log_not_empty: print ("Log file loaded") else: print ("Creating new log file") L.AddNewLog('network_left') L.AddNewLog('network_middle') L.AddNewLog('network_right') L.AddNewLog('total_reward') L.AddNewLog('estimated_value') L.AddNewLog('network_random') simulator = Simulator(MAX_EP_STEPS, STATE, 1, -0.5, None) # Set up summary Ops summary_ops, summary_vars = build_summaries() sess.run(tf.initialize_all_variables()) writer = tf.train.SummaryWriter(SUMMARY_DIR, sess.graph) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED) n = OUnoise(INPUT) for i in xrange(MAX_EPISODES): s = env.reset() ep_reward = 0 ep_ave_max_q = 0 n.Reset() for j in xrange(MAX_EP_STEPS): if RENDER_ENV: env.render() # Add exploration noise #a = actor.predict(np.reshape(s, (1, 8))) + (1. / (1. + i + j)) a = actor.predict(np.reshape(s, (1, STATE))) + n.Sample() s2, r, terminal, info = env.step(a[0]) r += -0.5 replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r, \ terminal, np.reshape(s2, (actor.s_dim,))) # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > MINIBATCH_SIZE: s_batch, a_batch, r_batch, t_batch, s2_batch = \ replay_buffer.sample_batch(MINIBATCH_SIZE) # Calculate targets target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch)) y_i = [] for k in xrange(MINIBATCH_SIZE): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + GAMMA * target_q[k]) # Update the critic given the targets predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1))) ep_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient a_outs = actor.predict(s_batch) grads = critic.action_gradients(s_batch, a_outs) actor.train(s_batch, grads[0]) # Update target networks actor.update_target_network() critic.update_target_network() s = s2 ep_reward += r if terminal: break summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: ep_reward, summary_vars[1]: ep_ave_max_q / float(j) }) writer.add_summary(summary_str, i) writer.flush() print 'episode ', i, ' | Reward: %.2i' % int(ep_reward), " | Episode", i, \ '| Qmax: %.4f' % (ep_ave_max_q / float(j)) # log statistics L.AddRecord('network_left',simulator.SimulateContNeuralEpisode(actor, sess, env_left, False)) L.AddRecord('network_middle',simulator.SimulateContNeuralEpisode(actor, sess, env_middle, False)) L.AddRecord('network_right',simulator.SimulateContNeuralEpisode(actor, sess, env_right, False)) temp_r = 0 for rand_i in xrange(10): temp_r = temp_r + simulator.SimulateContNeuralEpisode(actor, sess, env, False)*0.1 L.AddRecord('network_random', temp_r) L.AddRecord('total_reward', ep_reward) if replay_buffer.size() > V_EST: num = V_EST else: num = replay_buffer.size() s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(num) Q = critic.predict(s_batch, actor.predict(s_batch)) V_est = Q.sum()/num*1.0 L.AddRecord('estimated_value', V_est) if i % SAVE_RATE == 0: L.Save(LOG_FILE)
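# Hedged sketch (illustration of the 'estimated_value' log above): the value baseline is
# approximated by averaging Q(s, mu(s)) over a batch sampled from the replay buffer.
# `actor_predict` and `critic_predict` stand in for the actor/critic prediction calls
# and are hypothetical names introduced here.
import numpy as np


def estimate_value(s_batch, actor_predict, critic_predict):
    q = critic_predict(s_batch, actor_predict(s_batch))
    return float(np.sum(q)) / len(s_batch)


# toy usage with dummy predictors
states = np.zeros((4, 3))
print(estimate_value(states, lambda s: s, lambda s, a: np.ones((len(s), 1))))  # 1.0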
def main(config_dict): train = config_dict['train'] network = config_dict['network'] experiment_name = config_dict['experiment_name'] EXPERIMENTS_PATH = config_dict['EXPERIMENTS_PATH'] actor_weights_file = "%s%s/%s_actor.h5" % (EXPERIMENTS_PATH, network, network) critic_weights_file = "%s%s/%s_critic.h5" % (EXPERIMENTS_PATH, network, network) log_directory = "%s%s/%s/" % (EXPERIMENTS_PATH, network, experiment_name) BUFFER_SIZE = 100000 BATCH_SIZE = 32 GAMMA = 0.99 TAU = 0.001 LRA = 0.0001 LRC = 0.001 action_dim = 3 # Steering / Acceleration / Blake state_dim = 29 # Dimension of sensor inputs #np.random.seed(42) vision = False EXPLORE = 100000. episode_count = 2000 max_steps = 100000 done = False step = 0 epsilon = 1 exp_logger = TORCS_ExperimentLogger(log_directory, experiment_name) #directory = "%s%s/" % (EXPERIMENTS_PATH, experiment) #actor_weights_file = "%s%s_%s" % (directory, experiment, "actor.h5") #critic_weights_file = "%s%s_%s" % (directory, experiment, "critic.h5") # TensorFlow GPU config = tf.ConfigProto() # Not sure if this is really necessary, since we only have a single GPU config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) actor = ActorFCNet(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA) critic = CriticFCNet(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC) buff = ReplayBuffer(BUFFER_SIZE) env = TorcsEnv(vision=vision, throttle=True, gear_change=False) # Weight loading if not train: try: actor.model.load_weights(actor_weights_file) critic.model.load_weights(critic_weights_file) actor.target_model.load_weights(actor_weights_file) critic.target_model.load_weights(critic_weights_file) print "Weights loaded successfully" time.sleep(2) except: print "Error in loading weights" print '-' * 60 traceback.print_exc(file=sys.stdout) print '-' * 60 assert (False) for i in xrange(episode_count): print "Episode: %i; Replay Buffer: %i" % (i, buff.count()) if np.mod(i, 3) == 0: # Relaunch TORCS every 3 episodes; memory leak error ob = env.reset(relaunch=True) else: ob = env.reset() state_t = np.hstack( (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm)) total_reward = 0. # Compute rewards for j in xrange(max_steps): loss = 0 epsilon -= 1.0 / EXPLORE # exploration factor action_t = np.zeros([1, action_dim]) noise_t = np.zeros([1, action_dim]) action_t_raw = actor.model.predict( state_t.reshape( 1, state_t.shape[0])) # this call to reshape seems suboptimal noise_t[0][0] = train * max(epsilon, 0) * OU.run( action_t_raw[0][0], 0.0, 0.60, 0.30) noise_t[0][1] = train * max(epsilon, 0) * OU.run( action_t_raw[0][1], 0.5, 1.00, 0.10) noise_t[0][2] = train * max(epsilon, 0) * OU.run( action_t_raw[0][2], -0.1, 1.00, 0.05) # stochastic brake #if random.random() <= 0.1: # noise_t[0][2] = train * max(epsilon, 0) * OU.run(action_t_raw[0][2], 0.2, 1.00, 0.10) # May be able to do this a bit more concisely with NumPy vectorization action_t[0][0] = action_t_raw[0][0] + noise_t[0][0] action_t[0][1] = action_t_raw[0][1] + noise_t[0][1] action_t[0][2] = action_t_raw[0][2] + noise_t[0][2] # Raw_reward_t is the raw reward computed by the gym_torcs script. 
# We will compute our own reward metric from the ob object ob, raw_reward_t, done, info = env.step(action_t[0]) state_t1 = np.hstack( (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm)) #reward_t = lng_trans(ob) reward_t = raw_reward_t buff.add(state_t, action_t[0], reward_t, state_t1, done) # Add replay buffer # Batch update batch = buff.getBatch(BATCH_SIZE) states = np.asarray([e[0] for e in batch]) actions = np.asarray([e[1] for e in batch]) rewards = np.asarray([e[2] for e in batch]) new_states = np.asarray([e[3] for e in batch]) done_indicators = np.asarray([e[4] for e in batch]) y_t = np.asarray([e[1] for e in batch]) target_q_values = critic.target_model.predict( [new_states, actor.target_model.predict(new_states)]) # Can't we just use BATCH_SIZE here for k in xrange(len(batch)): if done_indicators[k]: y_t[k] = rewards[k] else: y_t[k] = rewards[k] + GAMMA * target_q_values[k] if (train): loss += critic.model.train_on_batch([states, actions], y_t) a_for_grad = actor.model.predict(states) grads = critic.gradients(states, a_for_grad) actor.train(states, grads) actor.train_target_net() critic.train_target_net() exp_logger.log(ob, action_t[0], reward_t, loss) total_reward += reward_t state_t = state_t1 print("Episode", i, "Step", step, "Action", action_t, "Reward", reward_t, "Loss", loss) step += 1 if done: break if np.mod(i, 3) == 0: if (train): print("Now we save model") actor.model.save_weights(actor_weights_file, overwrite=True) #with open("actormodel.json", "w") as outfile: json.dump(actor.model.to_json(), outfile) critic.model.save_weights(critic_weights_file, overwrite=True) #with open("criticmodel.json", "w") as outfile: json.dump(critic.model.to_json(), outfile) print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward)) print("Total Step: " + str(step)) print("") env.end() # This is for shutting down TORCS print("Finish.")
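# Hedged sketch (an assumption about what OU.run above computes): a single
# Ornstein-Uhlenbeck noise step, theta * (mu - x) + sigma * N(0, 1), applied per action
# dimension with different mu/theta/sigma for steering, throttle and brake. The argument
# order (x, mu, theta, sigma) is inferred from the calls above and is not confirmed here.
import numpy as np


def ou_noise_step(x, mu, theta, sigma, rng=np.random):
    return theta * (mu - x) + sigma * rng.randn()


raw_steer = 0.2
noise = max(0.3, 0) * ou_noise_step(raw_steer, mu=0.0, theta=0.60, sigma=0.30)
print(raw_steer + noise)  # noisy steering command, analogous to action_t above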
class Seq2Seq(object): def calc_running_avg_loss(self, loss, running_avg_loss, step, decay=0.99): """Calculate the running average loss via exponential decay. This is used to implement early stopping w.r.t. a more smooth loss curve than the raw loss curve. Args: loss: loss on the most recent eval step running_avg_loss: running_avg_loss so far summary_writer: FileWriter object to write for tensorboard step: training iteration step decay: rate of exponential decay, a float between 0 and 1. Larger is smoother. Returns: running_avg_loss: new running average loss """ if running_avg_loss == 0: # on the first iteration just take the loss running_avg_loss = loss else: running_avg_loss = running_avg_loss * decay + (1 - decay) * loss running_avg_loss = min(running_avg_loss, 12) # clip loss_sum = tf.Summary() tag_name = 'running_avg_loss/decay=%f' % (decay) loss_sum.value.add(tag=tag_name, simple_value=running_avg_loss) self.summary_writer.add_summary(loss_sum, step) tf.logging.info('running_avg_loss: %f', running_avg_loss) return running_avg_loss def restore_best_model(self): """Load bestmodel file from eval directory, add variables for adagrad, and save to train directory""" tf.logging.info("Restoring bestmodel for training...") # Initialize all vars in the model sess = tf.Session(config=util.get_config()) print("Initializing all variables...") sess.run(tf.initialize_all_variables()) # Restore the best model from eval dir saver = tf.train.Saver([v for v in tf.all_variables() if "Adagrad" not in v.name]) print("Restoring all non-adagrad variables from best model in eval dir...") curr_ckpt = util.load_ckpt(saver, sess, "eval") print("Restored %s." % curr_ckpt) # Save this model to train dir and quit new_model_name = curr_ckpt.split("/")[-1].replace("bestmodel", "model") new_fname = os.path.join(FLAGS.log_root, "train", new_model_name) print("Saving model to %s..." % (new_fname)) new_saver = tf.train.Saver() # this saver saves all variables that now exist, including Adagrad variables new_saver.save(sess, new_fname) print("Saved.") exit() def restore_best_eval_model(self): # load best evaluation loss so far best_loss = None best_step = None # goes through all event files and select the best loss achieved and return it event_files = sorted(glob('{}/eval/events*'.format(FLAGS.log_root))) for ef in event_files: try: for e in tf.train.summary_iterator(ef): for v in e.summary.value: step = e.step if 'running_avg_loss/decay' in v.tag: running_avg_loss = v.simple_value if best_loss is None or running_avg_loss < best_loss: best_loss = running_avg_loss best_step = step except: continue tf.logging.info('resotring best loss from the current logs: {}\tstep: {}'.format(best_loss, best_step)) return best_loss def convert_to_coverage_model(self): """Load non-coverage checkpoint, add initialized extra variables for coverage, and save as new checkpoint""" tf.logging.info("converting non-coverage model to coverage model..") # initialize an entire coverage model from scratch sess = tf.Session(config=util.get_config()) print("initializing everything...") sess.run(tf.global_variables_initializer()) # load all non-coverage weights from checkpoint saver = tf.train.Saver([v for v in tf.global_variables() if "coverage" not in v.name and "Adagrad" not in v.name]) print("restoring non-coverage variables...") curr_ckpt = util.load_ckpt(saver, sess) print("restored.") # save this model and quit new_fname = curr_ckpt + '_cov_init' print("saving model to %s..." 
% (new_fname)) new_saver = tf.train.Saver() # this one will save all variables that now exist new_saver.save(sess, new_fname) print("saved.") exit() def convert_to_reinforce_model(self): """Load non-reinforce checkpoint, add initialized extra variables for reinforce, and save as new checkpoint""" tf.logging.info("converting non-reinforce model to reinforce model..") # initialize an entire reinforce model from scratch sess = tf.Session(config=util.get_config()) print("initializing everything...") sess.run(tf.global_variables_initializer()) # load all non-reinforce weights from checkpoint saver = tf.train.Saver([v for v in tf.global_variables() if "reinforce" not in v.name and "Adagrad" not in v.name]) print("restoring non-reinforce variables...") curr_ckpt = util.load_ckpt(saver, sess) print("restored.") # save this model and quit new_fname = curr_ckpt + '_rl_init' print("saving model to %s..." % (new_fname)) new_saver = tf.train.Saver() # this one will save all variables that now exist new_saver.save(sess, new_fname) print("saved.") exit() def setup_training(self): """Does setup before starting training (run_training)""" train_dir = os.path.join(FLAGS.log_root, "train") if not os.path.exists(train_dir): os.makedirs(train_dir) if FLAGS.ac_training: dqn_train_dir = os.path.join(FLAGS.log_root, "dqn", "train") if not os.path.exists(dqn_train_dir): os.makedirs(dqn_train_dir) #replaybuffer_pcl_path = os.path.join(FLAGS.log_root, "replaybuffer.pcl") #if not os.path.exists(dqn_target_train_dir): os.makedirs(dqn_target_train_dir) self.model.build_graph() # build the graph if FLAGS.convert_to_reinforce_model: assert (FLAGS.rl_training or FLAGS.ac_training), "To convert your pointer model to a reinforce model, run with convert_to_reinforce_model=True and either rl_training=True or ac_training=True" self.convert_to_reinforce_model() if FLAGS.convert_to_coverage_model: assert FLAGS.coverage, "To convert your non-coverage model to a coverage model, run with convert_to_coverage_model=True and coverage=True" self.convert_to_coverage_model() if FLAGS.restore_best_model: self.restore_best_model() saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time # Loads pre-trained word-embedding. By default the model learns the embedding. 
if FLAGS.embedding: self.vocab.LoadWordEmbedding(FLAGS.embedding, FLAGS.emb_dim) word_vector = self.vocab.getWordEmbedding() self.sv = tf.train.Supervisor(logdir=train_dir, is_chief=True, saver=saver, summary_op=None, save_summaries_secs=60, # save summaries for tensorboard every 60 secs save_model_secs=60, # checkpoint every 60 secs global_step=self.model.global_step, init_feed_dict= {self.model.embedding_place:word_vector} if FLAGS.embedding else None ) self.summary_writer = self.sv.summary_writer self.sess = self.sv.prepare_or_wait_for_session(config=util.get_config()) if FLAGS.ac_training: tf.logging.info('DDQN building graph') t1 = time.time() # We create a separate graph for DDQN self.dqn_graph = tf.Graph() with self.dqn_graph.as_default(): self.dqn.build_graph() # build dqn graph tf.logging.info('building current network took {} seconds'.format(time.time()-t1)) self.dqn_target.build_graph() # build dqn target graph tf.logging.info('building target network took {} seconds'.format(time.time()-t1)) dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time self.dqn_sv = tf.train.Supervisor(logdir=dqn_train_dir, is_chief=True, saver=dqn_saver, summary_op=None, save_summaries_secs=60, # save summaries for tensorboard every 60 secs save_model_secs=60, # checkpoint every 60 secs global_step=self.dqn.global_step, ) self.dqn_summary_writer = self.dqn_sv.summary_writer self.dqn_sess = self.dqn_sv.prepare_or_wait_for_session(config=util.get_config()) ''' #### TODO: try loading a previously saved replay buffer # right now this doesn't work due to running DQN on a thread if os.path.exists(replaybuffer_pcl_path): tf.logging.info('Loading Replay Buffer...') try: self.replay_buffer = pickle.load(open(replaybuffer_pcl_path, "rb")) tf.logging.info('Replay Buffer loaded...') except: tf.logging.info('Couldn\'t load Replay Buffer file...') self.replay_buffer = ReplayBuffer(self.dqn_hps) else: self.replay_buffer = ReplayBuffer(self.dqn_hps) tf.logging.info("Building DDQN took {} seconds".format(time.time()-t1)) ''' self.replay_buffer = ReplayBuffer(self.dqn_hps) tf.logging.info("Preparing or waiting for session...") tf.logging.info("Created session.") try: self.run_training() # this is an infinite loop until interrupted except (KeyboardInterrupt, SystemExit): tf.logging.info("Caught keyboard interrupt on worker. 
Stopping supervisor...") self.sv.stop() if FLAGS.ac_training: self.dqn_sv.stop() def run_training(self): """Repeatedly runs training iterations, logging loss to screen and writing summaries""" tf.logging.info("Starting run_training") if FLAGS.debug: # start the tensorflow debugger self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess) self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan) self.train_step = 0 if FLAGS.ac_training: # DDQN training is done asynchronously along with model training tf.logging.info('Starting DQN training thread...') self.dqn_train_step = 0 self.thrd_dqn_training = Thread(target=self.dqn_training) self.thrd_dqn_training.daemon = True self.thrd_dqn_training.start() watcher = Thread(target=self.watch_threads) watcher.daemon = True watcher.start() # starting the main thread tf.logging.info('Starting Seq2Seq training...') while True: # repeats until interrupted batch = self.batcher.next_batch() t0=time.time() if FLAGS.ac_training: # For DDQN, we first collect the model output to calculate the reward and Q-estimates # Then we fix the estimation either using our target network or using the true Q-values # This process will usually take time and we are working on improving it. transitions = self.model.collect_dqn_transitions(self.sess, batch, self.train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps) tf.logging.info('Q-values collection time: {}'.format(time.time()-t0)) # whenever we are working with the DDQN, we switch using DDQN graph rather than default graph with self.dqn_graph.as_default(): batch_len = len(transitions) # we use current decoder state to predict q_estimates, use_state_prime = False b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = False, max_art_oovs = batch.max_art_oovs) # we also get the next decoder state to correct the estimation, use_state_prime = True b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs) # use current DQN to estimate values from current decoder state dqn_results = self.dqn.run_test_steps(sess=self.dqn_sess, x= b._x, return_best_action=True) q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size) dqn_best_action = dqn_results['best_action'] #dqn_q_estimate_loss = dqn_results['loss'] # use target DQN to estimate values for the next decoder state dqn_target_results = self.dqn_target.run_test_steps(self.dqn_sess, x= b_prime._x) q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size) # we need to expand the q_estimates to match the input batch max_art_oov # we use the q_estimate of UNK token for all the OOV tokens q_estimates = np.concatenate([q_estimates, np.reshape(q_estimates[:,0],[-1,1])*np.ones((len(transitions),batch.max_art_oovs))],axis=-1) # modify Q-estimates using the result collected from current and target DQN. 
# check algorithm 5 in the paper for more info: https://arxiv.org/pdf/1805.09461.pdf for i, tr in enumerate(transitions): if tr.done: q_estimates[i][tr.action] = tr.reward else: q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]] # use scheduled sampling to whether use true Q-values or DDQN estimation if FLAGS.dqn_scheduled_sampling: q_estimates = self.scheduled_sampling(batch_len, FLAGS.sampling_probability, b._y_extended, q_estimates) if not FLAGS.calculate_true_q: # when we are not training DDQN based on true Q-values, # we need to update Q-values in our transitions based on the q_estimates we collected from DQN current network. for trans, q_val in zip(transitions,q_estimates): trans.q_values = q_val # each have the size vocab_extended q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended) # Once we are done with modifying Q-values, we can use them to train the DDQN model. # In this paper, we use a priority experience buffer which always selects states with higher quality # to train the DDQN. The following line will add batch_size * max_dec_steps experiences to the replay buffer. # As mentioned before, the DDQN training is asynchronous. Therefore, once the related queues for DDQN training # are full, the DDQN will start the training. self.replay_buffer.add(transitions) # If dqn_pretrain flag is on, it means that we use a fixed Actor to only collect experiences for # DDQN pre-training if FLAGS.dqn_pretrain: tf.logging.info('RUNNNING DQN PRETRAIN: Adding data to relplay buffer only...') continue # if not, use the q_estimation to update the loss. results = self.model.run_train_steps(self.sess, batch, self.train_step, q_estimates) else: results = self.model.run_train_steps(self.sess, batch, self.train_step) t1=time.time() # get the summaries and iteration number so we can write summaries to tensorboard summaries = results['summaries'] # we will write these summaries to tensorboard using summary_writer self.train_step = results['global_step'] # we need this to update our running average loss tf.logging.info('seconds for training step {}: {}'.format(self.train_step, t1-t0)) printer_helper = {} printer_helper['pgen_loss']= results['pgen_loss'] if FLAGS.coverage: printer_helper['coverage_loss'] = results['coverage_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss'] else: printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['shared_loss'] = results['shared_loss'] printer_helper['rl_loss'] = results['rl_loss'] printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs'] if FLAGS.rl_training: printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values']) printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values']) printer_helper['r_diff'] = printer_helper['greedy_r'] - printer_helper['sampled_r'] if FLAGS.ac_training: printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss)>0 else 0 for (k,v) in printer_helper.items(): if not np.isfinite(v): raise Exception("{} is not finite. 
Stopping.".format(k)) tf.logging.info('{}: {}\t'.format(k,v)) tf.logging.info('-------------------------------------------') self.summary_writer.add_summary(summaries, self.train_step) # write the summaries if self.train_step % 100 == 0: # flush the summary writer every so often self.summary_writer.flush() if FLAGS.ac_training: self.dqn_summary_writer.flush() if self.train_step > FLAGS.max_iter: break def dqn_training(self): """ training the DDQN network.""" try: while True: if self.dqn_train_step == FLAGS.dqn_pretrain_steps: raise SystemExit() _t = time.time() self.avg_dqn_loss = [] avg_dqn_target_loss = [] # Get a batch of size dqn_batch_size from replay buffer to train the model dqn_batch = self.replay_buffer.next_batch() if dqn_batch is None: tf.logging.info('replay buffer not loaded enough yet...') time.sleep(60) continue # Run train step for Current DQN model and collect the results dqn_results = self.dqn.run_train_steps(self.dqn_sess, dqn_batch) # Run test step for Target DQN model and collect the results and monitor the difference in loss between the two dqn_target_results = self.dqn_target.run_test_steps(self.dqn_sess, x=dqn_batch._x, y=dqn_batch._y, return_loss=True) self.dqn_train_step = dqn_results['global_step'] self.dqn_summary_writer.add_summary(dqn_results['summaries'], self.dqn_train_step) # write the summaries self.avg_dqn_loss.append(dqn_results['loss']) avg_dqn_target_loss.append(dqn_target_results['loss']) self.dqn_train_step = self.dqn_train_step + 1 tf.logging.info('seconds for training dqn model: {}'.format(time.time()-_t)) # UPDATING TARGET DDQN NETWORK WITH CURRENT MODEL with self.dqn_graph.as_default(): current_model_weights = self.dqn_sess.run([self.dqn.model_trainables])[0] # get weights of current model self.dqn_target.run_update_weights(self.dqn_sess, self.dqn_train_step, current_model_weights) # update target model weights with current model weights tf.logging.info('DQN loss at step {}: {}'.format(self.dqn_train_step, np.mean(self.avg_dqn_loss))) tf.logging.info('DQN Target loss at step {}: {}'.format(self.dqn_train_step, np.mean(avg_dqn_target_loss))) # sleeping is required if you want the keyboard interuption to work time.sleep(FLAGS.dqn_sleep_time) except (KeyboardInterrupt, SystemExit): tf.logging.info("Caught keyboard interrupt on worker. Stopping supervisor...") self.sv.stop() self.dqn_sv.stop() def watch_threads(self): """Watch example queue and batch queue threads and restart if dead.""" while True: time.sleep(60) if not self.thrd_dqn_training.is_alive(): # if the thread is dead tf.logging.error('Found DQN Learning thread dead. Restarting.') self.thrd_dqn_training = Thread(target=self.dqn_training) self.thrd_dqn_training.daemon = True self.thrd_dqn_training.start() def run_eval(self): """Repeatedly runs eval iterations, logging to screen and writing summaries. 
Saves the model with the best loss seen so far.""" self.model.build_graph() # build the graph saver = tf.train.Saver(max_to_keep=3) # we will keep 3 best checkpoints at a time sess = tf.Session(config=util.get_config()) if FLAGS.embedding: sess.run(tf.global_variables_initializer(),feed_dict={self.model.embedding_place:self.word_vector}) eval_dir = os.path.join(FLAGS.log_root, "eval") # make a subdir of the root dir for eval data bestmodel_save_path = os.path.join(eval_dir, 'bestmodel') # this is where checkpoints of best models are saved self.summary_writer = tf.summary.FileWriter(eval_dir) if FLAGS.ac_training: tf.logging.info('DDQN building graph') t1 = time.time() dqn_graph = tf.Graph() with dqn_graph.as_default(): self.dqn.build_graph() # build dqn graph tf.logging.info('building current network took {} seconds'.format(time.time()-t1)) self.dqn_target.build_graph() # build dqn target graph tf.logging.info('building target network took {} seconds'.format(time.time()-t1)) dqn_saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time dqn_sess = tf.Session(config=util.get_config()) dqn_train_step = 0 replay_buffer = ReplayBuffer(self.dqn_hps) running_avg_loss = 0 # the eval job keeps a smoother, running average loss to tell it when to implement early stopping best_loss = self.restore_best_eval_model() # will hold the best loss achieved so far train_step = 0 while True: _ = util.load_ckpt(saver, sess) # load a new checkpoint if FLAGS.ac_training: _ = util.load_dqn_ckpt(dqn_saver, dqn_sess) # load a new checkpoint processed_batch = 0 avg_losses = [] # evaluate for 100 * batch_size before comparing the loss # we do this due to memory constraint, best to run eval on different machines with large batch size while processed_batch < 100*FLAGS.batch_size: processed_batch += FLAGS.batch_size batch = self.batcher.next_batch() # get the next batch if FLAGS.ac_training: t0 = time.time() transitions = self.model.collect_dqn_transitions(sess, batch, train_step, batch.max_art_oovs) # len(batch_size * k * max_dec_steps) tf.logging.info('Q values collection time: {}'.format(time.time()-t0)) with dqn_graph.as_default(): # if using true Q-value to train DQN network, # we do this as the pre-training for the DQN network to get better estimates batch_len = len(transitions) b = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs) b_prime = ReplayBuffer.create_batch(self.dqn_hps, transitions,len(transitions), use_state_prime = True, max_art_oovs = batch.max_art_oovs) dqn_results = self.dqn.run_test_steps(sess=dqn_sess, x= b._x, return_best_action=True) q_estimates = dqn_results['estimates'] # shape (len(transitions), vocab_size) dqn_best_action = dqn_results['best_action'] tf.logging.info('running test step on dqn_target') dqn_target_results = self.dqn_target.run_test_steps(dqn_sess, x= b_prime._x) q_vals_new_t = dqn_target_results['estimates'] # shape (len(transitions), vocab_size) # we need to expand the q_estimates to match the input batch max_art_oov q_estimates = np.concatenate([q_estimates,np.zeros((len(transitions),batch.max_art_oovs))],axis=-1) tf.logging.info('fixing the action q-estimates') for i, tr in enumerate(transitions): if tr.done: q_estimates[i][tr.action] = tr.reward else: q_estimates[i][tr.action] = tr.reward + FLAGS.gamma * q_vals_new_t[i][dqn_best_action[i]] if FLAGS.dqn_scheduled_sampling: tf.logging.info('scheduled sampling on q-estimates') q_estimates = self.scheduled_sampling(batch_len, 
FLAGS.sampling_probability, b._y_extended, q_estimates) if not FLAGS.calculate_true_q: # when we are not training DQN based on true Q-values # we need to update Q-values in our transitions based on this q_estimates we collected from DQN current network. for trans, q_val in zip(transitions,q_estimates): trans.q_values = q_val # each have the size vocab_extended q_estimates = np.reshape(q_estimates, [FLAGS.batch_size, FLAGS.k, FLAGS.max_dec_steps, -1]) # shape (batch_size, k, max_dec_steps, vocab_size_extended) tf.logging.info('run eval step on seq2seq model.') t0=time.time() results = self.model.run_eval_step(sess, batch, train_step, q_estimates) t1=time.time() else: tf.logging.info('run eval step on seq2seq model.') t0=time.time() results = self.model.run_eval_step(sess, batch, train_step) t1=time.time() tf.logging.info('experiment: {}'.format(FLAGS.exp_name)) tf.logging.info('processed_batch: {}, seconds for batch: {}'.format(processed_batch, t1-t0)) printer_helper = {} loss = printer_helper['pgen_loss']= results['pgen_loss'] if FLAGS.coverage: printer_helper['coverage_loss'] = results['coverage_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['rl_cov_total_loss']= results['reinforce_cov_total_loss'] loss = printer_helper['pointer_cov_total_loss'] = results['pointer_cov_total_loss'] if FLAGS.rl_training or FLAGS.ac_training: printer_helper['shared_loss'] = results['shared_loss'] printer_helper['rl_loss'] = results['rl_loss'] printer_helper['rl_avg_logprobs'] = results['rl_avg_logprobs'] if FLAGS.rl_training: printer_helper['sampled_r'] = np.mean(results['sampled_sentence_r_values']) printer_helper['greedy_r'] = np.mean(results['greedy_sentence_r_values']) printer_helper['r_diff'] = printer_helper['greedy_r'] - printer_helper['sampled_r'] if FLAGS.ac_training: printer_helper['dqn_loss'] = np.mean(self.avg_dqn_loss) if len(self.avg_dqn_loss) > 0 else 0 for (k,v) in printer_helper.items(): if not np.isfinite(v): raise Exception("{} is not finite. Stopping.".format(k)) tf.logging.info('{}: {}\t'.format(k,v)) # add summaries summaries = results['summaries'] train_step = results['global_step'] self.summary_writer.add_summary(summaries, train_step) # calculate running avg loss avg_losses.append(self.calc_running_avg_loss(np.asscalar(loss), running_avg_loss, train_step)) tf.logging.info('-------------------------------------------') running_avg_loss = np.mean(avg_losses) tf.logging.info('==========================================') tf.logging.info('best_loss: {}\trunning_avg_loss: {}\t'.format(best_loss, running_avg_loss)) tf.logging.info('==========================================') # If running_avg_loss is best so far, save this checkpoint (early stopping). # These checkpoints will appear as bestmodel-<iteration_number> in the eval dir if best_loss is None or running_avg_loss < best_loss: tf.logging.info('Found new best model with %.3f running_avg_loss. 
Saving to %s', running_avg_loss, bestmodel_save_path) saver.save(sess, bestmodel_save_path, global_step=train_step, latest_filename='checkpoint_best') best_loss = running_avg_loss # flush the summary writer every so often if train_step % 100 == 0: self.summary_writer.flush() #time.sleep(600) # run eval every 10 minute def main(self, unused_argv): if len(unused_argv) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) tf.logging.set_verbosity(tf.logging.INFO) # choose what level of logging you want tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary flags = getattr(FLAGS,"__flags") if not os.path.exists(FLAGS.log_root): if FLAGS.mode=="train": os.makedirs(FLAGS.log_root) else: raise Exception("Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root)) fw = open('{}/config.txt'.format(FLAGS.log_root), 'w') for k, v in flags.items(): fw.write('{}\t{}\n'.format(k, v)) fw.close() self.vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary # If in decode mode, set batch_size = beam_size # Reason: in decode mode, we decode one example at a time. # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses. if FLAGS.mode == 'decode': FLAGS.batch_size = FLAGS.beam_size # If single_pass=True, check we're in decode mode if FLAGS.single_pass and FLAGS.mode!='decode': raise Exception("The single_pass flag should only be True in decode mode") # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hparam_list = ['mode', 'lr', 'gpu_num', #'sampled_greedy_flag', 'gamma', 'eta', 'fixed_eta', 'reward_function', 'intradecoder', 'use_temporal_attention', 'ac_training','rl_training', 'matrix_attention', 'calculate_true_q', 'enc_hidden_dim', 'dec_hidden_dim', 'k', 'scheduled_sampling', 'sampling_probability','fixed_sampling_probability', 'alpha', 'hard_argmax', 'greedy_scheduled_sampling', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'dqn_scheduled_sampling', 'dqn_sleep_time', 'E2EBackProp', 'coverage', 'cov_loss_wt', 'pointer_gen'] hps_dict = {} for key,val in flags.items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val.value # add it to the dict if FLAGS.ac_training: hps_dict.update({'dqn_input_feature_len':(FLAGS.dec_hidden_dim)}) self.hps = namedtuple("HParams", hps_dict.keys())(**hps_dict) # creating all the required parameters for DDQN model. 
if FLAGS.ac_training: hparam_list = ['lr', 'dqn_gpu_num', 'dqn_layers', 'dqn_replay_buffer_size', 'dqn_batch_size', 'dqn_target_update', 'dueling_net', 'dqn_polyak_averaging', 'dqn_sleep_time', 'dqn_scheduled_sampling', 'max_grad_norm'] hps_dict = {} for key,val in flags.items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val.value # add it to the dict hps_dict.update({'dqn_input_feature_len':(FLAGS.dec_hidden_dim)}) hps_dict.update({'vocab_size':self.vocab.size()}) self.dqn_hps = namedtuple("HParams", hps_dict.keys())(**hps_dict) # Create a batcher object that will create minibatches of data self.batcher = Batcher(FLAGS.data_path, self.vocab, self.hps, single_pass=FLAGS.single_pass, decode_after=FLAGS.decode_after) tf.set_random_seed(111) # a seed value for randomness if self.hps.mode == 'train': print("creating model...") self.model = SummarizationModel(self.hps, self.vocab) if FLAGS.ac_training: # current DQN with parameters \Psi self.dqn = DQN(self.dqn_hps,'current') # target DQN with parameters \Psi^{\prime} self.dqn_target = DQN(self.dqn_hps,'target') self.setup_training() elif self.hps.mode == 'eval': self.model = SummarizationModel(self.hps, self.vocab) if FLAGS.ac_training: self.dqn = DQN(self.dqn_hps,'current') self.dqn_target = DQN(self.dqn_hps,'target') self.run_eval() elif self.hps.mode == 'decode': decode_model_hps = self.hps # This will be the hyperparameters for the decoder model decode_model_hps = self.hps._replace(max_dec_steps=1) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries model = SummarizationModel(decode_model_hps, self.vocab) if FLAGS.ac_training: # We need our target DDQN network for collecting Q-estimation at each decoder step. dqn_target = DQN(self.dqn_hps,'target') else: dqn_target = None decoder = BeamSearchDecoder(model, self.batcher, self.vocab, dqn = dqn_target) decoder.decode() # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once) else: raise ValueError("The 'mode' flag must be one of train/eval/decode") # Scheduled sampling used for either selecting true Q-estimates or the DDQN estimation # based on https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/ScheduledEmbeddingTrainingHelper def scheduled_sampling(self, batch_size, sampling_probability, true, estimate): with variable_scope.variable_scope("ScheduledEmbedding"): # Return -1s where we do not sample, and sample_ids elsewhere select_sampler = bernoulli.Bernoulli(probs=sampling_probability, dtype=tf.bool) select_sample = select_sampler.sample(sample_shape=batch_size) sample_ids = array_ops.where( select_sample, tf.range(batch_size), gen_array_ops.fill([batch_size], -1)) where_sampling = math_ops.cast( array_ops.where(sample_ids > -1), tf.int32) where_not_sampling = math_ops.cast( array_ops.where(sample_ids <= -1), tf.int32) _estimate = array_ops.gather_nd(estimate, where_sampling) _true = array_ops.gather_nd(true, where_not_sampling) base_shape = array_ops.shape(true) result1 = array_ops.scatter_nd(indices=where_sampling, updates=_estimate, shape=base_shape) result2 = array_ops.scatter_nd(indices=where_not_sampling, updates=_true, shape=base_shape) result = result1 + result2 return result
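For reference, the Bernoulli/scatter logic in scheduled_sampling above reduces to a per-row mix of the two Q-value matrices. A minimal NumPy sketch of the same idea (the helper name and signature are illustrative, not part of the original code):

import numpy as np

def scheduled_sampling_np(batch_size, sampling_probability, true_q, estimated_q):
    # For each row, keep the DQN estimate with probability `sampling_probability`,
    # otherwise fall back to the true Q-values. Shapes: (batch_size, vocab_size).
    use_estimate = np.random.random(batch_size) < sampling_probability
    return np.where(use_estimate[:, None], estimated_q, true_q)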
def train(sess, env, args, actor, critic, actor_noise): # Set up summary Ops summary_ops, summary_vars = build_summaries() sess.run(tf.global_variables_initializer()) writer = tf.summary.FileWriter(args['summary_dir'], sess.graph) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed'])) for i in range(int(args['max_episodes'])): s = env.reset() ep_reward = 0 ep_ave_max_q = 0 for j in range(int(args['max_episode_len'])): if args['render_env']: env.render() # Added exploration noise #a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i)) a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise() s2, r, terminal, info = env.step(a[0]) replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r, terminal, np.reshape(s2, (actor.s_dim,))) # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > int(args['minibatch_size']): s_batch, a_batch, r_batch, t_batch, s2_batch = \ replay_buffer.sample_batch(int(args['minibatch_size'])) # Calculate targets target_q = critic.predict_target( s2_batch, actor.predict_target(s2_batch)) y_i = [] for k in range(int(args['minibatch_size'])): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + critic.gamma * target_q[k]) # Update the critic given the targets predicted_q_value, _ = critic.train( s_batch, a_batch, np.reshape(y_i, (int(args['minibatch_size']), 1))) ep_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient a_outs = actor.predict(s_batch) grads = critic.action_gradients(s_batch, a_outs) actor.train(s_batch, grads[0]) # Update target networks actor.update_target_network() critic.update_target_network() s = s2 ep_reward += r if terminal: summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: ep_reward, summary_vars[1]: ep_ave_max_q / float(j) }) writer.add_summary(summary_str, i) writer.flush() print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward), \ i, (ep_ave_max_q / float(j)))) break
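The ReplayBuffer(buffer_size, random_seed) used by this training loop is not shown; assuming a plain uniform-sampling buffer with the same add/size/sample_batch interface, a minimal sketch could look like the following (not the original implementation):

import random
from collections import deque
import numpy as np

class ReplayBuffer:
    def __init__(self, buffer_size, random_seed=123):
        self.buffer = deque(maxlen=buffer_size)  # oldest transitions fall out first
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        self.buffer.append((s, a, r, t, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        s, a, r, t, s2 = map(np.array, zip(*batch))
        return s, a, r, t, s2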
action = Noise(-bound, bound, action, 10*(EPSILON**((episode/3)+step))) # execute action a and observe reward r and observe new state s' state_prime, reward, terminal, _ = env.step(action) average += reward # store transition (s, a, r, s') in replay with error if prioritized if PRIORITIZED: q = critic.q([state], [action]) q_prime = critic.q_target( [state_prime], actor.act_target([state_prime]) ) y = reward + GAMMA*(q_prime * (1 - terminal)) loss = (y-q)**2 replay.add( state, action, reward, state_prime, terminal, e=loss[0][0] ) else: replay.add(state, action, reward, state_prime, terminal) state = state_prime if replay.size() > BATCH_SIZE: # sample a batch of transitions (s, a, r, s') from replay batch = replay.sample_batch(BATCH_SIZE) batch_state = np.reshape(batch[0], (BATCH_SIZE, s_dim)) batch_action = np.reshape(batch[1], (BATCH_SIZE, a_dim)) batch_reward = np.reshape(batch[2], (BATCH_SIZE, 1)) batch_state_prime = np.reshape(batch[3], (BATCH_SIZE, s_dim)) batch_terminal = np.reshape(batch[4], (BATCH_SIZE, 1)) idx = batch[5]
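When PRIORITIZED is set, each transition above is stored with its squared TD error as priority. A minimal sketch of proportional prioritized sampling over such priorities (names are illustrative, not the buffer used here):

import numpy as np

def sample_proportional(priorities, batch_size, alpha=0.6):
    # Sample indices with probability proportional to priority**alpha,
    # as in proportional prioritized experience replay.
    p = np.asarray(priorities, dtype=np.float64) ** alpha
    p /= p.sum()
    return np.random.choice(len(p), size=batch_size, p=p)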
class Agent: def __init__(self, lr, state_shape, num_actions, batch_size, max_mem_size=100000): self.lr = lr self.gamma = 0.99 self.action_space = list(range(num_actions)) self.batch_size = batch_size self.epsilon = Lerper(start=1.0, end=0.01, num_steps=2000) self.memory = ReplayBuffer(max_mem_size, state_shape) self.net = Network(lr, inputChannels=3, numActions=9) def choose_action(self, observation): if np.random.random() > self.epsilon.value(): state = torch.tensor(observation).float().detach() state = state.to(self.net.device) state = state.unsqueeze(0) q_values = self.net(state) action = torch.argmax(q_values).item() return action else: return np.random.choice(self.action_space) def store_memory(self, state, action, reward, state_, done, invalid_move): self.memory.add(state, action, reward, state_, done, invalid_move) def learn(self): if self.memory.mem_count < self.batch_size: return states, actions, rewards, states_, dones, invalid_moves = \ self.memory.sample(self.batch_size) states = torch.tensor(states).to(self.net.device) actions = torch.tensor(actions).to(self.net.device) rewards = torch.tensor(rewards).to(self.net.device) states_ = torch.tensor(states_).to(self.net.device) dones = torch.tensor(dones).to(self.net.device) invalid_move = torch.tensor(invalid_moves).to(self.net.device) batch_index = np.arange(self.batch_size, dtype=np.int64) q_values = self.net(states)[batch_index, actions] q_values_ = self.net(states_) action_qs_ = torch.max(q_values_, dim=1)[0] action_qs_[dones] = 0.0 q_target = rewards + self.gamma * action_qs_ td = q_target - q_values self.net.optimizer.zero_grad() loss = (td**2.0).mean() loss.backward() self.net.optimizer.step() self.epsilon.step()
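Lerper is referenced above but not defined in this excerpt; assuming it linearly anneals epsilon from start to end over num_steps calls to step(), a minimal sketch:

class Lerper:
    """Linear interpolation schedule: value() moves from start to end over num_steps steps."""
    def __init__(self, start, end, num_steps):
        self.start, self.end, self.num_steps = start, end, num_steps
        self.count = 0

    def value(self):
        frac = min(self.count / float(self.num_steps), 1.0)
        return self.start + frac * (self.end - self.start)

    def step(self):
        self.count += 1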
class DDPG: """docstring for DDPG""" def __init__(self, state_dim, action_dim): """name for uploading resuults""" self.name = 'DDPG' self.time_step = 0 # self.atten_rate = 1 """Randomly initialize actor network and critic network""" """and both their target networks""" self.state_dim = state_dim self.action_dim = action_dim self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) """initialize replay buffer""" self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) """Initialize a random process the Ornstein-Uhlenbeck process for action exploration""" self.exploration_noise = OUNoise(self.action_dim) """Initialize a Treading""" self.threading = threading.Thread(target=self.train, name='LoopThread--DDPG') def train(self): # if self.time_step ==0: # print("Begins Training!!!") #print("Training Begins") self.time_step += 1 """Sample a random minibatch of N transitions from replay buffer""" """take out BATCH_SIZE sets of data""" minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) """resize the action_batch shape to [BATCH_SIZE, self.action_dim]""" action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) """Calculate y_batch(reward)""" next_action_batch = self.actor_network.target_action(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) """Update critic by minimizing the loss L (training)""" self.critic_network.train(y_batch, state_batch, action_batch) """Update the actor policy using the sampled gradient:""" action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) """Update the target networks""" self.actor_network.update_target() self.critic_network.update_target() #print("Training Finished") def noise_action(self, state): """Select action a_t according to the current policy and exploration noise""" action = self.actor_network.action(state) exp_noise = self.exploration_noise.noise() action += exp_noise # action[0] = np.clip(action[0], 0, 1) # action[1] = np.clip(action[1], -1, 1) return action def action(self, state): action = self.actor_network.action(state) # action[0] = np.clip(action[0], 0, 1) # action[1] = np.clip(action[1], -1, 1) return action def perceive(self, state, action, reward, next_state, done): """Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer""" self.replay_buffer.add(state, action, reward, next_state, done) """Store transitions to replay start size then start training""" # if self.replay_buffer.count() % 1000 == 0: # print("The buffer count is ", self.replay_buffer.count()) if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() # self.atten_rate *= 0.99995 if not self.threading.is_alive(): self.threading = threading.Thread(target=self.train, name='LoopThread--DDPG') self.threading.start() """SAVE NETWORK""" if 
self.time_step % 100 == 0: print("Training_time_step:", self.time_step) if self.time_step % 1000 == 0: print("!!!!!!!save model success!!!!!!!!") self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) """Re-iniitialize the random process when an episode ends""" if done: self.exploration_noise.reset()
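The OUNoise process used for exploration here is standard Ornstein-Uhlenbeck noise. A minimal sketch, assuming an (action_dim, mu, theta, sigma) parameterisation rather than the exact class used in this code:

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated noise for DDPG exploration."""
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim, self.mu, self.theta, self.sigma = action_dim, mu, theta, sigma
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); the state drifts back toward mu
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state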
] hlt.send_frame(moves) game_map.get_frame() new_targets = window.get_targets(game_map, owned_squares, directions) done = [int(t.owner == id) for t in new_targets] new_states = window.prepare_for_input(game_map, new_targets, myID) rewards = reward.reward(owned_squares, old_targets, new_targets, myID) #logging.debug(rewards) for i in range(len(owned_squares)): r.add(old_states[i], directions[i], rewards[i], new_states[i], done[i]) if len(r) >= BATCH_SIZE: batch = r.get_batch(BATCH_SIZE) loss, rewar = model.train(batch) writer.save_progress(tm.content["timesteps"], loss, rewar) #if(timestep % 10 == 0): #logging.debug(model.trainable_variables[0]) tm.content["timesteps"] += 1
class DQNAgent: def __init__(self, input_shape: tuple, action_size: int, seed: int, device: str, buffer_size: int, batch_size: int, gamma: float, lr: float, tau: float, update_every: int, replay_after: int, model: nn.Module, loss: str, **kwargs): """Initialize an Agent object. Params ====== input_shape (tuple): dimension of each state (C, H, W) action_size (int): dimension of each action seed (int): random seed device(string): Use Gpu or CPU buffer_size (int): replay buffer size batch_size (int): minibatch size gamma (float): discount factor lr (float): learning rate update_every (int): how often to update the network replay_after (int): After which replay to be started model(Model): Pytorch Model """ self.input_shape = input_shape self.action_size = action_size random.seed(seed) self.device = device self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.lr = lr self.update_every = update_every self.replay_after = replay_after self.DQN = model self.tau = tau # Q-Network self.policy_net = self.DQN(input_shape, action_size).to(self.device) self.target_net = self.DQN(input_shape, action_size).to(self.device) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr) # Replay memory self.memory = ReplayBuffer(self.buffer_size, self.batch_size, seed, self.device) self.t_step = 0 self.loss = loss self.criterion = nn.SmoothL1Loss() if loss == 'Huber' else nn.MSELoss() def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # If enough samples are available in memory, get random subset # and learn if len(self.memory) > self.replay_after: experiences = self.memory.sample() self.learn(experiences) def act(self, state, eps=0.): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).unsqueeze(0).to(self.device) self.policy_net.eval() with torch.no_grad(): action_values = self.policy_net(state) self.policy_net.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences): states, actions, rewards, next_states, dones = experiences # Get expected Q values from policy model q_expected_current = self.policy_net(states) q_expected = q_expected_current.gather(1, actions.unsqueeze(1)).squeeze(1) # Get max predicted Q values (for next states) from target model q_targets_next = self.target_net(next_states).detach().max(1)[0] # Compute Q targets for current states q_targets = rewards + (self.gamma * q_targets_next * (1 - dones)) # Compute loss loss = self.criterion(q_expected, q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.soft_update(self.policy_net, self.target_net, self.tau) # θ'=θ×τ+θ'×(1−τ) def soft_update(self, policy_model, target_model, tau): for target_param, policy_param in zip(target_model.parameters(), policy_model.parameters()): target_param.data.copy_(tau * policy_param.data + (1.0 - tau) * target_param.data) def evaluate_on_fixed_set(self, fixed_states: list) -> float: """ :param fixed_states: preprocessed fixed set of states :return: """ action_values = [] self.policy_net.eval() with torch.no_grad(): state = stack_frame(None, fixed_states[0], True) for frame in fixed_states[1:]: state_tensor = 
torch.from_numpy(state).unsqueeze(0).to( self.device) max_action_value = np.max( self.policy_net(state_tensor).cpu().data.numpy()) next_state = stack_frame(state, frame, False) state = next_state action_values.append(max_action_value) self.policy_net.train() return np.mean(action_values)
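stack_frame is assumed here to maintain a rolling stack of the most recent preprocessed frames (the usual Atari frame-stacking trick); a minimal 4-frame sketch, not the original helper:

import numpy as np

def stack_frame(stacked_frames, frame, is_new_episode, num_stack=4):
    # frame: preprocessed 2D array (H, W); returns an array of shape (num_stack, H, W)
    if is_new_episode or stacked_frames is None:
        return np.stack([frame] * num_stack, axis=0)  # repeat the first frame
    return np.concatenate([stacked_frames[1:], frame[np.newaxis]], axis=0)  # drop oldest, append newest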
class DdpgAgent: """ A Deep Deterministic Policy Gradient Agent. Interacts with and learns from the environment. """ def __init__(self, num_agents, state_size, action_size, random_seed): """ Initialize an Agent object. Params ====== num_agents (int): number of agents observed at the same time. multiple agents are handled within the class. state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ if random_seed is not None: random.seed(random_seed) np.random.seed(random_seed) self.t_step = 0 # A counter that increases each time the "step" function is executed self.state_size = state_size self.action_size = action_size # Actor Network (w/ Target Network) self.actor_local = ActorNetwork(state_size, action_size, USE_BATCH_NORM, random_seed, fc1_units=FC1_UNITS, fc2_units=FC2_UNITS, fc3_units=FC3_UNITS).to(device) self.actor_target = ActorNetwork(state_size, action_size, USE_BATCH_NORM, random_seed, fc1_units=FC1_UNITS, fc2_units=FC2_UNITS, fc3_units=FC3_UNITS).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR, weight_decay=WEIGHT_DECAY_ACTOR) # self.actor_optimizer = optim.RMSprop(self.actor_local.parameters(), lr=LR_ACTOR, # weight_decay=WEIGHT_DECAY_ACTOR) # Also solves it, but Adam quicker # Critic Network (w/ Target Network) self.critic_local = CriticNetwork(state_size, action_size, USE_BATCH_NORM, random_seed, fc1_units=FC1_UNITS, fc2_units=FC2_UNITS, fc3_units=FC3_UNITS).to(device) self.critic_target = CriticNetwork(state_size, action_size, USE_BATCH_NORM, random_seed, fc1_units=FC1_UNITS, fc2_units=FC2_UNITS, fc3_units=FC3_UNITS).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY_CRITIC) # self.critic_optimizer = optim.RMSprop(self.critic_local.parameters(), lr=LR_CRITIC, # weight_decay=WEIGHT_DECAY_CRITIC) # Also solves it, but Adam quicker # Make sure target is initiated with the same weight as the local network self.soft_update(self.actor_local, self.actor_target, 1) self.soft_update(self.critic_local, self.critic_target, 1) # Setting default modes for the networks # Target networks do not need to train, so always eval() # Local networks, in training mode, unless altered in code - eg when acting. 
self.actor_local.train() self.actor_target.eval() self.critic_local.train() self.critic_target.eval() # Action Noise process (encouraging exploration during training) # Could consider parameter noise in future as a potentially better alternative / addition if ACTION_NOISE_METHOD == 'initial': self.noise = InitialOrnsteinUhlenbeckActionNoise( shape=(num_agents, action_size), random_seed=random_seed, x0=0, mu=0, theta=NOISE_THETA, sigma=NOISE_SIGMA) elif ACTION_NOISE_METHOD == 'adjusted': self.noise = AdjustedOrnsteinUhlenbeckActionNoise( shape=(num_agents, action_size), random_seed=random_seed, x0=0, mu=0, sigma=NOISE_SIGMA, theta=NOISE_THETA, dt=NOISE_DT, sigma_delta=NOISE_SIGMA_DELTA, ) else: raise ValueError('Unknown action noise method: ' + ACTION_NOISE_METHOD) # Replay memory self.memory = ReplayBuffer( buffer_size=REPLAY_BUFFER_SIZE, batch_size=BATCH_SIZE, sampling_method=REPLAY_BUFFER_SAMPLING_METHOD, random_seed=random_seed) def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" self.t_step += 1 # Save experience / reward self.memory.add(states, actions, rewards, next_states, dones) # Learn, if enough samples are available in memory, every UPDATE_EVERY steps if self.t_step % UPDATE_EVERY == 0: if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, states, add_action_noise=False): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(device) self.actor_local.eval( ) # train state is set right before actual training with torch.no_grad( ): # All calcs here with no_grad, but many examples didn't do this. Weirdly, this is slower.. return np.clip( self.actor_local(states).cpu().data.numpy() + (self.noise.sample() if add_action_noise else 0), -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """ Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): reward discount factor """ states, actions, rewards, next_states, dones = experiences self.actor_local.train( ) # critic_local is always in train state, but actor_local goes into eval with acting # Critic # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() if CLIP_GRADIENT_CRITIC: torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # Actor # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() if CLIP_GRADIENT_ACTOR: torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1) self.actor_optimizer.step() # Soft-Update of Target Networks self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """ Soft update target model parameters from local model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
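A typical interaction loop driving the agent above might look like the following sketch; the environment interface, episode count, and reward aggregation are placeholders rather than part of the original training script:

import numpy as np

def run_episodes(env, agent, n_episodes=200, max_steps=1000):
    scores = []
    for _ in range(n_episodes):
        states = env.reset()
        agent.reset()                                  # reset the OU action-noise process
        score = 0.0
        for _ in range(max_steps):
            actions = agent.act(states, add_action_noise=True)
            next_states, rewards, dones, _ = env.step(actions)
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            score += np.mean(rewards)
            if np.any(dones):
                break
        scores.append(score)
    return scores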
class DDPG: def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] # self.state_dim = env.observation_space.shape[0] * 2 self.action_dim = env.action_space.shape[0] self.time_step = 0 self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration # self.exploration_noise = OUNoise(self.action_dim) self.exploration_noise = OUNoise() # loading networks self.saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(MODEL_PATH) if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) my_config.logger.warn("Successfully loaded: %s" % (checkpoint.model_checkpoint_path)) else: my_config.logger.error("Could not find old network weights") def train(self): # my_config.logger.debug("......enter tain......") # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) noise = self.exploration_noise.noise(action) # if random.random() <= 0.5: # noise = self.exploration_noise.noise(action, # mu=[0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75, 0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5]) # else: # noise = self.exploration_noise.noise(action, # mu=[0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5, 0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75]) noise_action = action + noise clipped_noise_action = np.clip(noise_action, 0, 1) # if (self.time_step < 5): # my_config.logger.debug("action: %s, noise: %s, clip: %s" % (action, noise, clipped_noise_action)) return clipped_noise_action def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, 
reward, next_state, done) self.time_step = self.time_step + 1 # Store transitions until the replay buffer reaches the start size, then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-initialize the random process when an episode ends # if done: # self.exploration_noise.reset() def saveNetwork(self): # my_config.logger.warn("time step: %s, save model" % (self.time_step)) ckpt_file = os.path.join(MODEL_PATH, 'ltr') self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
total_rewards = 0 n_steps = 0 done = False state = env.reset() while not done: action = model.get_action(state) if epoch <= n_sample_epochs: next_state, reward, done, _ = env.step(env.action_space.sample()) else: next_state, reward, done, _ = env.step(action) next_state = next_state n_steps += 1 total_rewards += reward end = 0 if n_steps == env._max_episode_steps else float(done) memory.add(state, action, reward, next_state, end) state = next_state print("index: {}, steps: {}, total_rewards: {}".format( epoch, n_steps, total_rewards)) if epoch >= n_sample_epochs + start_epoch and epoch % n_epochs_per_train == 0: # Training q_vals = [] q_nexts = [] q_losses = [] policy_losses = [] alphas = [] for _ in range(n_steps_per_train): s, a, r, s_, d = memory.sample() q_val, q_next, alpha, q_loss, policy_loss = model.update(
class DQNAgent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, buffer_size, batch_size, gamma, tau, lr, update_every, seed=22, epsilon=1, epsilon_min=0.05, eps_decay=0.99): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr = lr self.update_every = update_every self.seed = random.seed(seed) self.learn_steps = 0 self.epsilon = epsilon self.epsilon_min = epsilon_min self.eps_decay = eps_decay # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr) # Replay memory self.memory = ReplayBuffer(self.action_size, self.buffer_size, self.batch_size, self.seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # sample experiences = self.memory.sample() self.learn(experiences) def act(self, state): """Returns actions for given state as per current policy. Params ====== state (array_like): current state """ self.epsilon = max(self.epsilon * self.eps_decay, self.epsilon_min) state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > self.epsilon: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.learn_steps += 1 # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target) def soft_update(self, local_model, target_model): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
def train(sess, env, actor, critic): # Set up summary Ops summary_ops, summary_vars = build_summaries() sess.run(tf.global_variables_initializer()) writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED) for i in range(MAX_EPISODES): s = env.reset() ep_reward = 0 ep_ave_max_q = 0 for j in range(MAX_EP_STEPS): if RENDER_ENV: env.render() action_probabilities = actor.predict(np.reshape(s, (1, STATE_DIM))) #print("action probs", action_probabilities) action = choose_action(action_probabilities) #print("action", action) s2, r, done, info = env.step(action) replay_buffer.add(np.reshape(s, (actor.s_dim,)), action, r, \ done, np.reshape(s2, (actor.s_dim,))) # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > MINIBATCH_SIZE: s_batch, a_batch, r_batch, done_batch, s2_batch = \ replay_buffer.sample_batch(MINIBATCH_SIZE) # action probs to actions # TODO how to deal with non-determinate policies # convert actor.predict_target(s2_batch) to actions # the problem is that critic expects actions to always be determinate, when in fact they are probab # Calculate targets # todo can we just feed real a and s batch here, no s2? # fixme critic predict expects 1D actions not 2D probabilities a_batch = np.reshape(a_batch, (len(a_batch), 1)) #print("sbshape", np.shape(s_batch), "\n a shape", np.shape(a_batch)) targnet_predicted_reward = critic.predict_target( s_batch, a_batch) #targnet_predicted_reward = critic.predict_target(s2_batch, actor.predict_target(s2_batch)) # print("targnet prediction", targnet_predicted_reward) # this is a whole reward tensor!! # actually, we mix observations with predictions by factor gamma # fixme I think we need to get rid of this block. targ reward is single value? obs_plus_predicted_rewards = [] for k in range(MINIBATCH_SIZE): if done_batch[k]: obs_plus_predicted_rewards.append( r_batch[k]) # final timestep is just the reward else: obs_plus_predicted_rewards.append( r_batch[k] + GAMMA * targnet_predicted_reward[k]) obs_plus_predicted_rewards = np.reshape( obs_plus_predicted_rewards, (len(obs_plus_predicted_rewards), 1)) # Update the critic given the targets predicted_q_value, _ = critic.train( s_batch, a_batch, obs_plus_predicted_rewards) #predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(observed_rewards, (MINIBATCH_SIZE, 1))) ep_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient #a_outs = actor.predict(s_batch) grads = critic.action_gradients(s_batch, a_batch) #grads = critic.action_gradients(s_batch, a_outs) # we aren't deterministic actor.train(s_batch, grads[0]) # Update target networks actor.update_target_network() critic.update_target_network() s = s2 ep_reward += r if done: summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: ep_reward, summary_vars[1]: ep_ave_max_q / float(j) }) writer.add_summary(summary_str, i) writer.flush() # TODO checkwhich ep reward is being printed print( # TODO replace maxq with something more interesting '| Reward: %.2i' % int(ep_reward), " | Episode", i, \ '| Qmax: %.4f' % (ep_ave_max_q / float(j))) break
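choose_action is referenced above but not shown; since actor.predict returns action probabilities here, a plausible reading is that it samples a discrete action index from that distribution. A sketch under that assumption:

import numpy as np

def choose_action(action_probabilities):
    # action_probabilities: shape (1, num_actions); sample one discrete action index
    p = np.asarray(action_probabilities).ravel()
    p = p / p.sum()  # guard against small numerical drift
    return np.random.choice(len(p), p=p)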
def train(sess, env, actor, critic, actor_noise, buffer_size, min_batch, ep): sess.run(tf.global_variables_initializer()) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(buffer_size, 0) max_episodes = ep max_steps = 1000 score_list = [] for i in range(max_episodes): state = env.reset() score = 0 for j in range(max_steps): env.render() action = actor.predict(np.reshape(state, (1, actor.s_dim))) + actor_noise() next_state, reward, done, info = env.step(action[0]) replay_buffer.add(np.reshape(state, (actor.s_dim,)), np.reshape(action, (actor.a_dim,)), reward, done, np.reshape(next_state, (actor.s_dim,))) # updating the network in batch if replay_buffer.size() < min_batch: continue states, actions, rewards, dones, next_states = replay_buffer.sample_batch(min_batch) target_q = critic.predict_target(next_states, actor.predict_target(next_states)) y = [] for k in range(min_batch): y.append(rewards[k] + critic.gamma * target_q[k] * (1-dones[k])) # Update the critic given the targets predicted_q_value, _ = critic.train(states, actions, np.reshape(y, (min_batch, 1))) # Update the actor policy using the sampled gradient a_outs = actor.predict(states) grads = critic.action_gradients(states, a_outs) actor.train(states, grads[0]) # Update target networks actor.update_target_network() critic.update_target_network() state = next_state score += reward if done: print('Reward: {} | Episode: {}/{}'.format(int(score), i, max_episodes)) break score_list.append(score) return score_list
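Since this train() returns score_list, a simple way to monitor learning is a trailing mean over recent episodes; a short usage sketch (the train() call is shown only as a commented example):

import numpy as np

def moving_average(scores, window=100):
    # Mean of the trailing `window` episode scores, with a growing window at the start.
    return [np.mean(scores[max(0, i - window + 1):i + 1]) for i in range(len(scores))]

# scores = train(sess, env, actor, critic, actor_noise, buffer_size, min_batch, ep)
# smoothed = moving_average(scores)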
def train(sess, env, actor, critic, task): # Set up summary Ops summary_ops, summary_vars = build_summaries() global_step = tf.Variable(0, dtype=tf.int32) sess.run(tf.global_variables_initializer()) writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph) # load model if have saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(SUMMARY_DIR) if checkpoint and checkpoint.model_checkpoint_path: saver.restore(sess, checkpoint.model_checkpoint_path) print("Successfully loaded:", checkpoint.model_checkpoint_path) print("global step: ", global_step.eval()) else: print("Could not find old network weights") # Initialize target network weights actor.update_target_network() critic.update_target_network() count_parameters() # Initialize replay memory replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED) tic = time.time() last_epreward = 0 i = global_step.eval() while True: i += 1 if i > MAX_EPISODES: break print("Iteration: ", i) explore = EXPLORE_INIT * EXPLORE_DECAY**i explore = max(EXPLORE_MIN, explore) print("explore: ", explore) s = env.reset() ep_reward = 0 ep_ave_max_q = 0 states = np.zeros([MAX_EP_STEPS + 1, env.stateSpace]) if i % SAVE_STEP == 0: # save check point every xx episode # sess.run(global_step.assign(i)) save_path = saver.save(sess, SUMMARY_DIR + "model.ckpt", global_step=i) print("Model saved in file: %s" % save_path) for j in xrange(MAX_EP_STEPS + 1): # Added exploration noise # exp = np.random.rand(1, 4) * explore * env.actionLimit exp = np.random.rand(1, 4) * explore * env.actionLimit a = actor.predict(np.reshape(s, (1, 16))) + exp # a = [[2,2,2,2]] # a = actor.predict(np.reshape(s, (1, 16))) + (1. / (1. + i)) s2, terminal, info = env.step(a[0]) # print 's', s # print 's2', s2 # print j # print "action: ", a[0] # print "state: ", s2 states[j] = s2 r = task.reward(s2, terminal, info) # calculate reward basec on s2 replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r, \ terminal, np.reshape(s2, (actor.s_dim,))) # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > MINIBATCH_SIZE: s_batch, a_batch, r_batch, t_batch, s2_batch = \ replay_buffer.sample_batch(MINIBATCH_SIZE) # Calculate targets target_q = critic.predict_target( s2_batch, actor.predict_target(s2_batch)) y_i = [] for k in xrange(MINIBATCH_SIZE): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + GAMMA * target_q[k]) # Update the critic given the targets predicted_q_value, _ = critic.train( s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1))) ep_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient a_outs = actor.predict(s_batch) grads = critic.action_gradients(s_batch, a_outs) actor.train(s_batch, grads[0]) # Update target networks actor.update_target_network() critic.update_target_network() ep_reward += r if terminal: if i > 30: plot_states(states) print s[0:3] time_gap = time.time() - tic summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: (ep_reward / (j + 1)), summary_vars[1]: (ep_ave_max_q / float(j + 1)), }) writer.add_summary(summary_str, i) writer.flush() print '| Reward: %.2f' % (ep_reward/(j+1)), " | Episode", i, \ '| Qmax: %.4f' % (ep_ave_max_q / float(j+1)), ' | Time: %.2f' %(time_gap) tic = time.time() break s = np.copy(s2)
def train(sess, env, task, Qnet, global_step): # Set up summary Ops summary_ops, summary_vars = build_summaries() sess.run(tf.global_variables_initializer()) # load model if have saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(SUMMARY_DIR) if checkpoint and checkpoint.model_checkpoint_path: saver.restore(sess, checkpoint.model_checkpoint_path) print("Successfully loaded:", checkpoint.model_checkpoint_path) print("global step: ", global_step.eval()) else: print("Could not find old network weights") writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph) # Initialize target network weights Qnet.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED) i = global_step.eval() eval_acc_reward = 0 tic = time.time() eps = 1 while True: i += 1 eps = EPS_DECAY_RATE**i eps = max(eps, EPS_MIN) s = env.reset() # plt.imshow(s, interpolation='none') # plt.show() # s = prepro(s) ep_ave_max_q = 0 if i % SAVE_STEP == 0: # save check point every 1000 episode sess.run(global_step.assign(i)) save_path = saver.save(sess, SUMMARY_DIR + "model.ckpt", global_step=global_step) print("Model saved in file: %s" % save_path) print("Successfully saved global step: ", global_step.eval()) for j in xrange(MAX_EP_STEPS + 1): predicted_q_value = Qnet.predict( np.reshape(s, np.hstack((1, Qnet.s_dim)))) predicted_q_value = predicted_q_value[0] np.random.seed() action = np.argmax(predicted_q_value) if np.random.rand() < eps: action = np.random.randint(env.actionSpace) # print('eps') # print'actionprob:', action_prob # print(action) # print(a) s2, terminal, info = env.step(action) r = task.reward(s2, terminal, info) # calculate reward basec on s2 # print r, info # plt.imshow(s2, interpolation='none') # plt.show() # s2 = prepro(s2) # print(np.reshape(s, (actor.s_dim,)).shape) action_vector = action_ecoder(action, Qnet.a_dim) replay_buffer.add(np.reshape(s, (Qnet.s_dim)), np.reshape(action_vector, (Qnet.a_dim)), r, \ terminal, np.reshape(s2, (Qnet.s_dim))) eval_acc_reward += r if terminal: # print info # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > MINIBATCH_SIZE: s_batch, a_batch, r_batch, t_batch, s2_batch = \ replay_buffer.sample_batch(MINIBATCH_SIZE) # Calculate targets target_q = Qnet.predict_target(s2_batch) y_i = [] for k in xrange(MINIBATCH_SIZE): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + GAMMA * np.max(target_q[k])) # # Update the Qnet given the target predicted_q_value, _ = Qnet.train(s_batch, a_batch, y_i) ep_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient # Update target networks every 1000 iter # if i%TARGET_UPDATE_STEP == 0: Qnet.update_target_network() if i % EVAL_EPISODES == 0: # summary time_gap = time.time() - tic summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: eval_acc_reward, summary_vars[1]: ep_ave_max_q / float(j + 1), }) writer.add_summary(summary_str, i) writer.flush() print s[0:3] print ('| Reward: %i ' % (eval_acc_reward/float(EVAL_EPISODES)), "| Episode", i, \ '| Qmax: %.4f' % (ep_ave_max_q / float(j+1)), ' | Time: %.2f' %(time_gap), ' | Eps: %.2f' %(eps)) tic = time.time() # print(' 100 round reward: ', eval_acc_reward) eval_acc_reward = 0 break s = s2
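action_ecoder is not defined in this excerpt; it appears to one-hot encode the chosen discrete action so it can be stored in the replay buffer. A minimal sketch under that assumption (keeping the original spelling of the name):

import numpy as np

def action_ecoder(action, action_dim):
    # One-hot vector of length action_dim with a 1 at the chosen action index.
    vec = np.zeros(action_dim, dtype=np.float32)
    vec[action] = 1.0
    return vec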
def train(sess, env, actor, critic, noise, reward, discrete): # Set up summary writer summary_writer = tf.summary.FileWriter(SUMMARY_DIR) sess.run(tf.global_variables_initializer()) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED) # Initialize noise ou_level = 0. for i in range(MAX_EPISODES): s = env.reset() ep_reward = 0 ep_ave_max_q = 0 # Clear episode buffer episode_buffer = np.empty((0, 5), float) for j in range(MAX_EP_STEPS): if RENDER_ENV: env.render() a = actor.predict(np.reshape(s, (1, actor.s_dim))) # Add exploration noise if i < NOISE_MAX_EP: ou_level = noise.ornstein_uhlenbeck_level(ou_level) a = a + ou_level # Set action for discrete and continuous action spaces if discrete: action = np.argmax(a) else: action = a[0] s2, r, terminal, info = env.step(action) # Choose reward type ep_reward += r episode_buffer = np.append(episode_buffer, [[s, a, r, terminal, s2]], axis=0) # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > MINIBATCH_SIZE: s_batch, a_batch, r_batch, t_batch, s2_batch = \ replay_buffer.sample_batch(MINIBATCH_SIZE) # Calculate targets target_q = critic.predict_target( s2_batch, actor.predict_target(s2_batch)) y_i = [] for k in range(MINIBATCH_SIZE): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + GAMMA * target_q[k]) # Update the critic given the targets predicted_q_value, _ = critic.train( s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1))) ep_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient a_outs = actor.predict(s_batch) grads = critic.action_gradients(s_batch, a_outs) actor.train(s_batch, grads[0]) # Update target networks actor.update_target_network() critic.update_target_network() # Set previous state for next step s = s2 if terminal: # Reward system for episode #episode_buffer = reward.total(episode_buffer, ep_reward) episode_buffer = reward.discount(episode_buffer) # Add episode to replay buffer for step in episode_buffer: replay_buffer.add(np.reshape(step[0], (actor.s_dim,)), np.reshape(step[1], (actor.a_dim,)), step[2], \ step[3], np.reshape(step[4], (actor.s_dim,))) summary = tf.Summary() summary.value.add(tag='Reward', simple_value=float(ep_reward)) summary.value.add(tag='Qmax', simple_value=float(ep_ave_max_q / float(j))) summary_writer.add_summary(summary, i) summary_writer.flush() print('| Reward: %.2i' % int(ep_reward), " | Episode", i, \ '| Qmax: %.4f' % (ep_ave_max_q / float(j))) break
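reward.discount() above rewrites the per-step rewards of the finished episode before it is pushed into the replay buffer. A minimal sketch of that kind of discounted-return helper, applied to the reward column only (illustrative, not the original reward module):

import numpy as np

def discount_rewards(rewards, gamma=0.99):
    # Replace each reward with the discounted sum of itself and all later rewards.
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns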
class DDPG: """docstring for DDPG""" def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim) self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch,[BATCH_SIZE,self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch,next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else : y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch,[BATCH_SIZE,1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch,state_batch,action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients) self.actor_network.train(q_gradient_batch,state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self,state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action+self.exploration_noise.noise() def action(self,state): action = self.actor_network.action(state) return action def perceive(self,state,action,reward,next_state,done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state,action,reward,next_state,done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
env: gym.Env = gym.make("BreakoutDeterministic-v0")  # create raw env
env = PreprocessAtari(env)

observation_shape = env.observation_space.shape
n_actions = env.action_space.n
state_dim = observation_shape

env.reset()
obs, _, _, _ = env.step(env.action_space.sample())

agent = DQNAgent(state_dim, n_actions, epsilon=0.5)
target_network = DQNAgent(state_dim, n_actions)

exp_replay = ReplayBuffer(10)
for _ in range(30):
    exp_replay.add(env.reset(), env.action_space.sample(), 1.0, env.reset(), done=False)

target_network.load_state_dict(agent.state_dict())

# sanity checks
obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(10)

loss = compute_td_loss(obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch,
                       agent, target_network,
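
# The call above is truncated and compute_td_loss itself is not shown. Below is
# a minimal PyTorch sketch of a one-step TD loss that matches the arguments
# passed above, assuming agent and target_network are nn.Modules that map a
# batch of states to Q-values; the gamma and device keyword arguments are
# assumptions added for illustration.
import torch
import torch.nn.functional as F

def compute_td_loss(states, actions, rewards, next_states, is_done,
                    agent, target_network, gamma=0.99, device='cpu'):
    states = torch.tensor(states, device=device, dtype=torch.float32)
    actions = torch.tensor(actions, device=device, dtype=torch.int64)
    rewards = torch.tensor(rewards, device=device, dtype=torch.float32)
    next_states = torch.tensor(next_states, device=device, dtype=torch.float32)
    is_done = torch.tensor(is_done, device=device, dtype=torch.float32)

    # Q(s, a) for the actions actually taken
    predicted_qvalues = agent(states)
    predicted_qvalues_for_actions = predicted_qvalues[range(len(actions)), actions]

    # max_a' Q_target(s', a'), zeroed out at terminal states
    predicted_next_qvalues = target_network(next_states)
    next_state_values = predicted_next_qvalues.max(dim=-1)[0]
    target_qvalues_for_actions = rewards + gamma * next_state_values * (1 - is_done)

    # Huber loss between the prediction and the (detached) bootstrap target
    return F.smooth_l1_loss(predicted_qvalues_for_actions,
                            target_qvalues_for_actions.detach())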
agent1_action, agent2_action, agent3_action = get_agents_action(o_n, sess, noise_rate=0.2)  # actions of the three agents

a = [[0, i[0][0], 0, i[0][1], 0] for i in [agent1_action, agent2_action, agent3_action]]

# action of the green ball
a.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0])

o_n_next, r_n, d_n, i_n = env.step(a)

for agent_index in range(3):
    reward_100_list[agent_index].append(r_n[agent_index])
    reward_100_list[agent_index] = reward_100_list[agent_index][-1000:]

agent1_memory.add(np.vstack([o_n[0], o_n[1], o_n[2]]),
                  np.vstack([agent1_action[0], agent2_action[0], agent3_action[0]]),
                  r_n[0],
                  np.vstack([o_n_next[0], o_n_next[1], o_n_next[2]]), False)

agent2_memory.add(np.vstack([o_n[1], o_n[2], o_n[0]]),
                  np.vstack([agent2_action[0], agent3_action[0], agent1_action[0]]),
                  r_n[1],
                  np.vstack([o_n_next[1], o_n_next[2], o_n_next[0]]), False)

agent3_memory.add(np.vstack([o_n[2], o_n[0], o_n[1]]),
                  np.vstack([agent3_action[0], agent1_action[0], agent2_action[0]]),
                  r_n[2],
                  np.vstack([o_n_next[2], o_n_next[0], o_n_next[1]]), False)

if i > 50000:
    # e *= 0.9999
    # agent1 train
    train_agent(agent1_ddpg, agent1_ddpg_target, agent1_memory,
                agent1_actor_target_update, agent1_critic_target_update,
                sess, [agent2_ddpg_target, agent3_ddpg_target])
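
# Each agent's replay memory above stores the joint observation and joint
# action with its own entry first, followed by the other agents' entries in
# cyclic order. A small helper that captures that convention is sketched
# below; the name stack_own_first is an assumption and not part of the source.
def stack_own_first(per_agent, own_index):
    # e.g. stack_own_first(o_n, 1) == np.vstack([o_n[1], o_n[2], o_n[0]])
    n = len(per_agent)
    order = [(own_index + k) % n for k in range(n)]
    return np.vstack([per_agent[k] for k in order])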
class DDPG:
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize Ornstein-Uhlenbeck processes for the linear and angular
        # action components, used for exploration
        self.linear_noise = OUNoise(1, 0.5, 0.3, 0.6)
        self.angular_noise = OUNoise(1, 0, 0.6, 0.8)

    def train(self):
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state, epsilon):
        action = self.actor_network.action(state)
        noise_t = np.zeros(self.action_dim)
        noise_t[0] = epsilon * self.linear_noise.noise()
        noise_t[1] = epsilon * self.angular_noise.noise()
        action = action + noise_t
        a_linear = np.clip(action[0], 0, 1)
        a_linear = round(a_linear, 1)
        a_angular = np.clip(action[1], -1, 1)
        a_angular = round(a_angular, 1)
        # print(a_linear, a_angular)
        return [a_linear, a_angular]

    def action(self, state):
        action = self.actor_network.action(state)
        a_linear = np.clip(action[0], 0, 1)
        a_linear = round(a_linear, 1)
        a_angular = np.clip(action[1], -1, 1)
        a_angular = round(a_angular, 1)
        return [a_linear, a_angular]

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')

        # Store transitions up to the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

        if self.time_step % 10000 == 0 and self.time_step > 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        if done:
            self.linear_noise.reset()
            self.angular_noise.reset()

        return self.time_step
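
# A minimal sketch of how this two-component (linear/angular) agent might be
# driven. The environment interface, STATE_DIM, EPISODES, MAX_STEPS and the
# epsilon decay schedule are assumptions for illustration, not the source's.
agent = DDPG(env, state_dim=STATE_DIM, action_dim=2)
epsilon = 1.0
for ep in range(EPISODES):
    state = env.reset()
    for step in range(MAX_STEPS):
        action = agent.noise_action(state, epsilon)  # [linear, angular] with OU noise
        next_state, reward, done = env.step(action)
        agent.perceive(state, action, reward, next_state, done)
        state = next_state
        epsilon = max(0.05, epsilon * 0.9999)        # assumed decay schedule
        if done:
            break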
def train(sess, env, actor, critic):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in range(MAX_EPISODES):

        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(MAX_EP_STEPS):

            if RENDER_ENV:
                env.render()

            # Add exploration noise that decays with the episode index
            a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)),
                              r, terminal,
                              np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j)
                })
                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))

                break
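
# Both this loop and the one that follows assume a build_summaries() helper
# returning (summary_ops, summary_vars), where index 0 is fed the episode
# reward and index 1 the per-episode average max Q. A minimal sketch
# consistent with that usage (the summary tag names are assumptions):
def build_summaries():
    episode_reward = tf.Variable(0.)
    tf.summary.scalar("Reward", episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar("Qmax Value", episode_ave_max_q)

    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars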
def train(sess, env, args, actor, critic, actor_noise):
    # Set up summary operations
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    # Needed to enable BatchNorm.
    # This hurts the performance on Pendulum but could be useful
    # in other environments.
    # tflearn.is_training(True)

    for i in range(int(args['max_episodes'])):

        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):

            if args['render_env']:
                env.render()

            # Add exploration noise; the OU noise is eventually dampened out
            noise = actor_noise()
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + noise

            # Next step of the simulation
            s2, r, terminal, info = env.step(a[0])

            # Add the latest state, action, reward, terminal flag, and new state
            # to the replay memory
            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)),
                              r, terminal,
                              np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until there are at least
            # minibatch-size samples, then train on sampled batches
            if replay_buffer.size() > int(args['minibatch_size']):
                # Obtain a batch of data from the replay buffer
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                    int(args['minibatch_size']))

                # Calculate the critic target Q-value, feeding the actor target's
                # action for the s2 states from the replay buffer
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                # Calculate the Q targets
                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        # Terminal state: Q = r, since there is no trajectory beyond this point
                        y_i.append(r_batch[k])
                    else:
                        # Non-terminal state: Q = r + gamma * max_a Q'(s', a)
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets.
                # critic.train() returns (predicted_q_value, optimize); the optimize op
                # takes the MSE between y_i and the predicted Q-value and applies an
                # Adam update to the critic network.
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))

                # predicted_q_value has one entry per minibatch sample; accumulate the max
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient.
                # Scaled output actions for the s_batch states:
                a_outs = actor.predict(s_batch)
                # Gradient of the critic's Q-value with respect to those actions
                grads = critic.action_gradients(s_batch, a_outs)
                # Update the actor given the gradients
                actor.train(s_batch, grads[0])

                # Update target networks by tau
                actor.update_target_network()
                critic.update_target_network()

            # The new state becomes the current state
            s = s2
            # Add the step's reward to the episode reward
            ep_reward += r

            if terminal:
                # Update the summary ops
                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j)
                })
                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward), i, (ep_ave_max_q / float(j))))

                break
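
# The actor_noise argument above is a callable returning one noise sample per
# call. A common choice is a time-correlated Ornstein-Uhlenbeck process; the
# sketch below is one plausible implementation (constructor defaults are
# assumptions), typically constructed as
# actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(actor.a_dim)).
class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma=0.3, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # Mean-reverting step with Gaussian perturbation
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
            self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)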
    state_prime, reward, terminal = env.step(action_index)
else:
    moves = env.valid_moves()
    q = agent.q([state], [moves])[0]
    action = nonzero_max(q)
    action_index = (action // dim, action % dim)
    if episode < 200 and episode % 2 == 0:
        action_index = env.randomMove()
        action = action_index[1] + (action_index[0] * dim)
    state_prime, reward, terminal = env.step(action_index)

moves_prime = env.valid_moves()
q = agent.target_q([state_prime], [moves_prime])[0]
loss = agent.get_loss(state=[state], moves=[moves], action=[action],
                      reward=[reward], q_best=[q], terminal=[terminal])
Replay.add(state, moves, action, reward, state_prime, moves_prime,
           terminal, e=loss[0])

if y_data[-1] < 99:
    batch_state, batch_moves, batch_action, batch_reward, \
        batch_state_prime, batch_moves_prime, batch_terminal, idx = \
        Replay.sample_batch(BATCH_SIZE)
    batch_q = agent.target_q(batch_state_prime, batch_moves_prime)
    agent.train(state=batch_state, moves=batch_moves, action=batch_action,
                reward=batch_reward, q_best=batch_q, terminal=batch_terminal)
    agent.update_target_network()

state = state_prime

if (episode + 1) % 100 == 0:
    agent.get_saver().save(sess, 'q-model/model', global_step=(episode + 1))

x_data.append(episode)
y_data.append(reward + random.randint(-1, 1))
if reward == 100:
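
# The fragment above calls nonzero_max(q) to pick an action from the Q-values
# of the valid moves. The helper is not shown; the sketch below assumes that
# invalid moves have been masked to zero and picks the argmax among the
# remaining entries (the fallback behaviour is an assumption).
def nonzero_max(q):
    q = np.asarray(q)
    valid = np.flatnonzero(q)
    if valid.size == 0:
        return int(np.argmax(q))
    return int(valid[np.argmax(q[valid])])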
class Agent:
    def __init__(self, actions, optimizer, convs, fcs, padding, lstm,
                 gamma=0.99, lstm_unit=256, time_horizon=5,
                 policy_factor=1.0, value_factor=0.5, entropy_factor=0.01,
                 grad_clip=40.0, state_shape=[84, 84, 1], buffer_size=2e3,
                 rp_frame=3, phi=lambda s: s, name='global'):
        self.actions = actions
        self.gamma = gamma
        self.name = name
        self.time_horizon = time_horizon
        self.state_shape = state_shape
        self.rp_frame = rp_frame
        self.phi = phi

        self._act, self._train, self._update_local = build_graph.build_train(
            convs=convs,
            fcs=fcs,
            padding=padding,
            lstm=lstm,
            num_actions=len(actions),
            optimizer=optimizer,
            lstm_unit=lstm_unit,
            state_shape=state_shape,
            grad_clip=grad_clip,
            policy_factor=policy_factor,
            value_factor=value_factor,
            entropy_factor=entropy_factor,
            rp_frame=rp_frame,
            scope=name
        )

        # rnn state variables
        self.initial_state = np.zeros((1, lstm_unit), np.float32)
        self.rnn_state0 = self.initial_state
        self.rnn_state1 = self.initial_state

        # last state variables
        self.zero_state = np.zeros(state_shape, dtype=np.float32)
        self.initial_last_obs = [self.zero_state for _ in range(rp_frame)]
        self.last_obs = deque(self.initial_last_obs, maxlen=rp_frame)
        self.last_action = deque([0, 0], maxlen=2)
        self.value_tm1 = None
        self.reward_tm1 = 0.0

        # buffers
        self.rollout = Rollout()
        self.buffer = ReplayBuffer(capacity=buffer_size)

        self.t = 0
        self.t_in_episode = 0

    def train(self, bootstrap_value):
        # prepare A3C update
        obs_t = np.array(self.rollout.obs_t, dtype=np.float32)
        actions_t = np.array(self.rollout.actions_t, dtype=np.uint8)
        actions_tm1 = np.array(self.rollout.actions_tm1, dtype=np.uint8)
        rewards_tp1 = self.rollout.rewards_tp1
        rewards_t = self.rollout.rewards_t
        values_t = self.rollout.values_t
        state_t0 = self.rollout.states_t[0][0]
        state_t1 = self.rollout.states_t[0][1]

        # compute returns
        R = bootstrap_value
        returns_t = []
        for reward in reversed(rewards_tp1):
            R = reward + self.gamma * R
            returns_t.append(R)
        returns_t = np.array(list(reversed(returns_t)))
        adv_t = returns_t - values_t

        # prepare reward prediction update
        rp_obs, rp_reward_tp1 = self.buffer.sample_rp()

        # prepare value function replay update
        vr_obs_t, vr_actions_tm1, vr_rewards_t, is_terminal = \
            self.buffer.sample_vr(self.time_horizon)
        _, vr_values_t, _ = self._act(vr_obs_t, vr_actions_tm1, vr_rewards_t,
                                      self.initial_state, self.initial_state)
        vr_values_t = np.reshape(vr_values_t, [-1])
        if is_terminal:
            vr_bootstrap_value = 0.0
        else:
            vr_bootstrap_value = vr_values_t[-1]

        # compute returns for value function replay
        R = vr_bootstrap_value
        vr_returns_t = []
        for reward in reversed(vr_rewards_t[:-1]):
            R = reward + self.gamma * R
            vr_returns_t.append(R)
        vr_returns_t = np.array(list(reversed(vr_returns_t)))

        # update
        loss = self._train(
            obs_t=obs_t,
            rnn_state0=state_t0,
            rnn_state1=state_t1,
            actions_t=actions_t,
            rewards_t=rewards_t,
            actions_tm1=actions_tm1,
            returns_t=returns_t,
            advantages_t=adv_t,
            rp_obs=rp_obs,
            rp_reward_tp1=rp_reward_tp1,
            vr_obs_t=vr_obs_t[:-1],
            vr_actions_tm1=vr_actions_tm1[:-1],
            vr_rewards_t=vr_rewards_t[:-1],
            vr_returns_t=vr_returns_t
        )
        self._update_local()
        return loss

    def act(self, obs_t, reward_t, training=True):
        # change state shape to WHC
        obs_t = self.phi(obs_t)
        # last transitions
        action_tm2, action_tm1 = self.last_action
        obs_tm1 = self.last_obs[-1]
        # take next action
        prob, value, rnn_state = self._act(
            obs_t=[obs_t],
            actions_tm1=[action_tm1],
            rewards_t=[reward_t],
            rnn_state0=self.rnn_state0,
            rnn_state1=self.rnn_state1
        )
        action_t = np.random.choice(range(len(self.actions)), p=prob[0])

        if training:
            if len(self.rollout.obs_t) == self.time_horizon:
                self.train(self.value_tm1)
                self.rollout.flush()

            if self.t_in_episode > 0:
                # add transition to buffer for A3C update
                self.rollout.add(
                    obs_t=obs_tm1,
                    reward_tp1=reward_t,
                    reward_t=self.reward_tm1,
                    action_t=action_tm1,
                    action_tm1=action_tm2,
                    value_t=self.value_tm1,
                    terminal_tp1=False,
                    state_t=[self.rnn_state0, self.rnn_state1]
                )
                # add transition to buffer for auxiliary update
                self.buffer.add(
                    obs_t=list(self.last_obs),
                    action_tm1=action_tm2,
                    reward_t=self.reward_tm1,
                    action_t=action_tm1,
                    reward_tp1=reward_t,
                    obs_tp1=obs_t,
                    terminal=False
                )

        self.t += 1
        self.t_in_episode += 1
        self.rnn_state0, self.rnn_state1 = rnn_state
        self.last_obs.append(obs_t)
        self.last_action.append(action_t)
        self.value_tm1 = value[0][0]
        self.reward_tm1 = reward_t
        return self.actions[action_t]

    def stop_episode(self, obs_t, reward_t, training=True):
        # change state shape to WHC
        obs_t = self.phi(obs_t)
        # last transitions
        action_tm2, action_tm1 = self.last_action
        obs_tm1 = self.last_obs[-1]

        if training:
            # add transition for A3C update
            self.rollout.add(
                obs_t=obs_tm1,
                action_t=action_tm1,
                reward_t=self.reward_tm1,
                reward_tp1=reward_t,
                action_tm1=action_tm2,
                value_t=self.value_tm1,
                state_t=[self.rnn_state0, self.rnn_state1],
                terminal_tp1=True
            )
            # add transition for auxiliary update
            self.buffer.add(
                obs_t=list(self.last_obs),
                action_tm1=action_tm2,
                reward_t=self.reward_tm1,
                action_t=action_tm1,
                reward_tp1=reward_t,
                obs_tp1=obs_t,
                terminal=True
            )
            self.train(0.0)
            self.rollout.flush()

        # reset episode-local state
        self.rnn_state0 = self.initial_state
        self.rnn_state1 = self.initial_state
        self.last_obs = deque(self.initial_last_obs, maxlen=self.rp_frame)
        self.last_action = deque([0, 0], maxlen=2)
        self.value_tm1 = None
        self.reward_tm1 = 0.0
        self.t_in_episode = 0
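
# A minimal sketch of how the Agent above might be driven for one episode.
# The env, the optimizer choice, and the convs/fcs values are assumptions for
# illustration; build_graph.build_train is whatever the source provides.
agent = Agent(actions=list(range(env.action_space.n)),
              optimizer=tf.train.RMSPropOptimizer(7e-4),
              convs=[(16, 8, 4), (32, 4, 2)], fcs=[256],
              padding='VALID', lstm=True, name='worker0')

obs = env.reset()
reward = 0.0
done = False
while not done:
    action = agent.act(obs, reward, training=True)   # trains every time_horizon steps
    obs, reward, done, _ = env.step(action)
agent.stop_episode(obs, reward, training=True)       # flushes the rollout, trains with R = 0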