def main():
    state_size = 17
    action_size = 4
    buffer_size = 1024
    batch_size = 32
    num_steps = 4096
    num_samples = 1024
    num_repeat = 10

    gym_memory = GymReplayBuffer(buffer_size)
    memory = ReplayBuffer(state_size, action_size, buffer_size, batch_size, 0)

    # Make some convenient aliases.
    n = num_steps
    ns = state_size
    na = action_size

    # Generate random experiences ...
    states = np.zeros((n, ns), dtype=np.float32)
    actions = np.random.randint(0, na, n)
    rewards = np.random.uniform(0, 1, n)
    next_states = np.zeros((n, ns), dtype=np.float32)
    dones = np.random.randint(2, size=n, dtype=bool)

    ts = []
    ts.append(time.time())

    print('Memory')
    for _ in range(num_repeat):
        for s0, a, r, s1, d in zip(states, actions, rewards, next_states, dones):
            memory.add(s0, a, r, s1, d)
    ts.append(time.time())
    for _ in range(num_repeat):
        for _ in range(num_samples):
            sample = memory.sample()
    ts.append(time.time())

    print('Gym-Memory')
    for _ in range(num_repeat):
        for s0, a, r, s1, d in zip(states, actions, rewards, next_states, dones):
            gym_memory.add(s0, a, r, s1, d)
    ts.append(time.time())
    for _ in range(num_repeat):
        for _ in range(num_samples):
            sample = gym_memory.sample(batch_size)
    ts.append(time.time())

    print('Result')
    print(np.diff(ts))
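# --- Illustrative sketch (not part of the original sources) ---
# The benchmark above times two replay-buffer implementations through the
# add()/sample() calls shown. The class below is only a plausible,
# preallocated-NumPy-array version of the `memory` interface being timed; its
# constructor arguments are inferred from the call in main(), not taken from
# the real ReplayBuffer class.
import numpy as np


class ArrayReplayBuffer:
    """Ring buffer backed by preallocated NumPy arrays."""

    def __init__(self, state_size, buffer_size, batch_size, seed=0):
        self.rng = np.random.RandomState(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.states = np.zeros((buffer_size, state_size), dtype=np.float32)
        self.actions = np.zeros(buffer_size, dtype=np.int64)
        self.rewards = np.zeros(buffer_size, dtype=np.float32)
        self.next_states = np.zeros((buffer_size, state_size), dtype=np.float32)
        self.dones = np.zeros(buffer_size, dtype=bool)
        self.pos = 0
        self.full = False

    def add(self, state, action, reward, next_state, done):
        i = self.pos
        self.states[i], self.actions[i], self.rewards[i] = state, action, reward
        self.next_states[i], self.dones[i] = next_state, done
        self.pos = (self.pos + 1) % self.buffer_size
        self.full = self.full or self.pos == 0

    def sample(self):
        # Sample uniformly from the filled part of the buffer.
        high = self.buffer_size if self.full else self.pos
        idx = self.rng.randint(0, high, size=self.batch_size)
        return (self.states[idx], self.actions[idx], self.rewards[idx],
                self.next_states[idx], self.dones[idx])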
class DeepqLearner: def __init__(self, env, q_func, config=DEEPQ_CONFIG, callback=None): self.env = env self.q_func = q_func self.config = config self.callback = callback # Create all the functions necessary to train the model gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=config["gpu_memory_fraction"]) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph def make_obs_ph(name): return ObservationInput(env.observation_space, name=name) act, self.train, self.update_target, self.debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=config["lr"]), gamma=config["gamma"], grad_norm_clipping=10, param_noise=config["param_noise"]) act_params = { # 'make_obs_ph': make_obs_ph, # 'q_func': q_func, 'num_actions': env.action_space.n, } self.act = ActWrapper(act, act_params) # Create the replay buffer self.config = config self.replay_buffer = None self.beta_schedule = None self.make_replay_buffer() # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(config["exploration_fraction"] * config["max_timesteps"]), initial_p=1.0, final_p=config["exploration_final_eps"]) # Initialize the parameters and copy them to the target network. U.initialize() self.update_target() self.t = 0 self.episode_rewards = [0.0] self.num_episodes = 1 self.saved_mean_reward = None self.saved_episode_num = None self.episode_frames = 0 self.model_file = None self.start_time = 0 self.episode_start_time = 0 def make_replay_buffer(self): if self.config["prioritized_replay"]: self.replay_buffer = PrioritizedReplayBuffer( self.config["buffer_size"], alpha=self.config["prioritized_replay_alpha"]) if self.config["prioritized_replay_beta_iters"] is None: self.config["prioritized_replay_beta_iters"] = self.config[ "max_timesteps"] self.beta_schedule = LinearSchedule( self.config["prioritized_replay_beta_iters"], initial_p=self.config["prioritized_replay_beta0"], final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.config["buffer_size"]) self.beta_schedule = None def run(self): reset = True obs = self.env.reset() self.start_time = time.time() self.episode_start_time = time.time() with tempfile.TemporaryDirectory() as td: td = self.config["checkpoint_path"] or td self.model_file = os.path.join(td, "model") if tf.train.latest_checkpoint(td) is not None: load_state(self.model_file) logger.log('Loaded model from {}'.format(self.model_file)) for self.t in range(self.config["max_timesteps"]): if self.callback is not None: if self.callback(locals(), globals()): break # Determine next action to take, then take that action and observe results action = self._action(obs, reset) env_action = action new_obs, rew, done, _ = self.env.step(env_action) self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs # Increment typical values reset = False self.episode_frames += 1 self.episode_rewards[-1] += rew # See if done with episode if done: obs = self._reset() reset = True # Do training and deepq updating as needed if self.t > self.config["learning_starts"]: if self.t % self.config["train_freq"] == 0: self._train() if self.t % self.config["target_network_update_freq"] == 0: self.update_target() def _action(self, obs, reset): # Take action and update exploration to the newest value kwargs = {} if not 
self.config["param_noise"]: update_eps = self.exploration.value(self.t) # update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. - self.exploration.value(self.t) + self.exploration.value(self.t) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True return self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] def _train(self): try: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.config["prioritized_replay"]: experience = self.replay_buffer.sample( self.config["batch_size"], beta=self.beta_schedule.value(self.t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.config["batch_size"]) weights, batch_idxes = np.ones_like(rewards), None # Determine errors td_errors = self.train(obses_t, actions, rewards, obses_tp1, dones, weights) if self.config["prioritized_replay"]: new_priorities = np.abs( td_errors) + self.config["prioritized_replay_eps"] self.replay_buffer.update_priorities(batch_idxes, new_priorities) except Exception as e: self.make_replay_buffer() print(e) def _reset(self): self.attempt_print() self.attempt_checkpoint() self.episode_rewards.append(0.0) self.num_episodes += 1 self.episode_frames = 0 self.episode_start_time = time.time() return self.env.reset() def calc_mean_100ep_reward(self): if self.num_episodes <= 1: return None return round(np.mean(self.episode_rewards[-101:-1]), 1) def attempt_print(self): p_freq = self.config["print_freq"] if p_freq is not None and self.num_episodes % p_freq == 0: logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(self.t))) logger.record_tabular("reward - current", self.episode_rewards[-1]) logger.record_tabular("reward - mean", self.calc_mean_100ep_reward()) logger.record_tabular("reward - saved", self.saved_mean_reward) logger.record_tabular("episode # - current", self.num_episodes) logger.record_tabular("episode # - saved", self.saved_episode_num) logger.record_tabular("steps - total", self.t) logger.record_tabular("steps - episode", self.episode_frames) logger.record_tabular( "time - ep duration", str(time.time() - self.episode_start_time) + "s") logger.record_tabular("time - remaining", self.estimate_time_remaining()) logger.dump_tabular() def estimate_time_remaining(self): duration = time.time() - self.start_time if duration <= 0: return "Unknown" time_remaining = self.t / duration * (self.config["max_timesteps"] - self.t) / 60.0 suffix = "" # Format based on time if time_remaining < MINUTE: suffix = " seconds" elif time_remaining < HOUR: suffix = " minutes" time_remaining = time_remaining / MINUTE elif time_remaining < DAY: suffix = " hours" time_remaining = time_remaining / HOUR else: suffix = " days" time_remaining = time_remaining / DAY # Round remaining time and return time_remaining = round(time_remaining * 100.0) / 100.0 return str(time_remaining) + suffix def attempt_checkpoint(self): # Determine if we're going to checkpoint c_freq = self.config["checkpoint_freq"] if c_freq is not None \ and 
self.num_episodes > 100 \ and self.t > self.config["learning_starts"] \ and self.num_episodes % c_freq == 0: # Determine if reward is growing mean_100ep_reward = self.calc_mean_100ep_reward() if self.saved_mean_reward is None or mean_100ep_reward > self.saved_mean_reward: if self.config["print_freq"] is not None: logger.log( "Saving model due to mean reward increase: {} -> {}". format(self.saved_mean_reward, mean_100ep_reward)) self.saved_mean_reward = mean_100ep_reward self.saved_episode_num = self.num_episodes save_state(self.model_file) def save(self, save_path): print("Saving model to " + save_path) self.act.save(save_path)
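# --- Illustrative sketch (not part of the original sources) ---
# DeepqLearner._action() converts the current epsilon into a parameter-noise
# KL threshold when param_noise is enabled. The numbers below are made up and
# only demonstrate the same expression in isolation.
import numpy as np

eps = 0.1          # hypothetical value of self.exploration.value(self.t)
num_actions = 4    # hypothetical env.action_space.n

# Threshold chosen so that the perturbed-vs-unperturbed policy divergence is
# comparable to eps-greedy exploration with this eps
# (Plappert et al., 2017, Appendix C.1), as in _action() above.
update_param_noise_threshold = -np.log(1.0 - eps + eps / float(num_actions))
print(round(update_param_noise_threshold, 4))  # 0.078 for these toy numbers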
def main(_): print("Used flags:", FLAGS) config = configparser.ConfigParser() config.read(FLAGS.config_file) timer = time.time() ps_hosts = FLAGS.ps_hosts.split(",") if FLAGS.ps_hosts else config.get(FLAGS.config, 'ps_hosts').split(",") worker_hosts = FLAGS.worker_hosts.split(",") if FLAGS.worker_hosts else config.get(FLAGS.config, 'worker_hosts').split(",") job = FLAGS.job_name task = FLAGS.task_index learning_rate = config.getfloat(FLAGS.config, 'learning_rate') batch_size = config.getint(FLAGS.config, 'batch_size') memory_size = config.getint(FLAGS.config, 'memory_size') target_update = config.getint(FLAGS.config, 'target_update') seed = FLAGS.seed if FLAGS.seed else config.getint(FLAGS.config, 'seed') max_comm_rounds = config.getint(FLAGS.config, 'comm_rounds') epochs = config.getint(FLAGS.config, 'start_epoch') end_epoch = config.getint(FLAGS.config, 'end_epoch') epoch_decay = config.getint(FLAGS.config, 'epoch_decay') # epoch_decay_rate = (epochs - end_epoch) / epoch_decay epoch = LinearSchedule(epoch_decay, end_epoch, epochs) backup = config.getint(FLAGS.config, 'backup') # unused in async sync = config.getboolean(FLAGS.config, 'sync') gradient_prio = False if not sync else config.getboolean(FLAGS.config, 'gradient_prio') sync_workers = len(worker_hosts)-backup mute = FLAGS.mute if FLAGS.mute else config.getboolean(FLAGS.config, 'mute') animate = 0 draw = 0 print("Config:\nps_hosts={}\nworker_hosts={}\njob_name={}\ntask_index={}\nlearning_rate={}\n" "batch_size={}\nmemory_size={}\ntarget_update={}\nseed={}\ncomm_rounds={}\nepochs={}\n" "end_epoch={}\nepoch_decay={}\nnbackup={}\nsync={}" .format(ps_hosts, worker_hosts, job, task, learning_rate, batch_size, memory_size, target_update, seed, max_comm_rounds, epochs, end_epoch, epoch_decay, backup, sync)) cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts}) chief = True if job == 'worker' and task == 0 else False print("/job:", job, "/task:", task, " - Chief: ", chief, sep='') # Create server server = tf.train.Server(cluster, job_name=job, task_index=task) run_code = "{}-{}-p-{}-w-{}-E-{}-b-{}-m-{}-N-{}-lr-{}-B-{}-s-{}-".\ format(datetime.now().strftime("%y%m%d-%H%M%S"), env_name, len(ps_hosts), len(worker_hosts), epochs, batch_size, memory_size, target_update, learning_rate, backup, seed) run_code += "-sync" if sync else "-async" # Set a unique random seed for each client seed = ((seed * 10) + task) random.seed(seed) if not mute: print("Run code:", run_code) # Start parameter servers if job == 'ps': server.join() # Start training with U.make_session(num_cpu=4, target=server.target) as sess: # Create the environment env = gym.make(env_name) env.seed(seed) tf.set_random_seed(seed) # Create all the functions necessary to train the model act, train, global_opt, update_target, update_weights, sync_opt, debug = deepq.build_train( make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name), q_func=model, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate), # optimizer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate), chief=chief, server=server, workers=sync_workers ) # Create the replay buffer replay_buffer = ReplayBuffer(memory_size) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). 
exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) if not chief: if not mute: print("Worker {}/{} will sleep (3s) for chief to initialize variables".format(task+1, len(worker_hosts))) time.sleep(4) # Initialize the parameters and copy them to the target network. U.initialize(chief=chief) if chief: sess.run(debug['run_code'].assign(run_code)) if not mute: print("Set global run code to:", run_code) if not mute: print("initialized variables, sleeping for 1 sec") time.sleep(2) if not chief: while not sess.run(tf.is_variable_initialized(debug['run_code'])): if not mute: print("Global run code not yet initialized") time.sleep(2) run_code = str(sess.run(debug['run_code']).decode()) if run_code == '': if not mute: print("Run code empty. Trying to fetch again...") time.sleep(5) if not mute: print("Read global run code:", run_code) run_code += "(w" + str(task) + ")" print("Final run_code:", run_code) t_global_old = update_weights()[0][0] update_target() exp_gen = 1000 # For how many timesteps sould we only generate experience (not train) t_start = exp_gen comm_rounds = 0 comm_rounds_global = 0 dt = 0 write_csv(run_code, log=["episode", "reward" + str(task), "avg_reward" + str(task), "t_global", "cr"]) episode_rewards = [0.0] cr_reward = 0 obs = env.reset() for t in itertools.count(): # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew cr_reward += rew # Animate every <animate> episodes if not mute and chief and animate > 0 and (len(episode_rewards) % animate) == 0: if done: print("ep", len(episode_rewards), "ended with reward:", episode_rewards[-1]) env.render() if done: if not mute and chief and draw > 0 and len(episode_rewards) % draw == 0: env.render() avg_rew = np.round(np.mean(np.array(episode_rewards[-100:])), 1) write_csv(run_code, [len(episode_rewards), episode_rewards[-1], avg_rew, debug['t_global']()[0], comm_rounds_global]) obs = env.reset() episode_rewards.append(0) [converged] = sync_opt['check_converged']() is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= max_reward or converged if is_solved or comm_rounds >= max_comm_rounds: sync_opt['set_converged']([True]) if not mute: print("Converged was set to", sync_opt['check_converged']()[0]) write_csv_final(run_code, str(len(episode_rewards)), worker_hosts, chief, comm_rounds_global, mute) print("Converged after: ", len(episode_rewards), "episodes") print("Agent total steps:", t) print("Global steps: ", debug['t_global']()[0]) sec = round(time.time() - timer) print("Total time:", sec // 3600, "h", (sec % 3600) // 60, "min", sec % 60, "s") return else: if t >= exp_gen: # if t >= batch_size: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) td_error = train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) if t - t_start >= np.round(epoch.value(comm_rounds)): cr_old = comm_rounds_global # Apply gradients to weights in PS if sync: # Tell the ps we are done and want to submit score [[comm_rounds_global], [worker_count]] = sync_opt['request_submit']() if comm_rounds_global == comm_rounds: if worker_count <= sync_workers: # If allowed to submit score, do it [comm_rounds_global] = sync_opt['submit_score']([cr_reward]) if chief: [submits] = sync_opt['set_submit']([0]) while worker_count != sync_workers: if 
sync_opt['check_converged']()[0]: if not mute: print("Other worker converged! Finishing in check_wc") break worker_count = sync_opt['check_wc']()[0] while sync_opt['check_submit']()[0] == -1: if sync_opt['check_converged']()[0]: if not mute: print("Other worker converged! Finishing in check_submit") break pass if sync_opt['check_converged']()[0]: if not mute: print("Other worker converged! Continuing before submit") continue # Now all eligible workers have sent their score and gradient round has started # Submit gradient # TODO 4th argument overrides everything else unles it is set to -1 in the code [[dt], [comm_rounds_global], [factor]] = global_opt([t - t_start], [t_global_old], [cr_reward], [1/len(worker_hosts)], [True]) submits = sync_opt['inc_submit']() if chief: while not sync_opt['check_submit']()[0] == sync_workers: if sync_opt['check_converged']()[0]: if not mute: print("Other worker converged! Finishing in check_submit (chief)") break pass # print("Round", comm_rounds, "finished") [w] = sync_opt['reset_wc']()[0] # print("Worker count reset to:", w) sync_opt['reset_score']() submits = sync_opt['set_submit']([-1]) # print("Submit round finished. Submits set to:", submits[0]) [r] = sync_opt['inc_comm_round']()[0] # print("New round started:", r) # Normal workers wait until GCR > CR if not chief: while sync_opt['check_round']()[0] <= comm_rounds: if sync_opt['check_converged']()[0]: if not mute: print("Other worker converged! Finishing in check_round") break # print("Worker submitted, waiting for next round:", comm_rounds + 1) # time.sleep(0.1) pass else: #elif worker_count > sync_workers: # If not allowed to submit score, wait for next round to start if not mute: print("Worker finished too late but before new round started (", comm_rounds_global, ")") print("WC(", worker_count, ") > N(", sync_workers, ")", sep="") target = np.floor(comm_rounds_global + 1) # +1 if x.0, +0.5 if x.5 while not sync_opt['check_round']()[0] >= target: pass elif comm_rounds_global > comm_rounds: # This means the worker is behind. Do nothing and start next round if not mute: print("Communication round ", comm_rounds, "missed. Actual round:", comm_rounds_global) # TODO How to handle round count when skipping rounds? comm_rounds = comm_rounds_global - 1 elif comm_rounds_global < comm_rounds: print("WARNING! Worker ahead of global:", comm_rounds, ">", comm_rounds_global) time.sleep(5) else: sync_opt['inc_comm_round']() [[dt], [comm_rounds_global], [factor]] = global_opt([t - t_start], [t_global_old], [0], [-1], [False]) # Update the local weights with the new global weights from PS t_global_old = update_weights()[0][0] comm_rounds += 1 # print("Round finished. 
Increasing local comm_round to:", comm_rounds) cr_reward = 0 # TODO RE-ENABLE comm-rounds LOGGING # write_csv(run_code, [comm_rounds, t, dt, epoch.value(comm_rounds)], comm_rounds=True) t_start = t if t % target_update == 0: update_target() if not mute and done and len(episode_rewards) % 10 == 0: last_rewards = episode_rewards[-101:-1] logger.record_tabular("steps", t) logger.record_tabular("global steps", debug['t_global']()[0]) logger.record_tabular("communication rounds", comm_rounds) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular("mean episode reward", np.round(np.mean(episode_rewards[-101:-1]), 4)) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) # logger.record_tabular("last gradient factor", np.round(factor, 4)) logger.dump_tabular() rew_ill = ['●' if x >= max_reward else str(int(np.floor(x / (max_reward/10)))) if x >= (max_reward/10) else '_' for x in last_rewards] streak = 0 for i in reversed(rew_ill): if i == "●": streak += 1 else: break #print("[" + ''.join(rew_ill) + "] ([● " + str(rew_ill.count('●')) + " | " + str(rew_ill.count('9')) + " | " + str(rew_ill.count('8')) + " | " + str(rew_ill.count('7')) + " | " + str(rew_ill.count('6')) + " | " + str(rew_ill.count('5')) + " | " + str(rew_ill.count('4')) + " | " + str(rew_ill.count('3')) + " | " + str(rew_ill.count('2')) + " | " + str(rew_ill.count('1')) + " | " + str(rew_ill.count('_')) + " _]/" + str(len(rew_ill)) + " {S:" + str(streak) + "})", sep='')
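# --- Illustrative sketch (not part of the original sources) ---
# Both the exploration rate and the per-round `epoch` length above come from
# LinearSchedule. The minimal class below mirrors the (schedule_timesteps,
# final_p, initial_p) interface used in main(); it is a reference sketch, not
# the library implementation.
class LinearScheduleSketch:
    """Linear interpolation from initial_p to final_p over schedule_timesteps."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)


# Example: the epsilon schedule used above, 1.0 -> 0.02 over 10000 timesteps.
eps_schedule = LinearScheduleSketch(10000, final_p=0.02, initial_p=1.0)
assert abs(eps_schedule.value(0) - 1.0) < 1e-9
assert abs(eps_schedule.value(10000) - 0.02) < 1e-9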
    replay_buffer.add(obs, action, rew, new_obs, float(done))
    obs = new_obs

    env.render()
    episode_rewards[-1] += rew
    if done:
        obs = env.reset()
        episode_rewards.append(0)

    is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
    if is_solved:
        # Show off the result
        print("Total Number of Episodes: ", len(episode_rewards))
        print("t final value: ", t)
        break
    else:
        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
        if t > 1000:
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
            train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
        # Update target network periodically.
        if t % 1000 == 0:
            update_target()

    if done and len(episode_rewards) % 10 == 0:
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", len(episode_rewards))
        logger.record_tabular("episode reward", episode_rewards[-2])
        logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1))
        logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
        logger.dump_tabular()
        IsPlot = True
    else:
        IsPlot = False

    if (sample_time % train_freq == 0):
        states = np.vstack(states)
        actions_idx = np.vstack(actions_idx)
        actions = np.array(actions)

        rewards_tmp = rewards.copy()
        last_value = expected_sarsa(model, last_state, K, C, action_low, action_high,
                                    False, random_choose, num=100)
        rewards_tmp.append(last_value)
        Q_target = discount_with_dones(rewards_tmp, dones + [last_done], gamma)
        Q_target = np.float32(np.vstack(Q_target))[:-1]

        R_buffer_sample = replay_buffer.sample(np.min([minibatch, timestep]))
        next_states_sampled = np.squeeze(R_buffer_sample[3], axis=1)
        dones_sampled = R_buffer_sample[4]
        reward_sampled = R_buffer_sample[2]
        last_v = [expected_sarsa(model, np.reshape(state_tmp, (1, -1)), K, C, action_low,
                                 action_high, True, random_choose, num=100)
                  for state_tmp in next_states_sampled]
        last_v = np.vstack(last_v)
        Q_target_hist = reward_sampled + last_v * (1 - dones_sampled) * gamma

        states_sampled1 = np.squeeze(R_buffer_sample[0], axis=1)
        states_sampled2 = states
        states_sampled = np.concatenate((states_sampled1, states_sampled2), axis=0)
        actions_sampled1 = R_buffer_sample[1]
        actions_sampled2 = actions
        actions_sampled = np.concatenate((actions_sampled1, actions_sampled2), axis=0)
        target = np.reshape(np.concatenate((Q_target_hist, Q_target), axis=0), (-1))
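# --- Illustrative sketch (not part of the original sources) ---
# The block above appends a bootstrap value to `rewards_tmp` and passes the
# result to discount_with_dones. The helper below is a sketch consistent with
# that usage: discounted returns computed back-to-front, reset at episode
# boundaries.
def discount_with_dones_sketch(rewards, dones, gamma):
    """n-step discounted returns; the running sum resets whenever done == 1."""
    discounted, running = [], 0.0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        running = reward + gamma * running * (1.0 - done)
        discounted.append(running)
    return discounted[::-1]


# Toy check: the appended bootstrap value ends up as the last entry, which the
# caller above then drops again via [:-1].
print(discount_with_dones_sketch([1.0, 1.0, 5.0], [0, 0, 0], 0.9))
# -> [5.95, 5.5, 5.0]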
def train_policy(arglist): with U.single_threaded_session(): # Create the environment if arglist.use_dense_rewards: print("Will use env MineRLNavigateDense-v0") env = gym.make("MineRLNavigateDense-v0") env_name = "MineRLNavigateDense-v0" else: print("Will use env MineRLNavigate-v0") env = gym.make('MineRLNavigate-v0') env_name = "MineRLNavigate-v0" if arglist.force_forward: env = MineCraftWrapperSimplified(env) else: env = MineCraftWrapper(env) if not arglist.use_demonstrations: # Use stack of last 4 frames as obs env = FrameStack(env, 4) # Create all the functions necessary to train the model act, train, update_target, debug = deepq.build_train( make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name), q_func=build_q_func('conv_only', dueling=True), num_actions=env.action_space.n, gamma=0.9, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), ) # Create the replay buffer(s) (TODO: Use prioritized replay buffer) if arglist.use_demonstrations: replay_buffer = ReplayBuffer(int(arglist.replay_buffer_len / 2)) demo_buffer = load_demo_buffer(env_name, int(arglist.replay_buffer_len / 2)) else: replay_buffer = ReplayBuffer(arglist.replay_buffer_len) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule( schedule_timesteps=arglist.num_exploration_steps * arglist.num_episodes * arglist.max_episode_steps, initial_p=1.0, final_p=arglist.final_epsilon) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] n_episodes = 0 n_steps = 0 obs = env.reset() log_path = "./learning_curves/minerl_" + str(date.today()) + "_" + str( time.time()) + ".dat" log_file = open(log_path, "a") for episode in range(arglist.num_episodes): print("Episode: ", str(episode)) done = False episode_steps = 0 while not done: # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(n_steps))[0] new_obs, rew, done, _ = env.step(action) n_steps += 1 episode_steps += 1 # Break episode if episode_steps > arglist.max_episode_steps: done = True # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs # Store rewards episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0) n_episodes += 1 # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if (n_steps > arglist.learning_starts_at_steps) and (n_steps % 4 == 0): obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( 32) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) if arglist.use_demonstrations: if (n_steps < arglist.learning_starts_at_steps) and ( n_steps % 4 == 0): obses_t, actions, rewards, obses_tp1, dones = demo_buffer.sample( 32) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) if (n_steps > arglist.learning_starts_at_steps) and ( n_steps % 4 == 0): obses_t, actions, rewards, obses_tp1, dones = demo_buffer.sample( 32) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. 
if n_steps % arglist.target_net_update_freq == 0: update_target() # Log data for analysis if done and len(episode_rewards) % 10 == 0: logger.record_tabular("steps", n_steps) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular( "mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1)) logger.record_tabular( "% time spent exploring", int(100 * exploration.value(n_steps))) logger.dump_tabular() #TODO: Save checkpoints if n_steps % arglist.checkpoint_rate == 0: checkpoint_path = "./checkpoints/minerl_" + str( episode) + "_" + str(date.today()) + "_" + str( time.time()) + ".pkl" save_variables(checkpoint_path) print("%s,%s,%s,%s" % (n_steps, episode, round(np.mean(episode_rewards[-101:-1]), 1), int(100 * exploration.value(n_steps))), file=log_file) log_file.close()
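# --- Illustrative sketch (not part of the original sources) ---
# train_policy() above mixes a demonstration buffer with the online replay
# buffer, but load_demo_buffer itself is not shown. The hypothetical loader
# below assumes demonstrations were saved as a pickled list of
# (obs, action, reward, next_obs, done) tuples and reuses the same
# ReplayBuffer class as the training code; the file format and function name
# are assumptions, not part of the original.
import pickle


def load_demo_buffer_sketch(demo_path, buffer_len):
    demo_buffer = ReplayBuffer(buffer_len)
    with open(demo_path, "rb") as f:
        transitions = pickle.load(f)  # assumed format: list of 5-tuples
    for obs, action, rew, new_obs, done in transitions:
        demo_buffer.add(obs, action, rew, new_obs, float(done))
    return demo_buffer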
def learn(env, q_func, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, callback=None): sess = tf.Session() sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph def make_obs_ph(name): return ObservationInput(env.observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10 ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_state(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} update_eps = exploration.value(t) action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) load_state(model_file) return act
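# --- Illustrative sketch (not part of the original sources) ---
# A hypothetical way to call the learn() function defined above. The
# environment name and q_func are assumptions; only the keyword arguments come
# from learn()'s signature.
import gym
from baselines import deepq

env = gym.make("CartPole-v0")
act = learn(env,
            q_func=deepq.models.mlp([64]),
            max_timesteps=100000,
            print_freq=10,
            checkpoint_path=None)
act.save("cartpole_deepq.pkl")
env.close()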
class DeepQ(object): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" def __init__(self, env, q_func, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, max_episodes=100): self.env = env self.q_func = q_func self.lr = lr self.max_timesteps = max_timesteps self.buffer_size = buffer_size self.exploration_fraction = exploration_fraction self.exploration_final_eps = exploration_final_eps self.train_freq = train_freq self.batch_size = batch_size self.print_freq = print_freq self.checkpoint_freq = checkpoint_freq self.learning_starts = learning_starts self.gamma = gamma self.target_network_update_freq = target_network_update_freq self.prioritized_replay = prioritized_replay self.prioritized_replay_alpha = prioritized_replay_alpha self.prioritized_replay_beta0 = prioritized_replay_beta0 self.prioritized_replay_beta_iters = prioritized_replay_beta_iters self.prioritized_replay_eps = prioritized_replay_eps self.param_noise = param_noise self.callback = callback self.max_episodes = max_episodes # Create all the functions necessary to train the model self.sess = tf.Session() self.sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph self.observation_space_shape = env.observation_space.shape def make_obs_ph(self, name): return U.BatchInput(self.observation_space_shape, name=name) def make_build_train(self): # Build act and train networks self.act, self.train, self.update_target, self.debug = deepq.build_train( make_obs_ph=self.make_obs_ph, q_func=self.q_func, num_actions=self.env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=self.lr), gamma=self.gamma, grad_norm_clipping=10, param_noise=self.param_noise) self.act_params = { 'make_obs_ph': self.make_obs_ph, 'q_func': self.q_func, 'num_actions': self.env.action_space.n, } self.act = ActWrapper(self.act, self.act_params) return 'make_build_train() complete' def initialize(self): # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: self.prioritized_replay_beta_iters = self.max_timesteps self.beta_schedule = LinearSchedule( self.prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None # Create the schedule for exploration starting from 1. # self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * self.max_timesteps), # initial_p=1.0, # final_p=self.exploration_final_eps) self.exploration = ConstantSchedule(self.exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() self.update_target() return 'initialize() complete' def transfer_pretrain(self, transferred_instances, epochs, tr_batch_size, keep_in_replay_buffer=True): """ This is a custom function from University of Toronto group to first pretrain the deepq train network with transferred instances. These instances must be zip([s],[a],[r],[s']) tuples mapped over to the same state and action spaces as the target task environment. 
No output - just updates parameters of train and target networks. """ # TODO - function that trains self.act and self.train using mapped instances done = False # pack all instances into replay buffer for obs, action, rew, new_obs in transferred_instances: self.replay_buffer.add(obs, action, rew, new_obs, float(done)) for epoch in range(epochs): obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( tr_batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = self.train(obses_t, actions, rewards, obses_tp1, dones, weights) self.update_target() if keep_in_replay_buffer is not True: self.replay_buffer = ReplayBuffer(self.buffer_size) return 'transfer_pretrain() complete' def task_train(self): self.episode_rewards = [0.0] self.episode_steps = [0.0] self.saved_mean_reward = None obs = self.env.reset() reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(self.max_timesteps): if self.callback is not None: if self.callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. - self.exploration.value(t) + self.exploration.value(t) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = self.env.step(env_action) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs self.episode_rewards[-1] += rew self.episode_steps[-1] += 1 if done: obs = self.env.reset() self.episode_rewards.append(0.0) self.episode_steps.append(0.0) reset = True if t > self.learning_starts and t % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = self.train(obses_t, actions, rewards, obses_tp1, dones, weights) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities( batch_idxes, new_priorities) if t > self.learning_starts and t % self.target_network_update_freq == 0: # Update target network periodically. 
self.update_target() mean_100ep_reward = round( np.mean(self.episode_rewards[-101:-1]), 1) num_episodes = len(self.episode_rewards) if done and self.print_freq is not None and len( self.episode_rewards) % self.print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(t))) logger.dump_tabular() if (self.checkpoint_freq is not None and t > self.learning_starts and num_episodes > 100 and t % self.checkpoint_freq == 0): if self.saved_mean_reward is None or mean_100ep_reward > self.saved_mean_reward: if self.print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(self.saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True self.saved_mean_reward = mean_100ep_reward if num_episodes >= self.max_episodes: break if model_saved: if self.print_freq is not None: logger.log("Restored model with mean reward: {}".format( self.saved_mean_reward)) U.load_state(model_file) return self.act, self.episode_rewards, self.episode_steps def get_q_values(self, obs): ''' Input: obs should be a numpy array with shape (?,state_space) Output: returns Q values for each possible action with shape (?,action_space) ''' return self.debug['q_values'](obs)
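# --- Illustrative sketch (not part of the original sources) ---
# Hypothetical end-to-end usage of the DeepQ class above. The source-task data
# (src_obs, src_actions, src_rewards, src_next_obs) plus `env` and `q_func`
# are assumptions; the method call order follows the class itself.
import numpy as np

agent = DeepQ(env, q_func, prioritized_replay=False, max_episodes=200)
agent.make_build_train()
agent.initialize()

# "zip([s],[a],[r],[s'])" instances, as described in the transfer_pretrain docstring.
transferred_instances = zip(src_obs, src_actions, src_rewards, src_next_obs)
agent.transfer_pretrain(transferred_instances, epochs=50, tr_batch_size=32,
                        keep_in_replay_buffer=False)

act, episode_rewards, episode_steps = agent.task_train()
q_values = agent.get_q_values(np.array(src_obs[:1]))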
def learn(self): act = self.act train = self.train update_target = self.update_target env = self.env with self.session.as_default(): replay_buffer = ReplayBuffer(self._replay_buffer_size) exploration = LinearSchedule( schedule_timesteps=self._exploration_schedule_steps, initial_p=self._exploration_initial_prob, final_p=self._exploration_final_prob) tf_util.initialize() update_target() episode_rewards = [0.0] episode_errors = [] episode_rw_errors = [] episode_error_diffs = [] observation = env.reset() cnt = itertools.count() for t in itertools.count(): # print("iter: ", t) # Take action and update exploration to the newest value action = act(observation[None], update_eps=exploration.value(t))[0] new_observation, reward, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(observation, action, reward, new_observation, float(done)) observation = new_observation episode_rewards[-1] += reward if done: episode_errors.append(env.error) episode_rewards.append(0) if self._random_walk_sampling_args is not None: sampling_args = self._random_walk_sampling_args sampling_args.update({"graph": env.graph}) rw_error = random_walk_error(sampling_args) episode_rw_errors.append(rw_error) episode_error_diffs.append(rw_error - env.error) if len(episode_rewards) % 10 == 0: nmse = env.get_current_nmse() logger.record_tabular("steps", t) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular( "mean episode reward", round(np.mean(episode_rewards[-101:-1]), 3)) logger.record_tabular( "mean episode error", round(np.mean(episode_errors[-101:-1]), 3)) logger.record_tabular("nmse", nmse) logger.record_tabular( "sampling set", [int(v) for v in env.sampling_set]) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) if self._random_walk_sampling_args is not None: logger.record_tabular( "mean random walk error", round(np.mean(episode_rw_errors[-101:-1]), 3)) logger.record_tabular( "mean error diff", round(np.mean(episode_error_diffs[-101:-1]), 3)) logger.dump_tabular() observation = env.reset() # Minimize the Bellman equation error on replay buffer sample batch if t > 1000: (observations_t, actions, rewards, observations_tp1, dones) = replay_buffer.sample(32) train(observations_t, actions, rewards, observations_tp1, dones, np.ones_like(rewards)) if t % 1000 == 0: # Update target network periodically. update_target()
def learn( env, max_timesteps=50000000, # Human level control hyperparameters batch_size=32, buffer_size=1000000, agent_history_length=4, target_network_update_freq=10000, discount_factor=0.99, # "action_repeat=4" handled by gym environment(equivalent to frame skip) train_freq=4, # agent "update frequency" in human level control paper initial_exploration_rate=1, final_exploration_rate=0.1, final_exploration_frame=1000000, replay_start_size=50000, print_freq=10, checkpoint_freq=100, episode_render_freq=None, log_dir='./tensorboard', start_from_checkpoint=False): writer = tf.summary.FileWriter(log_dir + '/' + env.spec.id) # Linear decay as used in the deepmind paper epsilon = lambda t: max( initial_exploration_rate - (t / final_exploration_frame), final_exploration_rate) preprocess = _preprocess if len( env.observation_space.shape) == 3 else lambda x: x replay_buffer = ReplayBuffer(buffer_size) num_actions = env.action_space.n # Here, we'll use a simple feed forward nn for representing # Q(s) -> [r_1, r_2, ..., r_n] where r_k is the reward for taking action # `k` in state `s` if start_from_checkpoint: model = load_model('tmp_model', custom_objects={'huber_loss': huber_loss}) else: model = q_nn(env.observation_space, num_actions, agent_history_length) target_model = clone_model(model) # Keep some state about the current episode num_episodes = 0 episode_total_reward = 0 episode_timesteps = 0 episode_rewards = [0.0] last_checkpoint_mean_reward = -inf mean_100ep_reward = -inf # Start off with a fresh environment ob = preprocess(env.reset()) obs = [ob for i in range(agent_history_length)] # Play breakout for max_timesteps for t in range(max_timesteps): # With probability epsilon, take a random action if (random.uniform(0, 1) < epsilon(t)): action = env.action_space.sample() else: observations = np.array([obs]) actions = np.reshape(np.ones(num_actions), [1, -1]) q_values = model.predict_on_batch([observations, actions]) action = np.argmax(q_values, axis=1)[0] # Collect observations and store them for replay new_ob, reward, is_done, info = env.step(action) is_done = info['ale.lives'] != 5 new_obs = list(obs) new_obs.pop(0) new_obs.append(preprocess(new_ob)) replay_buffer.add(obs, action, reward, new_obs, is_done) obs = new_obs # Update logging info episode_total_reward += reward episode_timesteps += 1 if t > replay_start_size and t % train_freq == 0: fit_batch(model, target_model, num_actions, discount_factor, replay_buffer.sample(batch_size), writer, t // train_freq) if t > replay_start_size and t % target_network_update_freq == 0: # Must checkpoint model and clear sess to avoid OOM https://github.com/keras-team/keras/issues/5345 model.save('tmp_model') K.clear_session() target_model = load_model( 'tmp_model', custom_objects={'huber_loss': huber_loss}) model = load_model('tmp_model', custom_objects={'huber_loss': huber_loss}) print('Setting model to target model') if is_done: ob = preprocess(env.reset()) obs = np.array([ob for i in range(agent_history_length)]) episode_timesteps = 0 num_episodes += 1 episode_rewards.append(episode_total_reward) episode_total_reward = 0 if len(episode_rewards) > 100: episode_rewards.pop(0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) if is_done and num_episodes % print_freq == 0: print("timesteps", t) print("episodes run", num_episodes) print("last episode reward", episode_rewards[-1]) print("mean_100ep_reward", mean_100ep_reward) print("% time spent exploring", int(100 * epsilon(t))) if t % checkpoint_freq == 0 and mean_100ep_reward > 
last_checkpoint_mean_reward: print("Saving model due to mean reward increase: ", last_checkpoint_mean_reward, " -> ", mean_100ep_reward) model.save('models/' + env.spec.id + '_deepq.h5py') last_checkpoint_mean_reward = mean_100ep_reward if episode_render_freq is not None and num_episodes % episode_render_freq == 0: env.render()
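# --- Illustrative sketch (not part of the original sources) ---
# The Keras checkpointing above registers a `huber_loss` custom object whose
# definition is not part of this snippet. A standard formulation compatible
# with that usage could look like this:
import tensorflow as tf


def huber_loss(y_true, y_pred, delta=1.0):
    """Quadratic for small errors, linear for large ones, as in the DQN papers."""
    error = y_true - y_pred
    quadratic = tf.minimum(tf.abs(error), delta)
    linear = tf.abs(error) - quadratic
    return tf.reduce_mean(0.5 * tf.square(quadratic) + delta * linear)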
def learn(self): with U.make_session(8): # Create the environment env = gym.make(self._args.env) # Create all the functions necessary to train the model act, train, update_target, debug = deepq.build_train( make_obs_ph=lambda name: ObservationInput( env.observation_space, name=name), q_func=self.model, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer( learning_rate=self._args.learning_rate), ) # Create the replay buffer replay_buffer = ReplayBuffer(self._args.replay_buffer_size) # Create the schedule for exploration starting from 1 till min_exploration_rate. exploration = LinearSchedule( schedule_timesteps=self._args.exploration_duration, initial_p=1.0, final_p=self._args.min_exploration_rate) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] obs = env.reset() for t in itertools.count(): # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0) mean_episode_reward = np.mean(episode_rewards[-101:-1]) # Show learned agent: if mean_episode_reward >= self._render_reward_threshold: env.render() # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if t > 1000: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( 32) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. if t % 1000 == 0: update_target() if done and len(episode_rewards) % 10 == 0: self._reward_buffer_mutex.acquire() self._reward_buffer.append(mean_episode_reward) logger.record_tabular("steps", t) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular("mean episode reward", round(mean_episode_reward, 1)) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() self._reward_buffer_changed = True self._reward_buffer_mutex.release()
def main(): print('main') stats_file = pathlib.Path('stats.csv') if stats_file.exists(): stats_file.unlink() broker = dqn.env.Broker('http://localhost:5000') env = dqn.env.HaliteEnv(broker) with U.make_session(num_cpu=4): observation_shape = env.observation_space.shape def make_obs_ph(name): import dqn.tf_util as U return U.BatchInput(observation_shape, name=name) # Create all the functions necessary to train the model act, train, update_target, debug = dqn.graph.build_train( make_obs_ph=make_obs_ph, q_func=model, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), ) act = dqn.play.ActWrapper( act, { 'make_obs_ph': make_obs_ph, 'q_func': model, 'num_actions': env.action_space.n, }) # Create the replay buffer replay_buffer = ReplayBuffer(50000) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule(schedule_timesteps=30000, initial_p=1.0, final_p=0.03) # Initialize the parameters and copy them to the target network. U.initialize() update_target() learning_starts = 1000 target_network_update_freq = 500 checkpoint_freq = 20 episode_rewards = [0.0] wins = [False] saved_mean_reward = None obs = env.reset() for t in itertools.count(): # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(t))[0] new_obs, rew, done, info = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0) wins.append(info['win']) win_rate = round(np.mean(wins[-100:]), 4) is_solved = t > 100 and win_rate >= 99 if is_solved: print('solved') break else: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if t > learning_starts: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( 32) actions = np.argmax(actions, axis=1) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. if t > learning_starts and t % target_network_update_freq == 0: update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 4) num_episodes = len(episode_rewards) exploration_rate = int(100 * exploration.value(t)) if done: info = { 'date': str(dt.datetime.now()), 'episode': len(episode_rewards), **info, 'win_rate': win_rate, 'mean_100ep_reward': mean_100ep_reward, 'exploration_rate': exploration_rate, } print('episode', info) if not stats_file.exists(): with stats_file.open('w') as fp: fp.write(','.join(info.keys()) + '\n') with stats_file.open('a') as fp: fp.write(','.join(map(str, info.values())) + '\n') if done and num_episodes % 10 == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular("mean episode reward", mean_100ep_reward) logger.record_tabular("mean win rate", win_rate) logger.record_tabular("% time spent exploring", exploration_rate) logger.dump_tabular() if done and (t > learning_starts and num_episodes > 100 and num_episodes % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: logger.log( "Saving model due to mean reward increase: {} -> {}". format(saved_mean_reward, mean_100ep_reward)) act.save('dqn_model.pkl') saved_mean_reward = mean_100ep_reward act.save('dqn_model.pkl') env.close()
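# --- Illustrative sketch (not part of the original sources) ---
# Minimal helper for inspecting the stats.csv written by main() above; the
# column names follow the keys of the `info` dict used there.
import csv


def tail_stats(path="stats.csv", n=5):
    with open(path) as fp:
        rows = list(csv.DictReader(fp))
    for row in rows[-n:]:
        print(row["episode"], row["win_rate"], row["mean_100ep_reward"])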
def startTraining():
    # Create the environment
    print('START ENV', RC.GB_CLIENT_ID(), RC.gbRobotHandle())
    env = RobotOperationEnvironment(RC.GB_CLIENT_ID(), RC.GB_CSERVER_ROBOT_ID,
                                    RC.gbRobotHandle())
    #print('ACTION_SPACE', env.action_space.shape)

    # Create all the functions necessary to train the model
    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=lambda name: BatchInput(env.observation_space.shape, name=name),
        q_func=model,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
    )

    # Create the replay buffer
    replay_buffer = ReplayBuffer(50000)

    # Create the schedule for exploration starting from 1 (every action is random) down to
    # 0.02 (98% of actions are selected according to values predicted by the model).
    exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    obs = env.reset()

    print("Manipulator DEEPQ Training Experiment Start.")
    for t in itertools.count():
        print('Episode ', len(episode_rewards), 'Step ', t, '--------------')
        print('Start waiting for the next action', env._robot.getOperationState())
        while (env._robot.getOperationState() != RC.CROBOT_STATE_READY):
            time.sleep(0.01)

        # Take action and update exploration to the newest value
        action = act(obs[None], update_eps=exploration.value(t))[0]
        print('Generated action:', action)
        new_obs, rew, done, _ = env.step(action)

        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0)

        is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
        if is_solved:
            # Show off the result
            #env.render()
            pass
        else:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if t > 1000:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                print('Generated actions:', actions)
                train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
            # Update target network periodically.
            if t % 1000 == 0:
                update_target()

        if done and len(episode_rewards) % 10 == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", len(episode_rewards))
            logger.record_tabular("mean episode reward",
                                  round(np.mean(episode_rewards[-101:-1]), 1))
            logger.record_tabular("% time spent exploring",
                                  int(100 * exploration.value(t)))
            logger.dump_tabular()
class DQNAgent: def __init__(self, identifier, actions, observation_shape, num_steps, x=0.0, y=0.0): self.id = identifier self.actions = actions self.x = x self.y = y self.yellow_steps = 0 self.postponed_action = None self.obs = None self.current_action = None self.weights = np.ones(32) self.td_errors = np.ones(32) self.pre_train = 2500 self.prioritized = False self.prioritized_eps = 1e-4 self.batch_size = 32 self.buffer_size = 30000 self.learning_freq = 500 self.target_update = 5000 # Create all the functions necessary to train the model self.act, self.train, self.update_target, self.debug = deepq.build_train( make_obs_ph=lambda name: TrafficTfInput(observation_shape, name=name), q_func=dueling_model, num_actions=len(actions), optimizer=tf.train.AdamOptimizer(learning_rate=1e-4, epsilon=1e-4), gamma=0.99, double_q=True, scope="deepq" + identifier ) # Create the replay buffer if self.prioritized: self.replay_buffer = PrioritizedReplayBuffer(size=self.buffer_size, alpha=0.6) self.beta_schedule = LinearSchedule(num_steps // 4, initial_p=0.4, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). self.exploration = LinearSchedule(schedule_timesteps=int(num_steps * 0.1), initial_p=1.0, final_p=0.01) # Initialize the parameters and copy them to the target network. U.initialize() self.update_target() def take_action(self, t): if self.postponed_action is None: # Take action and update exploration to the newest value action = self.act(np.array(self.obs)[None], update_eps=self.exploration.value(t))[0] else: # Take action postponed by yellow light transition action = self.postponed_action self.postponed_action = None return action def store(self, rew, new_obs, done): # Store transition in the replay buffer. self.replay_buffer.add(self.obs, self.current_action, rew, new_obs, float(done)) def learn(self, t): # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if t > self.pre_train: if self.prioritized: experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, self.weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size) self.weights = np.ones_like(rewards) # Minimize the error in Bellman's equation and compute TD-error self.td_errors = self.train(obses_t, actions, rewards, obses_tp1, dones, self.weights) # Update the priorities in the replay buffer if self.prioritized: new_priorities = np.abs(self.td_errors) + self.prioritized_eps self.replay_buffer.update_priorities(batch_idxes, new_priorities) self.update_target_network(t) def update_target_network(self, t): # Update target network periodically. if t % self.target_update == 0: self.update_target() def add_fingerprint_to_obs(self, obs, weights, identifier, td_errors): idx = 0 for w in weights: obs[2, identifier, idx] = w idx += 1 for td in td_errors: obs[2, identifier, idx] = td idx += 1 return obs def add_fingerprint(self, weights, identifier, td_errors): self.obs = self.add_fingerprint_to_obs(self.obs, weights, identifier, td_errors)
def learn(env, q_func, policy_fn, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None): # Create all the functions necessary to train the model sess = tf.Session() sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space_shape = env.observation_space.shape def make_obs_ph(name): return BatchInput(observation_space_shape, name=name) scope = "ampi" reuse=None grad_norm_clipping=None num_actions=env.action_space.n optimizer_q=tf.train.AdamOptimizer(learning_rate=lr) optimizer_pi=tf.train.AdamOptimizer(learning_rate=lr) act = build_act(make_obs_ph, q_func, num_actions=env.action_space.n, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # add ob_space = env.observation_space ac_space = env.action_space pi, act = policy_fn(obs_t_input.get(), ob_space, ac_space, scope="pi_func") # train pi pi_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/pi_func") pi_tp1, act_tp1 = policy_fn(obs_tp1_input.get(), ob_space, ac_space, scope="target_pi_func") # target pi target_pi_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_pi_func") # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") # target q network evaluation q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # Q_{train}(a,s) q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # y_j act_best = tf.argmax(pi, axis=1) # argmax \pi(s_{j+1}) q_tp1_sampled = tf.reduce_sum(q_tp1 * tf.one_hot(act_best, num_actions), 1) # Q_{target}(s_{j+1}, argmax \pi(s_{j+1})) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_sampled q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # Regression loss td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # argmax_a Q_{target}(s_j, a) z_j = tf.argmax(q_tp1, axis=1) # max Q(s',a') # classification loss cl_error = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pi, labels=z_j) # Q optimization if grad_norm_clipping is not None: gradients_q = optimizer_q.compute_gradients(weighted_error, var_list=q_func_vars) for i, (grad, var) in enumerate(gradients_q): if grad is not None: gradients_q[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_q = optimizer_q.apply_gradients(gradients_q) else: optimize_q = 
optimizer_q.minimize(weighted_error, var_list=q_func_vars) # pi optimization if grad_norm_clipping is not None: gradients_pi = optimizer_pi.compute_gradients(cl_error, var_list=pi_func_vars) for i, (grad, var) in enumerate(gradients_pi): if grad is not None: gradients_pi[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_pi = optimizer_pi.apply_gradients(gradients_pi) else: optimize_pi = optimizer_pi.minimize(cl_error, var_list=pi_func_vars) # update_target Q update_target_expr = [] for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # update_target pi update_target_pi = [] for var, var_target in zip(sorted(pi_func_vars, key=lambda v: v.name), sorted(target_pi_func_vars, key=lambda v: v.name)): update_target_pi.append(var_target.assign(var)) update_target_pi = tf.group(*update_target_pi) # Create callable functions train = U.function( inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=[td_error, cl_error], updates=[optimize_q, optimize_pi] ) update_target = U.function([], [], updates=[update_target_expr, update_target_pi]) q_values = U.function([obs_t_input], q_t) debug = {'q_values': q_values} # Create the replay buffer replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = env.action_space.sample() # not used, just so we have the datatype stochastic=True ac1, vpred1 = act(stochastic, np.array(obs)[None]) action = ac1[0] #action, _ = pi.act(stochastic, obs) #action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() # Log train and res mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) load_state(model_file) return act
def train(model_file, game="CartPole-v1"): """Train at a game.""" with tf_util.make_session(8): env = gym.make(game) def make_placeholder(name): """Make a placeholder input.""" return tf_util.BatchInput(env.observation_space.shape, name=name) act_params = { 'make_obs_ph': make_placeholder, 'q_func': model, 'num_actions': env.action_space.n } act, train, update_target, debug = deepq.build_train( **act_params, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4) ) act = ActWrapper(act, act_params) replay_buffer = ReplayBuffer(50000) exploration = LinearSchedule( schedule_timesteps=100000, initial_p=1.0, final_p=0.02 ) tf_util.initialize() update_target() episode_rewards = [0.0] obs = env.reset() for t in itertools.count(): action = act(obs[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0) if not len(episode_rewards) % 100: env.render() if t > 1000: obses_t, actions, rewards, obses_tp1, dones = ( replay_buffer.sample(32) ) train( obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards) ) if not t % 1000: update_target() if not t % 3000: if model_file: tf_util.save_state(model_file) yield act if done and len(episode_rewards) % 10 == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular( "mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1) ) logger.record_tabular( "% time spent exploring", int(100 * exploration.value(t)) ) logger.dump_tabular()
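# train() above is a generator: every 3000 steps it yields the current act function
# (saving a checkpoint first when model_file is set). A hedged sketch of how a caller
# might consume it; the evaluation step is an assumption, not part of the original code.
for act in train("/tmp/cartpole_model", game="CartPole-v1"):
    # each yielded `act` is the latest policy; a caller could run a quick
    # evaluation episode here, or simply continue iterating to keep training
    pass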
def train_dqn(opts, seed=None, lr=1e-3, total_timesteps=500000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, checkpoint_freq=500000, learning_starts=1000, gamma=1.000, target_network_update_freq=3000, load_path=None): """ Runs the main recorder by binding certain discrete actions to keys. """ if os.path.exists(opts.model_dir): print('Path already exists. Remove? y for yes') input_char = getch.getch() if not input_char == 'y': print('Exiting') return shutil.rmtree(opts.model_dir) os.makedirs(opts.model_dir) os.makedirs(os.path.join(opts.model_dir, 'logs')) os.makedirs(os.path.join(opts.model_dir, 'weights')) #env = gym.make('MountainCar-v0') env = gym.make('LunarLander-v2') env._max_episode_steps = 1200 sess = get_session() set_global_seeds(seed) train_writer = tf.summary.FileWriter(os.path.join(opts.model_dir, 'logs'), sess.graph) q_func = build_q_func('mlp') # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10) replay_buffer = ReplayBuffer(buffer_size) # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] obs = env.reset() for t in range(total_timesteps): # Take action and update exploration to the newest value env.render() update_eps = exploration.value(t) action = act(np.array(obs)[None], update_eps=update_eps)[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: print("Exploration value: {}".format(exploration.value(t))) print("Last 25 episode rewards: {}".format(episode_rewards[-25:])) reward_summary = tf.Summary(value=[ tf.Summary.Value(tag='reward', simple_value=episode_rewards[-1]) ]) train_writer.add_summary(reward_summary, t) obs = env.reset() episode_rewards.append(0.0) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors, summary = train(obses_t, actions, rewards, obses_tp1, dones, weights) train_writer.add_summary(summary, t) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() if t > learning_starts and t % checkpoint_freq == 0: save_variables( os.path.join(opts.model_dir, 'weights', '{}.model'.format(t))) save_variables(os.path.join(opts.model_dir, 'weights', 'last.model'))
def sobolev_learn_episode( env, q_func, lr=5e-4, max_episodes=1000, buffer_size=50000, epsilon=.1, #exploration_fraction=0.1, #exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, alpha=1.0, grad_norm_clipping=10.0): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = tf.Session() sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space_shape = env.observation_space.shape def make_obs_ph(name): return U.BatchInput(observation_space_shape, name=name) act, train, update_target, debug = deepq.build_sobolev_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=grad_norm_clipping, param_noise=param_noise, alpha=alpha) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer ''' if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) ''' replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None exploration = ConstantSchedule(epsilon) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] episode_lengths = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") e = 0 # num of current episode t = 0 # timestep while e < max_episodes: if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew episode_lengths[-1] += 1 if done: obs = env.reset() episode_rewards.append(0.0) episode_lengths.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) # increment counters t += 1 # increment timestep if done: e += 1 # increment episode if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return act
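# Hedged usage sketch for sobolev_learn_episode above; the CartPole environment and
# the mlp q_func helper are illustrative assumptions, not part of the original code.
import gym
from baselines import deepq

env = gym.make("CartPole-v0")
q_func = deepq.models.mlp([64])  # assumes the classic baselines model helper
act = sobolev_learn_episode(env, q_func, lr=5e-4, max_episodes=500,
                            epsilon=0.1, alpha=1.0)
act.save("sobolev_cartpole_model.pkl")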
if is_solved: # Capture N samples and save them into a csv file env.render() if len(exp_demo) < N: temp_list = list(obs) # temp_list.append(done) # temp_list.append(action) exp_demo.append(temp_list) else: with open('mentor_demonstrations_NN.csv', 'w', newline='') as csvfile: data_writer = csv.writer(csvfile, delimiter=',') for row in exp_demo: data_writer.writerow(row) break else: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if t > 1000: obses_t, actions, rewards, obses_tp1, dones, ment_obs, ment_obs_tp1, ment_act = replay_buffer.sample(32) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. if t % 1000 == 0: update_target() if done and len(episode_rewards) % 10 == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1)) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular()
class DQN(BaseAgent): def __init__(self, env, name='default', alg_name='dqn', network_type='mini-mlp', total_timesteps=5e7, batch_size=32, lr=1e-3, gamma=0.99, buffer_size=1e6, final_eps=0.05, exploration_fraction=0.1, training_start=1e5, target_update_freq=1e4, optimizer=tf.train.AdamOptimizer, gradient_clipping=None, reward_clipping=False, tau=1., double_q=False, dueling=False, prioritized_replay=False, prioritized_replay_alpha=0.5, prioritized_replay_beta_init=0.4, prioritized_replay_beta_fraction=1.0, prioritized_replay_eps=1e-6, rolling_reward_mean=20, solved_callback=None, render_training=False, **kwargs): """ Implementation of the Deep Q Learning (DQN) algorithm formulated by Mnih et al. Contains some well-known improvements over the vanilla DQN. Parameters ---------- env: gym.Environment (gym) Environment the agent shall learn from and act on name: str descriptive name of this DQN configuration, e.g. 'atari-breakout' network_type: str which network architecture from 'networks.py' to use total_timesteps: int or float number of training timesteps batch_size: int size of minibatch per backprop lr: float learning rate gamma: float discount factor gamma for bellman target buffer_size: int or float maximum number of transitions in the replay buffer final_eps: float value to which epsilon is annealed exploration_fraction: float fraction of training timesteps over which epsilon is annealed training_start: int timestep at which training of the q network begins target_update_freq: int frequency of target network updates (in timesteps) optimizer: tf.Optimizer optimizer class which shall be used such as Adam or RMSprop gradient_clipping: int if not None, gradients are clipped by this value by norm reward_clipping: float rewards will be clipped to this value if not None tau: float interpolation constant for soft update. 1.0 corresponds to a full synchronisation of network weights, as in the original DQN paper double_q: bool enables Double Q Learning for DQN dueling: bool splits network architecture into advantage and value streams. V(s, a) gets more frequent updates, which should stabilize learning prioritized_replay: bool use (proportional) prioritized replay prioritized_replay_alpha: float alpha for weighting prioritization prioritized_replay_beta_init: float initial value of beta for prioritized replay buffer prioritized_replay_beta_fraction: float fraction of total timesteps to anneal beta to 1.0 prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. rolling_reward_mean: int window over which the rolling mean in the statistics is computed solved_callback: function function which gets the episode rewards as an array and must return a bool. if it returns True, the training is considered done and is prematurely interrupted. 
render_training: bool whether to render the environment while training """ # instance name self.name = name # environment to act on / learn from self.env = env # basic DQN parameters self.total_timesteps = float(total_timesteps) self.buffer_size = int(float(buffer_size)) self.batch_size = batch_size self.final_eps = final_eps self.lr = float(lr) self.gamma = float(gamma) self.exploration_fraction = float(exploration_fraction) self.training_start = int(float(training_start)) self.target_update_freq = int(float(target_update_freq)) # tf.Optimizer self.optimizer = optimizer # minor changes as suggested in some papers self.gradient_clipping = int( gradient_clipping) if gradient_clipping is not None else None self.reward_clipping = int( reward_clipping) if reward_clipping is not None else None # enhancements to DQN published in papers self.tau = float(tau) self.double_q = double_q self.dueling = dueling self.prioritized_replay = prioritized_replay self.prioritized_replay_alpha = float(prioritized_replay_alpha) self.prioritized_replay_beta_init = float(prioritized_replay_beta_init) self.prioritized_replay_beta_fraction = float( prioritized_replay_beta_fraction) self.prioritized_replay_eps = float(prioritized_replay_eps) # function to determine whether agent is able to act well enough self.solved_callback = solved_callback # call env.render() each training step self.render_training = render_training # sliding window for reward calc self.rolling_reward_mean = rolling_reward_mean # stores latest measure for best policy, e.g. best mean over last N episodes self.latest_best = 0.0 super().__init__(env, alg_name, name, **kwargs) # calculate timestep where epsilon reaches its final value self.schedule_timesteps = int(self.total_timesteps * self.exploration_fraction) # sanity checks assert 0.0 < self.tau <= 1.0 # env specific parameter self.obs_shape = env.observation_space.shape self.num_actions = env.action_space.n # tf scopes self.Q_SCOPE = 'q_network' self.TARGET_SCOPE = 'target_network' # build Q and target network; using different scopes to distinguish variables for gradient computation self.q_t_in, self.q_t = build_network(self.obs_shape, self.num_actions, network_type=network_type, dueling=self.dueling, scope=self.Q_SCOPE, summaries=True) self.target_tp1_in, self.target_tp1 = build_network( self.obs_shape, self.num_actions, dueling=self.dueling, network_type=network_type, scope=self.TARGET_SCOPE) # double Q learning needs to pass observations t+1 to the q networks for action selection # so we reuse already created q network variables but with different input if self.double_q: self.q_tp1_in, self.q_tp1 = build_network( self.obs_shape, self.num_actions, dueling=self.dueling, network_type=network_type, scope=self.Q_SCOPE, reuse=True) # create replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, self.prioritized_replay_alpha) else: self.replay_buffer = ReplayBuffer(self.buffer_size) # list of variables of the different networks. 
required for copying # Q to target network and excluding target network variables from backprop self.q_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.Q_SCOPE) self.target_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.TARGET_SCOPE) # placeholders used in loss function self._L_r = tf.placeholder(tf.float32, (None, ), name='loss_rewards') self._L_a = tf.placeholder(tf.int32, (None, ), name='loss_actions') self._L_d = tf.placeholder(tf.float32, (None, ), name='loss_dones') # pointer to td error vector self._td_errors = tf.placeholder(tf.float32, (None, ), name='td_errors') # configure prioritized replay if self.prioritized_replay: self._is_weights = tf.placeholder( tf.float32, (None, ), name='importance_sampling_weights') # schedule for PR beta beta_steps = int(self.total_timesteps * self.prioritized_replay_beta_fraction) self.pr_beta = LinearSchedule( beta_steps, initial_p=prioritized_replay_beta_init, final_p=1.0) # epsilon schedule self.eps = LinearSchedule(self.schedule_timesteps, final_p=final_eps) # init optimizer self.opt = self.optimizer(self.lr) # specify loss function, only include Q network variables for gradient computation self.gradients = self.opt.compute_gradients(self._loss(), var_list=self.q_net_vars) # clip gradients by norm if self.gradient_clipping is not None: for idx, (grad, var) in enumerate(self.gradients): if grad is not None: self.gradients[idx] = (tf.clip_by_norm( grad, self.gradient_clipping), var) # create training op self.train_op = self.opt.apply_gradients(self.gradients) # update_target_fn will be called periodically to copy Q network to target Q network # variable lists are sorted by name to ensure that correct values are copied self.update_target_ops = [] for var_q, var_target in zip( sorted(self.q_net_vars, key=lambda v: v.name), sorted(self.target_net_vars, key=lambda v: v.name)): v_update = var_target.assign(self.tau * var_q + (1 - self.tau) * var_target) self.update_target_ops.append(v_update) self.update_target_ops = tf.group(*self.update_target_ops) # global tf.Session and Graph init self.sess = tf.Session() # init tensorboard, variables and debug self._finalize_init() # sync networks before training self.sess.run(self.update_target_ops) def _setup_tensorboard(self): """ Adds all variables that might help debugging to Tensorboard. At the end, the FileWriter is constructed pointing to the specified directory. 
""" # more placeholders for summarised variables; along with summaries self.eps_ph = tf.placeholder(tf.float32, (), name='epsilon') self.rew_ph = tf.placeholder(tf.float32, (), name='rolling-reward') scalar_summary('epsilon', self.eps_ph) scalar_summary('reward', self.rew_ph) # display q_values while training for a_i in range(self.num_actions): scalar_summary('QTa_{}'.format(a_i + 1), tf.reduce_mean(self.target_tp1[:, a_i]), scope='Q-Values') scalar_summary('Qa_{}'.format(a_i + 1), tf.reduce_mean(self.q_t[:, a_i]), scope='Q-Values') # plot network weights with tf.variable_scope('weights'): for qv in self.q_net_vars: tf.summary.histogram('{}'.format(qv.name), qv) for tv in self.target_net_vars: tf.summary.histogram('{}'.format(tv.name), tv) # gradient histograms with tf.variable_scope('gradients'): for g in self.gradients: tf.summary.histogram('{}-grad'.format(g[1].name), g[0]) def _loss(self): """ Defines loss as layed out in the original Nature paper """ with tf.variable_scope('loss'): # either use maximum target q or use value from target network while the action is chosen by the q net if self.double_q: act_tp1_idxs = tf.stop_gradient(tf.argmax(self.q_tp1, axis=1)) q_tp1 = tf.reduce_sum( self.target_tp1 * tf.one_hot(act_tp1_idxs, self.num_actions), axis=1) else: q_tp1 = tf.reduce_max(self.target_tp1, axis=1) # bellman target y = self._L_r + (self.gamma * (1.0 - self._L_d) * q_tp1) # select q value of taken action qj = tf.reduce_sum(self.q_t * tf.one_hot(self._L_a, self.num_actions), axis=1) # TD errors self._td_errors = qj - y # apply huber loss loss = tf.losses.huber_loss(y, qj) if self.use_tensorboard: scalar_summary('target', tf.reduce_mean(y)) scalar_summary('huber-loss', tf.reduce_mean(loss)) tf.summary.histogram('selected_Q', qj) # importance sampling weights if self.prioritized_replay: updates = tf.reduce_mean(self._is_weights * loss) else: updates = tf.reduce_mean(loss) return updates def _build_feed_dict(self, obs_t, ac_t, rew_t, obs_tp1, dones, eps, rolling_rew, weights=None): """ Takes minibatch and returns feed dict for a tf.Session based on the algorithms configuration. """ # first, add data required in all DQN configs feed_d = { self.q_t_in: obs_t, self.target_tp1_in: obs_tp1, self._L_r: rew_t, self._L_a: ac_t, self._L_d: dones } # pass obs t+1 to q network if self.double_q: feed_d[self.q_tp1_in] = obs_tp1 # importance sampling weights if self.prioritized_replay: feed_d[self._is_weights] = weights # variables only necessary for TensorBoard visualisation if self.use_tensorboard: feed_d[self.eps_ph] = eps feed_d[self.rew_ph] = rolling_rew return feed_d def learn(self): """ Learns Q function for a given amount of timesteps """ # reset env, store first observation obs_t = self.env.reset() # save all episode rewards episode_reward_series = [[0.0]] episode_rewards = [] self.logger.info( 'Starting Exploration, training will start at step {}.'.format( self.training_start)) for t in tqdm(range(int(self.total_timesteps))): # decide on action either by policy or chose a random one epsilon = self.eps.value(t) _rand = np.random.choice([True, False], p=[epsilon, 1 - epsilon]) if _rand: action = self.env.action_space.sample() else: action = np.argmax(self.sess.run(self.q_t, {self.q_t_in: [obs_t]}), axis=1) assert len(action) == 1, 'only one action can be taken!' 
action = action[0] # act on environment with chosen action obs_tp1, reward, done, _ = self.env.step(action) # clip reward if self.reward_clipping: reward = 1 if reward > 0 else -1 if reward < 0 else 0 # store new transition self.replay_buffer.add(obs_t, action, reward, obs_tp1, float(done)) # new observation will be current one in next iteration obs_t = obs_tp1 # append current rewards to episode reward series episode_reward_series[-1].append(reward) if self.render_training: self.env.render() if t == self.training_start: self.logger.info('Training starts now! (t = {})'.format(t)) # final calculations and env reset if done: # calculate total reward episode_rewards.append(np.sum(episode_reward_series[-1])) episode_reward_series.append([0.0]) # reset env to initial state obs_t = self.env.reset() # start training after warmup period if t >= self.training_start: # calculate rolling reward rolling_r = np.mean(episode_rewards[-self.rolling_reward_mean:] ) if len(episode_rewards) > 0 else 0.0 # post episode stuff: printing and saving if done: result_table = [['t', t], ['episode', len(episode_rewards)], ['mean_reward [20]', rolling_r], ['epsilon', epsilon]] print('\n{}'.format(tabulate(result_table))) # if the policy improved, save as new best ... achieving a good reward in one episode # might not be the best metric. continuously achieving good rewards would better if len(episode_rewards) >= 25: mr = np.mean( episode_rewards[-self.rolling_reward_mean:]) if mr >= self.latest_best: self.latest_best = mr self.logger.info( 'Saving new best policy with mean[{}]_r = {} ...' .format(self.rolling_reward_mean, mr)) self._save('best') # save latest policy self._save() # write current values to csv log self.csvlog.write('{}, {}, {}\n'.format( len(episode_rewards), epsilon, episode_rewards[-1])) # sample batch of transitions randomly for training and build feed dictionary # prioritized replay needs a beta and returns weights. if self.prioritized_replay: o_t, a_t, r_t, o_tp1, do, is_ws, batch_idxs = self.replay_buffer.sample( self.batch_size, self.pr_beta.value(t)) feed = self._build_feed_dict(o_t, a_t, r_t, o_tp1, do, epsilon, rolling_r, weights=is_ws) else: o_t, a_t, r_t, o_tp1, do = self.replay_buffer.sample( self.batch_size) feed = self._build_feed_dict(o_t, a_t, r_t, o_tp1, do, epsilon, rolling_r) # run training (and summary) operations if self.use_tensorboard: summary, _, td_errors = self.sess.run( [self.merge_op, self.train_op, self._td_errors], feed_dict=feed) self.writer.add_summary(summary, t) else: self.sess.run(self.train_op, feed_dict=feed) # new td errors needed to update buffer weights if self.prioritized_replay: new_prios = np.abs(td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities(batch_idxs, new_prios) # sync target network every C steps if (t - self.training_start) % self.target_update_freq == 0: self.sess.run(self.update_target_ops) if self.solved_callback is not None: if self.solved_callback(episode_rewards): self.logger.info('Solved!') break # total reward of last episode episode_rewards.append(np.sum(episode_reward_series[-1])) # finalize training, e.g. 
set flags, write done-file self._finalize_training() def run(self, render=True): """ Runs policy on given environment """ if not self.is_trained: self.logger.warning('Trying to run untrained model!') # set necessary parameters to their defaults epsilon = self.final_eps reward = 0.0 obs = self.env.reset() while True: # decide on action either by policy or chose a random one _rand = np.random.choice([True, False], p=[epsilon, 1 - epsilon]) if _rand: action = self.env.action_space.sample() else: action = np.argmax(self.sess.run(self.q_t, {self.q_t_in: [obs]}), axis=1) assert len(action) == 1, 'only one action can be taken!' action = action[0] # act on environment with chosen action obs, rew, done, _ = self.env.step(action) reward += rew if render: self.env.render() if done: self.logger.info('Done! Reward {}'.format(reward)) reward = 0.0 obs = self.env.reset()
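# Hedged usage sketch for the DQN agent class above; 'CartPole-v1' and the chosen
# hyperparameters are illustrative assumptions, not part of the original code.
import gym

env = gym.make("CartPole-v1")
agent = DQN(env, name="cartpole-test", total_timesteps=1e5, buffer_size=5e4,
            training_start=1e3, target_update_freq=1e3,
            double_q=True, dueling=True, prioritized_replay=False)
agent.learn()            # fills the buffer during warmup, then trains the Q network
agent.run(render=True)   # greedy rollout with the trained policy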
def main(): with U.make_session(8): env = gym.make("Pendulum-v0") act, train, update_target, debug = deepq.build_train( make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name), q_func=model, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), ) # Create the replay buffer replay_buffer = ReplayBuffer(50000) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] obs = env.reset() for t in itertools.count(): env.render() # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0) is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200 if is_solved: # Show off the result env.render() else: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if t > 1000: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( 32) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. if t % 1000 == 0: update_target() if done and len(episode_rewards) % 10 == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular( "mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1)) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular()
def main(): MAX_BUFFER_SIZE = 100000 MAX_EPISODES = 10000 TRAIN_EPISODE = 100 TARGET_UPDATE_EPS = 1000 batch_size = 32 n_size = 84 discount = 0.99 checkpoint_dir = './checkpoints' save_file_name = 'mario_weight_2.ckpt' # 1. Create gym environment env = gym.make("ppaquette/SuperMarioBros-1-1-v0") # 2. Apply action space wrapper env = MarioActionSpaceWrapper(env) # 3. Apply observation space wrapper to reduce input size env = ProcessFrame84(env) #replay_buffer = PrioritizedReplayBuffer(MAX_BUFFER_SIZE, alpha=prioritized_replay_alpha) replay_buffer = ReplayBuffer(MAX_BUFFER_SIZE) sess = tf.Session() mainDQN = DQN(sess, name="main") targetDQN = DQN(sess, name="target") dqn_var_list = targetDQN.var_list sess.run(tf.global_variables_initializer()) copy_ops = get_copy_var_ops(dest_scope_name="target", src_scope_name="main") sess.run(copy_ops) saver = tf.train.Saver(var_list=dqn_var_list) for eps in range(MAX_EPISODES): # decaying epsilon greedy e = 1. / ((eps / 10) + 1) done = False step_count = 0 state = env.reset() state_queue = deque(maxlen=4) next_state_queue = deque(maxlen=4) state_queue.append(state) next_state_queue.append(state) prev_100 = 0 curr_100 = 0 while not done: step_count += 1 # cumulate 4 frames if step_count < 4: action = env.action_space.sample() next_state, reward, done, _ = env.step(action) state_queue.append(next_state) next_state_queue.append(next_state) continue # training starts if np.random.rand() < e: action = env.action_space.sample() else: # Choose an action by greedily from the Q-network action = np.argmax( mainDQN.predict( np.reshape(np.array(state_queue), [1, n_size, n_size, 4]))) # Get new state and reward from environment next_state, reward, done, _ = env.step(action) if done: # Penalty reward = -100 curr_100 += reward next_state_queue.append(next_state) replay_buffer.add(np.array(state_queue), action, reward, np.array(next_state_queue), done) if step_count % TRAIN_EPISODE == 0: states, actions, rewards, next_states, _ = replay_buffer.sample( batch_size) states, next_states = np.reshape( states, [batch_size, n_size, n_size, 4]), np.reshape( next_states, [batch_size, n_size, n_size, 4]) Q_t = targetDQN.predict(next_states) Q_m = mainDQN.predict(states) Q_t = np.max(Q_t, axis=1) estimates = rewards + discount * Q_t Q_m[np.arange(batch_size), actions] = estimates loss = mainDQN.update(states, Q_m) print("eps: {} step: {} loss: {}".format( eps, step_count, loss)) if curr_100 > prev_100: save_path = saver.save( sess, os.path.join(checkpoint_dir, save_file_name)) print("Model saved in file: %s" % save_path) prev_100 = curr_100 curr_100 = 0 if step_count % TARGET_UPDATE_EPS == 0: sess.run(copy_ops) state_queue.append(next_state)
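# The Mario script above calls a get_copy_var_ops helper that is not shown. A common
# sketch (an assumption based on the usual main -> target copy pattern, matching the
# keyword arguments used above) looks like this:
import tensorflow as tf

def get_copy_var_ops(dest_scope_name, src_scope_name):
    """Return assign ops that copy trainable variables from src scope to dest scope."""
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)
    op_holder = []
    for src_var, dest_var in zip(sorted(src_vars, key=lambda v: v.name),
                                 sorted(dest_vars, key=lambda v: v.name)):
        op_holder.append(dest_var.assign(src_var.value()))
    return op_holder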
action = act(obs[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0) is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200 if is_solved: # Show off the result env.render() else: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if t > 1000: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. if t % 1000 == 0: update_target() if done and len(episode_rewards) % 10 == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1)) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular()
class MemBufferThread(threading.Thread): # note the concept of variable arguments def __init__(self, mem_queue, max_timesteps=1000000, buffer_size=50000, batch_size=32, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6): threading.Thread.__init__(self) self.mem_queue = mem_queue self.prioritized_replay = prioritized_replay self.batch_size = batch_size self.batch_idxes = None self.prioritized_replay_eps = prioritized_replay_eps # Create the replay buffer if prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(buffer_size) self.beta_schedule = None def __len__(self): return self.replay_buffer.__len__() def sample(self, t): if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(t)) # the choice of this t is still up for discussion, (obses_t, actions, rewards, obses_tp1, dones, weights, self.batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) # np.ones_like() : Return an array of ones with the same shape and type as a given array. weights, self.batch_idxes = np.ones_like(rewards), None return obses_t, actions, rewards, obses_tp1, dones, weights def update_priorities(self, td_errors): new_priorities = np.abs(td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities(self.batch_idxes, new_priorities) def run(self): # flag = 1 while True: if self.mem_queue.full(): print("the mem_queue is full") # if self.replay_buffer.__len__() >= 100000 and self.replay_buffer.__len__() % 100 == 0: # bool(flag): # # print("replay_buffer is 100000 !") # print('') # flag = 0 if not self.mem_queue.empty(): single_mem = self.mem_queue.get() self.replay_buffer.add(single_mem[0], single_mem[1], single_mem[2], single_mem[3], single_mem[4])
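# Hedged usage sketch for MemBufferThread above: an actor pushes transitions onto a
# queue, the buffer thread drains it in the background, and the trainer samples
# batches. Queue size and thresholds are illustrative assumptions.
from queue import Queue

mem_queue = Queue(maxsize=10000)
buffer_thread = MemBufferThread(mem_queue, buffer_size=50000, batch_size=32)
buffer_thread.daemon = True
buffer_thread.start()

# actor side: mem_queue.put((obs, action, rew, new_obs, float(done)))
# trainer side, once enough transitions are buffered:
# if len(buffer_thread) > 1000:
#     obses_t, actions, rewards, obses_tp1, dones, weights = buffer_thread.sample(t)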
and info['ale.lives'] > 0) prev_lives = info['ale.lives'] replay_buffer.add(obs, action, np.sign(rew), new_obs, float(death)) obs = new_obs episode_rewards[-1] += rew if done: log.add_scalar('reward', episode_rewards[-1], num_iters) episode_rewards.append(0.0) obs = env.reset() num_episodes += 1 if num_iters > args.learning_starts and num_iters % args.learning_freq == 0: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( args.batch_size) # Reshape state to (batch, channels, x_dim, y_dim) obses_t = np.transpose(obses_t, [0, 3, 1, 2]) obses_tp1 = np.transpose(obses_tp1, [0, 3, 1, 2]) # TODO td_errors = agent.learn(obses_t, actions, rewards, obses_tp1, dones) td_errors_list.append(td_errors.item()) log.add_scalar('td_error', td_errors.item(), num_iters) num_updates += 1 # Update target network. if num_iters > args.learning_starts and num_iters % args.target_update_freq == 0: # TODO
def evaluate(self, num_episodes, render=False): with U.make_session(NUM_CORES): self.t0 = time.time() env = self.env.env # Create all the functions necessary to train the model act, train, update_target, debug = deepq.build_train( make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name), q_func=model, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4) ) # Create the replay buffer replay_buffer = ReplayBuffer(50000) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) # Initialize the parameters and copy them to the target network. U.initialize() update_target() self.episode_count += 1 state = env.reset() self.scores = [0.0] episode_q = [] for t in itertools.count(): action = act(state[None], update_eps=exploration.value(t))[0] observation, reward, done, _ = env.step(action) replay_buffer.add(state, action, reward, observation, float(done)) state = observation self.scores[-1] += reward episode_q.append(float(debug['q_values'](state[None]).max())) if render: env.render() if done: print('{0}, score: {1} ({2})'.format(len(self.scores), self.scores[-1], np.mean(self.scores[-100:]))) self.evaluation.info['q_values'].append(np.mean(episode_q)) if len(self.scores) >= num_episodes: return self.final_evaluation() state = env.reset() episode_q = [] self.scores.append(0) if self.env.solved(self.scores): self.evaluation.info['solved'] = len(self.scores) # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if t > 1000: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. if t % 1000 == 0: update_target() U.reset() return self.final_evaluation()
class DQNLearningAgent(Agent): def __init__( self, env, # observation_space, # action_space, network=None, scope='deepq', seed=None, lr=None, # Was 5e-4 lr_mc=5e-4, total_episodes=None, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=None, # was 0.02 train_freq=1, train_log_freq=100, batch_size=32, print_freq=100, checkpoint_freq=10000, # checkpoint_path=None, learning_starts=1000, gamma=None, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, save_path=None, load_path=None, save_reward_threshold=None, **network_kwargs): super().__init__(env, seed) if train_log_freq % train_freq != 0: raise ValueError( 'Train log frequency should be a multiple of train frequency') elif checkpoint_freq % train_log_freq != 0: raise ValueError( 'Checkpoint freq should be a multiple of train log frequency, or model saving will not be logged properly' ) print('init dqnlearningagent') self.train_log_freq = train_log_freq self.scope = scope self.learning_starts = learning_starts self.save_reward_threshold = save_reward_threshold self.batch_size = batch_size self.train_freq = train_freq self.total_episodes = total_episodes self.total_timesteps = total_timesteps # TODO: scope not doing anything. if network is None and 'lunar' in env.unwrapped.spec.id.lower(): if lr is None: lr = 1e-3 if exploration_final_eps is None: exploration_final_eps = 0.02 #exploration_fraction = 0.1 #exploration_final_eps = 0.02 target_network_update_freq = 1500 #print_freq = 100 # num_cpu = 5 if gamma is None: gamma = 0.99 network = 'mlp' network_kwargs = { 'num_layers': 2, 'num_hidden': 64, } self.target_network_update_freq = target_network_update_freq self.gamma = gamma get_session() # set_global_seeds(seed) # TODO: Check whether below is ok to substitue for set_global_seeds. 
try: import tensorflow as tf tf.set_random_seed(seed) except ImportError: pass self.q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph def make_obs_ph(name): return ObservationInput(env.observation_space, name=name) act, self.train, self.train_mc, self.update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=self.q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), optimizer_mc=tf.train.AdamOptimizer(learning_rate=lr_mc), gamma=gamma, grad_norm_clipping=10, param_noise=False, scope=scope, # reuse=reuse, ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': self.q_func, 'num_actions': env.action_space.n, } self._act = ActWrapper(act, act_params) self.print_freq = print_freq self.checkpoint_freq = checkpoint_freq # Create the replay buffer self.prioritized_replay = prioritized_replay self.prioritized_replay_eps = prioritized_replay_eps if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha, ) if prioritized_replay_beta_iters is None: if total_episodes is not None: raise NotImplementedError( 'Need to check how to set exploration based on episodes' ) prioritized_replay_beta_iters = total_timesteps self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0, ) else: self.replay_buffer = ReplayBuffer(buffer_size) self.replay_buffer_mc = ReplayBuffer(buffer_size) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int( exploration_fraction * total_timesteps if total_episodes is None else total_episodes), initial_p=1.0, final_p=exploration_final_eps, ) # Initialize the parameters and copy them to the target network. 
U.initialize() self.update_target() self.episode_lengths = [0] self.episode_rewards = [0.0] self.discounted_episode_rewards = [0.0] self.start_values = [None] self.lunar_crashes = [0] self.lunar_goals = [0] self.saved_mean_reward = None self.td = None if save_path is None: self.td = tempfile.mkdtemp() outdir = self.td self.model_file = os.path.join(outdir, "model") else: outdir = os.path.dirname(save_path) os.makedirs(outdir, exist_ok=True) self.model_file = save_path print('DQN agent saving to:', self.model_file) self.model_saved = False if tf.train.latest_checkpoint(outdir) is not None: # TODO: Check scope addition load_variables(self.model_file, scope=self.scope) # load_variables(self.model_file) logger.log('Loaded model from {}'.format(self.model_file)) self.model_saved = True raise Exception('Check that we want to load previous model') elif load_path is not None: # TODO: Check scope addition load_variables(load_path, scope=self.scope) # load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) self.train_log_file = None if save_path and load_path is None: self.train_log_file = self.model_file + '.log.csv' with open(self.train_log_file, 'w') as f: cols = [ 'episode', 't', 'td_max', 'td_mean', '100ep_r_mean', '100ep_r_mean_discounted', '100ep_v_mean', '100ep_n_crashes_mean', '100ep_n_goals_mean', 'saved_model', 'smoothing', ] f.write(','.join(cols) + '\n') self.training_episode = 0 self.t = 0 self.episode_t = 0 """ n = observation_space.n m = action_space.n self.Q = np.zeros((n, m)) self._lr_schedule = lr_schedule self._eps_schedule = eps_schedule self._boltzmann_schedule = boltzmann_schedule """ # Make placeholder for Q values self.q_values = debug['q_values'] def _log_training_details( self, episode=None, t=None, td_max=None, td_mean=None, r_mean=None, r_mean_discounted=None, v_mean=None, n_crashes_mean=None, n_goals_mean=None, saved_model=False, smoothing=False, ): if self.train_log_file is not None: with open(self.train_log_file, 'a+') as f: f.write('{}\n'.format(','.join([ str(episode), str(t), '{:.5f}'.format(td_max) if td_max is not None else '', '{:.5f}'.format(td_mean) if td_mean is not None else '', '{:.1f}'.format(r_mean) if r_mean is not None else '', '{:.1f}'.format(r_mean_discounted) if r_mean_discounted is not None else '', '{:.1f}'.format(v_mean) if v_mean is not None else '', '{:.1f}'.format(n_crashes_mean) if n_crashes_mean is not None else '', '{:.1f}'.format(n_goals_mean) if n_goals_mean is not None else '', str(int(saved_model)), str(int(smoothing)), ]))) def get_q_values(self, s): return self.q_values(s)[0] """ q_t = self.q_func( self.obs_t_input.get(), self.n_actions, scope='q_func', reuse=True, # reuse parameters from act ) Q = sess.run( Q_values, feed_dict={Q_obs: np.array(states)} ) raise NotImplementedError """ def act(self, s, explore, explore_eps=None): # Take action and update exploration to the newest value # get_session() obs = s if explore and explore_eps is None: update_eps = self.exploration.value( self.t if self.total_episodes is None else self. training_episode) elif explore: update_eps = explore_eps else: update_eps = 0 return self._act( np.array(obs)[None], update_eps=update_eps, )[0] def smooth( self, behavior_policy, evaluation_timesteps, max_k_random_actions=50, ): """Sample episodes to use for monte-carlo rollouts.""" obs = self.env.reset() ep = 0 episode_rewards = [] episode_states = [] episode_actions = [] # TODO: Don't hard-code, and bias towards smaller. 
def get_random_k_t(): k_random = self.np_random.randint(0, max_k_random_actions) random_t = self.np_random.randint(k_random, 200) return k_random, random_t k_random_actions, random_t = get_random_k_t() for t in range(evaluation_timesteps): episode_t = len(episode_actions) if IS_LOCAL and episode_t >= random_t: self.env.render() if episode_t < k_random_actions or episode_t == random_t: next_action = behavior_policy.act( obs, explore=True, explore_eps=1, ) else: next_action = behavior_policy.act(obs, explore=False) obs1, reward, done, _ = self.env.step(next_action) episode_rewards.append(reward) episode_states.append(obs) episode_actions.append(next_action) obs = obs1 if done: for i, (o, a) in enumerate( zip(episode_states[random_t:], episode_actions[random_t:])): weighted_rewards = [ r * self.gamma**j for j, r in enumerate(episode_rewards[random_t + i:]) ] reward_to_go = sum(weighted_rewards) self.replay_buffer_mc.add( o, a, reward_to_go, None, None, ) # Update model. obses_t, actions, rewards, _, _ = self.replay_buffer_mc.sample( self.batch_size) weights = np.ones_like(rewards) td_errors = self.train_mc(obses_t, actions, rewards, weights) # print(rewards) # print(td_errors) #print(self.get_q_values(o)[a], reward_to_go) # print('----') simulated_t = t - len(episode_rewards) + random_t + i if simulated_t % self.train_log_freq == 0: self._log_training_details( episode=ep, t=simulated_t, td_max=np.max(np.abs(td_errors)), td_mean=np.mean(np.abs(td_errors)), smoothing=True, ) # Save model if (self.checkpoint_freq is not None and simulated_t % self.checkpoint_freq == 0): if self.print_freq is not None: logger.log("Saving model due to smoothing") # TODO: Check scope addition save_variables(self.model_file, scope=self.scope) # save_variables(self.model_file) self.model_saved = True obs = self.env.reset() episode_rewards = [] episode_states = [] episode_actions = [] ep += 1 k_random_actions, random_t = get_random_k_t() """ # Finish obs = obs1 self.t += 1 if done: self.episode_rewards.append(0.0) self.training_episode += 1 obs = self.env.reset() """ # TODO: Check that model isn't getting worse? # TODO: Reload last best saved model like in self.end_learning? @property def mean_100ep_reward(self): return round(np.mean(self.episode_rewards[-101:-1]), 1) @property def mean_100ep_discounted_reward(self): return round(np.mean(self.discounted_episode_rewards[-101:-1]), 1) @property def mean_100ep_start_value(self): return round(np.mean(self.start_values[-100:]), 1) @property def mean_100ep_lunar_crashes(self): return round(np.mean(self.lunar_crashes[-100:]), 1) @property def mean_100ep_lunar_goals(self): return round(np.mean(self.lunar_goals[-100:]), 1) @property def mean_100ep_length(self): return round(np.mean(self.episode_lengths[-100:]), 1) def update(self, s, a, s1, r, done, verbose=False, freeze_buffer=False): # get_session() obs = s new_obs = s1 action = a rew = r # Store transition in the replay buffer. 
        if not freeze_buffer:
            self.replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        # Episode statistics.
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        self.discounted_episode_rewards[-1] += rew * \
            self.gamma ** self.episode_t
        if self.start_values[-1] is None:
            self.start_values[-1] = max(self.get_q_values(s))
        # LunarLander-specific reward values: -100 on crash, +100 on landing.
        if rew == -100:
            self.lunar_crashes[-1] = 1
        elif rew == 100:
            self.lunar_goals[-1] = 1

        mean_100ep_reward = self.mean_100ep_reward

        td_errors = None
        if self.t > self.learning_starts and self.t % self.train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if self.prioritized_replay:
                experience = self.replay_buffer.sample(
                    self.batch_size,
                    beta=self.beta_schedule.value(self.t),
                )
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                    self.batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            td_errors = self.train(obses_t, actions, rewards, obses_tp1, dones,
                                   weights)
            if self.prioritized_replay:
                new_priorities = np.abs(td_errors) + \
                    self.prioritized_replay_eps
                self.replay_buffer.update_priorities(batch_idxes,
                                                     new_priorities)

        if self.t > self.learning_starts and self.t % self.target_network_update_freq == 0:
            # Update target network periodically.
            self.update_target()

        saved = False
        if (self.checkpoint_freq is not None and self.t > self.learning_starts
                and self.training_episode > 100
                and self.t % self.checkpoint_freq == 0):
            if (self.saved_mean_reward is None
                    or mean_100ep_reward > self.saved_mean_reward
                    or (self.save_reward_threshold is not None
                        and mean_100ep_reward >= self.save_reward_threshold)):
                saved = True
                if self.print_freq is not None:
                    logger.log(
                        "Saving model due to mean reward increase (or mean reward above {}): {} -> {}"
                        .format(
                            self.save_reward_threshold
                            if self.save_reward_threshold is not None else 'NULL',
                            self.saved_mean_reward, mean_100ep_reward))
                # TODO: Check scope addition
                save_variables(self.model_file, scope=self.scope)
                # save_variables(self.model_file)
                self.model_saved = True
                self.saved_mean_reward = mean_100ep_reward

        if self.t > self.learning_starts and self.t % self.train_log_freq == 0:
            # td_errors may be None when this step did not include a training update.
            self._log_training_details(
                episode=self.training_episode,
                t=self.t,
                td_max=np.max(np.abs(td_errors)) if td_errors is not None else None,
                td_mean=np.mean(np.abs(td_errors)) if td_errors is not None else None,
                r_mean=mean_100ep_reward,
                r_mean_discounted=self.mean_100ep_discounted_reward,
                v_mean=self.mean_100ep_start_value,
                n_crashes_mean=self.mean_100ep_lunar_crashes,
                n_goals_mean=self.mean_100ep_lunar_goals,
                saved_model=saved,
            )

        self.t += 1
        self.episode_t += 1
        if done:
            self.start_values.append(None)
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0)
            self.lunar_crashes.append(0)
            self.lunar_goals.append(0)
            self.discounted_episode_rewards.append(0.0)
            self.training_episode += 1
            self.episode_t = 0

    def end_learning(self):
        if self.model_saved:
            if self.print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    self.saved_mean_reward))
            # TODO: Check scope addition
            load_variables(self.model_file, scope=self.scope)
            # load_variables(self.model_file)

    def close(self):
        if self.td is not None:
            import shutil
            shutil.rmtree(self.td)
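
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the agent above): how the discounted
# reward-to-go targets used by smooth() can be computed. The names `rewards`
# and `gamma` are stand-ins chosen for this example; the agent computes the
# same quantity inline from its per-episode reward list and self.gamma.
# ---------------------------------------------------------------------------
def discounted_rewards_to_go(rewards, gamma):
    """Return G_i = sum_j gamma**j * r_{i+j} for every step i of an episode."""
    returns = []
    running = 0.0
    # Walk backwards so each step reuses the already-discounted suffix sum.
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    return list(reversed(returns))

# Example: rewards [1, 2, 3] with gamma = 0.5 give
# [1 + 0.5*2 + 0.25*3, 2 + 0.5*3, 3] = [2.75, 3.5, 3.0].
# discounted_rewards_to_go([1.0, 2.0, 3.0], 0.5)  -> [2.75, 3.5, 3.0]
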
            # Fragment of a separate training-loop script; its setup (env, act,
            # train, update_target, replay_buffer, exploration, y_s, and the
            # enclosing loops over i, j, t) is defined earlier in that script.
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards += rew
            if done:
                env.render()
                obs = env.reset()
                y_s[i, j] = episode_rewards
                break

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if t > 1000:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)  # change to dynamic
                train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))

            # is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
            # if is_solved:
            #     # Show off the result
            #     env.render()
            # else:
            #     # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            #     if t > 1000:
            #         obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)  # change to dynamic
            #         train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))

            # Update target network periodically.
            if t % target_update == 0:
                update_target()
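
# ---------------------------------------------------------------------------
# Illustrative sketch (not the baselines implementation): the quantity that the
# train(obses_t, actions, rewards, obses_tp1, dones, weights) calls above drive
# toward zero is the one-step Q-learning TD error. The function below recomputes
# that error in plain numpy for a batch; `q_values` and `target_q_values` are
# hypothetical callables returning Q(s, .) arrays of shape (batch, n_actions).
# ---------------------------------------------------------------------------
import numpy as np


def td_errors_sketch(q_values, target_q_values, obses_t, actions, rewards,
                     obses_tp1, dones, gamma=0.99):
    """td_error[k] = Q(s_k, a_k) - (r_k + gamma * (1 - done_k) * max_a Q_target(s'_k, a))."""
    q_t = q_values(obses_t)              # (batch, n_actions) for the current states
    q_tp1 = target_q_values(obses_tp1)   # (batch, n_actions) for the next states
    q_t_selected = q_t[np.arange(len(actions)), actions]
    # Terminal transitions bootstrap from 0 rather than from the next state.
    q_tp1_best_masked = (1.0 - np.asarray(dones, dtype=np.float32)) * q_tp1.max(axis=1)
    targets = rewards + gamma * q_tp1_best_masked
    return q_t_selected - targets
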