import queue
from time import sleep


def sampler_worker(config, replay_queue, batch_queue, training_on,
                   global_episode, update_step, log_dir=''):
    """
    Transfers replays from the replay queue into the buffer and batches
    from the buffer into the batch queue.

    Args:
        config: dict of run hyperparameters.
        replay_queue: queue of transitions produced by the agents.
        batch_queue: queue of sampled batches consumed by the learner.
        training_on: shared flag; the worker exits when it drops to 0.
        global_episode: shared episode counter.
        update_step: shared learner step counter, used as the logging step.
        log_dir: directory for the data-structure logs.
    """
    batch_size = config['batch_size']
    logger = Logger(f"{log_dir}/data_struct")

    # Create replay buffer
    replay_buffer = ReplayBuffer(state_dim=config["state_dim"],
                                 action_dim=config["action_dim"],
                                 max_size=config["replay_mem_size"],
                                 save_dir=config["results_path"])

    while training_on.value:
        # (1) Transfer replays to the global buffer
        n = replay_queue.qsize()
        for _ in range(n):
            replay = replay_queue.get()
            replay_buffer.add(*replay)

        # (2) Transfer a batch of replays from the buffer to the batch queue
        if len(replay_buffer) < batch_size:
            continue

        try:
            batch = replay_buffer.sample(batch_size)
            batch_queue.put_nowait(batch)
        except queue.Full:
            # The learner has fallen behind; back off briefly
            sleep(0.1)
            continue

        # Log data structure sizes
        step = update_step.value
        logger.scalar_summary("data_struct/global_episode", global_episode.value, step)
        logger.scalar_summary("data_struct/replay_queue", replay_queue.qsize(), step)
        logger.scalar_summary("data_struct/batch_queue", batch_queue.qsize(), step)
        logger.scalar_summary("data_struct/replay_buffer", len(replay_buffer), step)

    empty_torch_queue(batch_queue)
    print("Stop sampler worker.")
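The worker drains the batch queue before exiting. `empty_torch_queue` is used throughout these workers but not shown; one plausible implementation (a sketch, not necessarily the repository's exact helper) simply pops items until the queue is empty, which avoids a torch.multiprocessing process hanging on exit while shared tensors are still queued:

import queue


def empty_torch_queue(q):
    """Drain a torch.multiprocessing queue so the owning process can exit cleanly."""
    while True:
        try:
            item = q.get_nowait()
            del item
        except queue.Empty:
            break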
def run(self, training_on, batch_queue, update_step):
    while update_step.value < self.num_train_steps:
        try:
            batch = batch_queue.get_nowait()
        except queue.Empty:
            continue

        self._update_step(batch, update_step)
        update_step.value += 1

        if update_step.value % 1000 == 0:
            print("Training step ", update_step.value)

    training_on.value = 0
    empty_torch_queue(self.learner_w_queue)
    print("Exit learner.")
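A minimal sketch of how these workers might be wired together with torch.multiprocessing; the queue capacities and the surrounding train() function are assumptions for illustration, not code from the repository:

import torch.multiprocessing as mp


def train(config):
    replay_queue = mp.Queue(maxsize=64)
    batch_queue = mp.Queue(maxsize=64)
    training_on = mp.Value('i', 1)     # shared stop flag
    global_episode = mp.Value('i', 0)  # shared episode counter
    update_step = mp.Value('i', 0)     # shared learner step counter

    p_sampler = mp.Process(target=sampler_worker,
                           args=(config, replay_queue, batch_queue,
                                 training_on, global_episode, update_step))
    p_sampler.start()
    # ... learner and agent processes are started the same way ...
    p_sampler.join()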
def run(self, training_on, replay_queue, learner_w_queue, update_step):
    # Initialise deque buffer to store experiences for N-step returns
    self.exp_buffer = deque()

    best_reward = -float("inf")
    rewards = []
    while training_on.value:
        episode_reward = 0
        num_steps = 0
        self.local_episode += 1
        self.global_episode.value += 1
        self.exp_buffer.clear()

        if self.local_episode % 100 == 0:
            print(f"Agent: {self.n_agent} episode {self.local_episode}")

        ep_start_time = time.time()
        state = self.env_wrapper.reset()
        self.ou_noise.reset()
        done = False
        while not done:
            action = self.actor.get_action(state)
            if self.agent_type == "exploration":
                action = self.ou_noise.get_action(action, num_steps)
                action = action.squeeze(0)
            else:
                action = action.detach().cpu().numpy().flatten()

            next_state, reward, done = self.env_wrapper.step(action)
            episode_reward += reward

            state = self.env_wrapper.normalise_state(state)
            reward = self.env_wrapper.normalise_reward(reward)

            self.exp_buffer.append((state, action, reward))

            # We need at least N steps in the experience buffer before we can
            # compute Bellman rewards and add an N-step experience to replay memory
            if len(self.exp_buffer) >= self.config['n_step_returns']:
                state_0, action_0, reward_0 = self.exp_buffer.popleft()
                discounted_reward = reward_0
                gamma = self.config['discount_rate']
                for (_, _, r_i) in self.exp_buffer:
                    discounted_reward += r_i * gamma
                    gamma *= self.config['discount_rate']

                # Only exploration agents fill the replay buffer
                if self.agent_type == "exploration":
                    try:
                        replay_queue.put_nowait([
                            state_0, action_0, discounted_reward,
                            next_state, done, gamma
                        ])
                    except queue.Full:
                        pass

            state = next_state

            if done or num_steps == self.max_steps:
                # Flush the rest of the experiences remaining in the buffer
                while len(self.exp_buffer) != 0:
                    state_0, action_0, reward_0 = self.exp_buffer.popleft()
                    discounted_reward = reward_0
                    gamma = self.config['discount_rate']
                    for (_, _, r_i) in self.exp_buffer:
                        discounted_reward += r_i * gamma
                        gamma *= self.config['discount_rate']
                    if self.agent_type == "exploration":
                        try:
                            replay_queue.put_nowait([
                                state_0, action_0, discounted_reward,
                                next_state, done, gamma
                            ])
                        except queue.Full:
                            pass
                break

            num_steps += 1

        # Log metrics
        step = update_step.value
        self.logger.scalar_summary("agent/reward", episode_reward, step)
        self.logger.scalar_summary("agent/episode_timing",
                                   time.time() - ep_start_time, step)

        # Saving agent
        reward_outperformed = (episode_reward - best_reward
                               > self.config["save_reward_threshold"])
        time_to_save = self.local_episode % self.num_episode_save == 0
        if self.n_agent == 0 and (time_to_save or reward_outperformed):
            if episode_reward > best_reward:
                best_reward = episode_reward
            self.save(f"local_episode_{self.local_episode}_reward_{best_reward:.4f}")

        rewards.append(episode_reward)
        if (self.agent_type == "exploration"
                and self.local_episode % self.config['update_agent_ep'] == 0):
            self.update_actor_learner(learner_w_queue, training_on)

    empty_torch_queue(replay_queue)
    print(f"Agent {self.n_agent} done.")
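The inline loop above computes the truncated N-step return R = r_0 + gamma*r_1 + ... + gamma^(N-1)*r_(N-1), together with the bootstrap factor gamma^N that the learner uses to discount the value of next_state in the Bellman target. The same computation, factored into a standalone helper with a worked example (a sketch for clarity; the repository keeps this logic inline):

def n_step_return(rewards, discount_rate):
    """Return (discounted N-step reward, gamma^N) for a list of N rewards."""
    discounted_reward = 0.0
    gamma = 1.0
    for r in rewards:
        discounted_reward += r * gamma
        gamma *= discount_rate
    return discounted_reward, gamma


# Example: three rewards of 1.0 with discount_rate 0.99 give
# 1.0 + 0.99 + 0.9801 = 2.9701, and gamma^3 = 0.970299 for bootstrapping.
r, g = n_step_return([1.0, 1.0, 1.0], 0.99)
print(r, g)  # -> about 2.9701, about 0.970299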
def run(self, training_on, replay_queue, learner_w_queue, update_step):
    # Initialise deque buffer to store experiences for N-step returns
    self.exp_buffer = deque()

    best_reward = -float("inf")
    best_succeeded = 0.0  # best evaluation success rate seen so far
    rewards = []
    while training_on.value:
        episode_reward = 0
        num_steps = 0
        self.local_episode += 1
        self.global_episode.value += 1
        self.exp_buffer.clear()

        if self.local_episode % 100 == 0:
            print(f"Agent: {self.n_agent} episode {self.local_episode}")

        ep_start_time = time.time()
        state = self.env_wrapper.reset()
        current_pos = state[:self.config["action_dim"]]
        self.ou_noise.reset()
        done = False
        while not done:
            action = self.actor.get_action(state)
            if self.agent_type == "exploration":
                action = self.ou_noise.get_action(action, num_steps)
                action = action.squeeze(0)
            else:
                action = action.detach().cpu().numpy().flatten()

            next_a = action
            # if self.config["pos_control"]:
            #     next_a += current_pos

            # Repeat the chosen action for action_repeat environment steps
            for _ in range(self.config["action_repeat"]):
                next_state, reward, done = self.env_wrapper.step(next_a)
                if done:
                    break
            current_pos = next_state[:self.config["action_dim"]]

            episode_reward += reward

            state = self.env_wrapper.normalise_state(state)
            reward = self.env_wrapper.normalise_reward(reward)

            self.exp_buffer.append((state, action, reward))

            # We need at least N steps in the experience buffer before we can
            # compute Bellman rewards and add an N-step experience to replay memory
            if len(self.exp_buffer) >= self.config['n_step_returns']:
                state_0, action_0, reward_0 = self.exp_buffer.popleft()
                discounted_reward = reward_0
                gamma = self.config['discount_rate']
                for (_, _, r_i) in self.exp_buffer:
                    discounted_reward += r_i * gamma
                    gamma *= self.config['discount_rate']

                # Only exploration agents fill the replay buffer
                if self.agent_type == "exploration":
                    try:
                        replay_queue.put_nowait([
                            state_0, action_0, discounted_reward,
                            next_state, done, gamma
                        ])
                    except queue.Full:
                        pass

            state = next_state

            if done or num_steps >= self.max_steps:
                # Flush the rest of the experiences remaining in the buffer
                while len(self.exp_buffer) != 0:
                    state_0, action_0, reward_0 = self.exp_buffer.popleft()
                    discounted_reward = reward_0
                    gamma = self.config['discount_rate']
                    for (_, _, r_i) in self.exp_buffer:
                        discounted_reward += r_i * gamma
                        gamma *= self.config['discount_rate']
                    if self.agent_type == "exploration":
                        try:
                            replay_queue.put_nowait([
                                state_0, action_0, discounted_reward,
                                next_state, done, gamma
                            ])
                        except queue.Full:
                            pass
                break

            num_steps += 1

        # Log metrics
        step = update_step.value
        if self.n_agent == 0:
            self.logger.scalar_summary("agent/reward", episode_reward, step)
            self.logger.scalar_summary("agent/episode_timing",
                                       time.time() - ep_start_time, step)

        rewards.append(episode_reward)
        if (self.agent_type == "exploration"
                and self.local_episode % self.config['update_agent_ep'] == 0):
            self.update_actor_learner(learner_w_queue, training_on)

        # Periodic evaluation with the deterministic (noise-free) policy
        if self.local_episode % 100 == 0:
            print("evaluate")
            avg_reward = 0.
            episodes = 20
            succeeded = 0
            for _ in range(episodes):
                episode_reward = 0
                num_steps = 0
                state = self.env_wrapper.reset()
                current_pos = state[:self.config["action_dim"]]
                self.ou_noise.reset()
                done = False
                while not done:
                    action = self.actor.get_action(state)
                    action = action.detach().cpu().numpy().flatten()
                    next_a = action
                    # if self.config["pos_control"]:
                    #     next_a += current_pos
                    for _ in range(self.config["action_repeat"]):
                        next_state, reward, done = self.env_wrapper.step(next_a)
                        if done:
                            break
                    current_pos = next_state[:self.config["action_dim"]]

                    episode_reward += reward
                    state = self.env_wrapper.normalise_state(state)
                    reward = self.env_wrapper.normalise_reward(reward)
                    state = next_state

                    if done or num_steps >= self.max_steps:
                        # The door counts as opened once its angle exceeds 0.2
                        if abs(self.env_wrapper.env.env.get_doorangle()) >= 0.2:
                            succeeded += 1
                        break
                    num_steps += 1
                avg_reward += episode_reward

            avg_reward /= episodes
            succeeded /= episodes
            if self.n_agent == 0:
                self.logger.scalar_summary("agent/test", avg_reward, step)
                self.logger.scalar_summary("agent/success_rate", succeeded, step)
            print("----------------------------------------")
            print("Test Episodes: {}, Avg. Reward: {}, Success rate: {:.0%}"
                  .format(episodes, round(avg_reward, 2), succeeded))
            print("----------------------------------------")

            # Saving agent
            success_outperformed = (succeeded - best_succeeded
                                    > self.config["save_success_rate_threshold"])
            time_to_save = self.local_episode % self.num_episode_save == 0
            if self.n_agent == 0 and (time_to_save or success_outperformed):
                if succeeded > best_succeeded:
                    best_succeeded = succeeded
                self.save(f"agent-{self.n_agent}_local-episode-{self.local_episode}"
                          f"_step-{step}_success-{best_succeeded:.4f}")

    empty_torch_queue(replay_queue)
    print(f"Agent {self.n_agent} done.")
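Both exploration loops above call `self.ou_noise.get_action(action, num_steps)`, but the noise class itself isn't shown. A minimal sketch of a standard Ornstein-Uhlenbeck process with sigma annealing; the parameter defaults, the action-bound clipping, and the numpy interface are typical choices, not necessarily this repository's:

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, action_dim, low, high, mu=0.0, theta=0.15,
                 max_sigma=0.3, min_sigma=0.1, decay_period=100000):
        self.mu = mu
        self.theta = theta
        self.sigma = max_sigma
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.action_dim = action_dim
        self.low = low
        self.high = high
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def evolve_state(self):
        # Mean-reverting drift plus Gaussian diffusion
        dx = (self.theta * (self.mu - self.state)
              + self.sigma * np.random.randn(self.action_dim))
        self.state += dx
        return self.state

    def get_action(self, action, t=0):
        ou_state = self.evolve_state()
        # Anneal sigma from max_sigma to min_sigma over decay_period steps
        self.sigma = (self.max_sigma - (self.max_sigma - self.min_sigma)
                      * min(1.0, t / self.decay_period))
        return np.clip(action + ou_state, self.low, self.high)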
def sampler_worker(config, replay_queue, batch_queue, replay_priorities_queue,
                   training_on, global_episode, update_step, log_dir=''):
    """
    Transfers replays from the replay queue into the (optionally prioritized)
    buffer and batches from the buffer into the batch queue.

    Args:
        config: dict of run hyperparameters.
        replay_queue: queue of transitions produced by the agents.
        batch_queue: queue of sampled batches consumed by the learner.
        replay_priorities_queue: queue of priority updates from the learner.
        training_on: shared flag; the worker exits when it drops to 0.
        global_episode: shared episode counter.
        update_step: shared learner step counter, used as the logging step.
        log_dir: directory for the data-structure logs.
    """
    batch_size = config['batch_size']
    # logger = Logger(f"{log_dir}/data_struct")

    # Create replay buffer
    replay_buffer = create_replay_buffer(config)

    while training_on.value:
        # (1) Transfer replays to the global buffer
        n = replay_queue.qsize()
        for _ in range(n):
            replay = replay_queue.get()
            replay_buffer.add(*replay)

        # (2) Transfer a batch of replays from the buffer to the batch queue
        if len(replay_buffer) < batch_size:
            continue

        # Apply any priority updates sent back by the learner
        try:
            inds, weights = replay_priorities_queue.get_nowait()
            replay_buffer.update_priorities(inds, weights)
        except queue.Empty:
            pass

        try:
            batch = replay_buffer.sample(batch_size)
            batch_queue.put_nowait(batch)
        except queue.Full:
            # The learner has fallen behind; back off briefly
            sleep(0.1)
            continue

        # Log data structure sizes (disabled)
        # step = update_step.value
        # logger.scalar_summary("data_struct/global_episode", global_episode.value, step)
        # logger.scalar_summary("data_struct/replay_queue", replay_queue.qsize(), step)
        # logger.scalar_summary("data_struct/batch_queue", batch_queue.qsize(), step)
        # logger.scalar_summary("data_struct/replay_buffer", len(replay_buffer), step)

        if config['save_buffer_on_disk']:
            replay_buffer.dump(config["results_path"])

    empty_torch_queue(batch_queue)
    print("Stop sampler worker.")
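`create_replay_buffer` is referenced above but not shown. A plausible factory, sketched under the assumption that a `PrioritizedReplayBuffer` class and the config keys `replay_memory_prioritized` and `priority_alpha` exist (all three are illustrative names, not confirmed by the excerpts):

def create_replay_buffer(config):
    """Select a prioritized or uniform replay buffer based on the run config."""
    if config.get('replay_memory_prioritized', False):
        # The prioritized variant must expose update_priorities(inds, weights),
        # as called by the sampler worker above.
        return PrioritizedReplayBuffer(max_size=config["replay_mem_size"],
                                       alpha=config.get("priority_alpha", 0.6))
    return ReplayBuffer(state_dim=config["state_dim"],
                        action_dim=config["action_dim"],
                        max_size=config["replay_mem_size"],
                        save_dir=config["results_path"])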