import matplotlib.pyplot as plt
from tqdm import tqdm

# Agent, HerSampler, Buffer, RolloutWorker and convert_episode_to_batch_major
# are project-local imports.


class Runner:
    def __init__(self, args, env):
        self.noise = args.noise_eps
        self.epsilon = args.epsilon
        self.env = env
        self.agent = Agent(args)
        self.her_module = HerSampler(args.replay_strategy, args.replay_k,
                                     env.compute_reward)
        self.buffer = Buffer(args, self.her_module.sample_her_transitions)
        self.worker = RolloutWorker(self.env, self.agent, args)
        self.args = args

    def run(self):
        success_rates = []
        for epoch in tqdm(range(self.args.n_epochs)):
            for episode_idx in range(self.args.n_cycles):
                episode = self.worker.generate_episode(self.noise, self.epsilon)
                # reshape the episode's 2-D arrays into 3-D, batch-major arrays
                episode_batch = convert_episode_to_batch_major(episode)
                self.buffer.store_episode(episode_batch)
                episode_batch['o_next'], episode_batch['ag_next'] = \
                    episode_batch['o'][:, 1:], episode_batch['ag'][:, 1:]
                transitions = self.her_module.sample_her_transitions(
                    episode_batch, self.args.episode_limit)
                # update the normalizer
                self.agent.update_normalizer(transitions)
                for _ in range(self.args.n_batches):
                    transitions = self.buffer.sample(self.args.batch_size)
                    self.agent.learn(transitions)
                # self.noise = max(0, self.noise - 0.001)
                # self.epsilon = max(0.05, self.epsilon - 0.001)
            # render evaluation rollouts once the policy succeeds more than
            # half the time
            if len(success_rates) > 0 and success_rates[-1] > 0.5:
                success_rate = self.worker.evaluate(render=True)
            else:
                success_rate = self.worker.evaluate()
            success_rates.append(success_rate)
            save_path = self.args.save_dir + '/' + self.args.env_name
            plt.figure()
            # plot only the epochs evaluated so far, so x and y lengths match
            plt.plot(range(len(success_rates)), success_rates)
            plt.xlabel('epoch')
            plt.ylabel('success_rate')
            plt.savefig(save_path + '/plt.png', format='png')
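# A minimal launch sketch for the HER/DDPG Runner above, not part of the
# original source: the argument names mirror the fields the class reads, while
# get_args(), the default values, and the 'FetchReach-v1' environment are
# illustrative assumptions.
import argparse
import gym


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env-name', type=str, default='FetchReach-v1')
    parser.add_argument('--n-epochs', type=int, default=50)
    parser.add_argument('--n-cycles', type=int, default=50)
    parser.add_argument('--n-batches', type=int, default=40)
    parser.add_argument('--batch-size', type=int, default=256)
    parser.add_argument('--noise-eps', type=float, default=0.2)
    parser.add_argument('--epsilon', type=float, default=0.3)
    parser.add_argument('--replay-strategy', type=str, default='future')
    parser.add_argument('--replay-k', type=int, default=4)
    parser.add_argument('--episode-limit', type=int, default=50)
    parser.add_argument('--save-dir', type=str, default='./saved_models')
    return parser.parse_args()


if __name__ == '__main__':
    args = get_args()
    env = gym.make(args.env_name)
    runner = Runner(args, env)
    runner.run()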
import os

import matplotlib.pyplot as plt
import numpy as np
import torch
from tqdm import tqdm

# Agent and Buffer are project-local imports.


class Runner:
    def __init__(self, args, env):
        self.args = args
        self.noise = args.noise_rate
        self.epsilon = args.epsilon
        self.episode_limit = args.max_episode_len
        self.env = env
        self.agents = self._init_agents()
        self.buffer = Buffer(args)
        self.save_path = self.args.save_dir + '/' + self.args.scenario_name
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def _init_agents(self):
        agents = []
        for i in range(self.args.n_agents):
            agent = Agent(i, self.args)
            agents.append(agent)
        return agents

    def run(self):
        returns = []
        for time_step in tqdm(range(self.args.time_steps)):
            # reset the environment at the start of each episode
            if time_step % self.episode_limit == 0:
                s = self.env.reset()
            u = []
            actions = []
            with torch.no_grad():
                for agent_id, agent in enumerate(self.agents):
                    action = agent.select_action(s[agent_id], self.noise,
                                                 self.epsilon)
                    u.append(action)
                    actions.append(action)
            # the remaining players (not controlled by learners) act randomly
            for i in range(self.args.n_agents, self.args.n_players):
                actions.append([0, np.random.rand() * 2 - 1, 0,
                                np.random.rand() * 2 - 1, 0])
            s_next, r, done, info = self.env.step(actions)
            self.buffer.store_episode(s[:self.args.n_agents], u,
                                      r[:self.args.n_agents],
                                      s_next[:self.args.n_agents])
            s = s_next
            if self.buffer.current_size >= self.args.batch_size:
                transitions = self.buffer.sample(self.args.batch_size)
                # train each agent against the current policies of the others
                for agent in self.agents:
                    other_agents = self.agents.copy()
                    other_agents.remove(agent)
                    agent.learn(transitions, other_agents)
            if time_step > 0 and time_step % self.args.evaluate_rate == 0:
                returns.append(self.evaluate())
                plt.figure()
                plt.plot(range(len(returns)), returns)
                plt.xlabel('episode * ' +
                           str(self.args.evaluate_rate / self.episode_limit))
                plt.ylabel('average returns')
                plt.savefig(self.save_path + '/plt.png', format='png')
            # decay the exploration noise and epsilon
            self.noise = max(0.05, self.noise - 0.0000005)
            self.epsilon = max(0.05, self.epsilon - 0.0000005)
        # note: np.save appends '.npy', so this writes returns.pkl.npy
        np.save(self.save_path + '/returns.pkl', returns)

    def evaluate(self):
        returns = []
        for episode in range(self.args.evaluate_episodes):
            # reset the environment
            s = self.env.reset()
            rewards = 0
            for time_step in range(self.args.evaluate_episode_len):
                self.env.render(mode='other')
                actions = []
                with torch.no_grad():
                    for agent_id, agent in enumerate(self.agents):
                        # act greedily: no exploration noise during evaluation
                        action = agent.select_action(s[agent_id], 0, 0)
                        actions.append(action)
                for i in range(self.args.n_agents, self.args.n_players):
                    actions.append([0, np.random.rand() * 2 - 1, 0,
                                    np.random.rand() * 2 - 1, 0])
                s_next, r, done, info = self.env.step(actions)
                rewards += r[0]
                s = s_next
            returns.append(rewards)
            print('Returns is', rewards)
        return sum(returns) / self.args.evaluate_episodes
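# A minimal replay-buffer sketch matching the interface the Runner above
# assumes: store_episode(), sample(), and a current_size attribute. The
# args.buffer_size field and the per-agent key names ('o_0', 'u_0', 'r_0',
# 'o_next_0', ...) are assumptions about what Agent.learn expects, not taken
# from the source.
import numpy as np


class Buffer:
    def __init__(self, args):
        self.size = args.buffer_size
        self.current_size = 0
        self._next_idx = 0
        self._storage = []

    def store_episode(self, o, u, r, o_next):
        # overwrite the oldest transition once the ring buffer is full
        transition = (o, u, r, o_next)
        if self._next_idx >= len(self._storage):
            self._storage.append(transition)
        else:
            self._storage[self._next_idx] = transition
        self._next_idx = (self._next_idx + 1) % self.size
        self.current_size = min(self.current_size + 1, self.size)

    def sample(self, batch_size):
        # sample transitions uniformly and regroup them per agent
        idxs = np.random.randint(0, self.current_size, batch_size)
        n_agents = len(self._storage[0][0])
        transitions = {}
        for a in range(n_agents):
            transitions['o_%d' % a] = np.array([self._storage[i][0][a] for i in idxs])
            transitions['u_%d' % a] = np.array([self._storage[i][1][a] for i in idxs])
            transitions['r_%d' % a] = np.array([self._storage[i][2][a] for i in idxs])
            transitions['o_next_%d' % a] = np.array([self._storage[i][3][a] for i in idxs])
        return transitions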
import copy
import os

import numpy as np
import torch
from tqdm import tqdm

# Agent and Buffer are project-local imports.


class Runner:
    def __init__(self, args, env):
        self.args = args
        self.noise = args.noise_rate
        self.epsilon = args.epsilon
        self.episode_limit = args.max_episode_len
        self.env = env
        self.agents = self._init_agents()
        self.buffer = Buffer(args)
        self.save_path = self.args.save_dir
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def _init_agents(self):
        agents = []
        for i in range(self.args.n_banks):
            agent = Agent(i, self.args)
            agents.append(agent)
        return agents

    def run(self):
        returns = []
        average_net_position = float('-inf')
        # Main training loop
        for time_step in tqdm(range(self.args.time_steps)):
            # Reset the environment and get the initial state
            state, _ = self.env.reset(evaluate=False)
            u = []
            actions = []
            with torch.no_grad():
                # Each agent selects an action
                for agent_id, agent in enumerate(self.agents):
                    action = agent.select_action(state[agent_id], self.noise,
                                                 self.epsilon)
                    # Store the action
                    u.append(action)
                    actions.append(action)
            # Step the environment; retrieve next state, reward, done flag,
            # and additional info
            next_state, reward, done, info = self.env.step(actions)
            # Store the transition in the replay buffer
            self.buffer.store_episode(state[:self.args.n_banks], u,
                                      reward[:self.args.n_banks],
                                      next_state[:self.args.n_banks])
            # Update the state
            state = next_state
            # Once the buffer holds enough samples, train
            if self.buffer.current_size >= self.args.batch_size:
                # Sample a batch of (s, a, r, s') transitions
                transitions = self.buffer.sample(self.args.batch_size)
                # Train each agent against the current policies of the others
                for agent in self.agents:
                    other_agents = self.agents.copy()
                    other_agents.remove(agent)
                    agent.learn(transitions, other_agents)
            # Periodically evaluate the learned policies
            if time_step > 0 and time_step % self.args.evaluate_rate == 0:
                print(f'Timestep {time_step}: Conducting an evaluation:')
                average_net_position = self.evaluate(self.args)
                returns.append(average_net_position)
            # Decay the exploration noise and epsilon
            self.noise = max(0.05, self.noise - 0.0000005)
            self.epsilon = max(0.05, self.epsilon - 0.0000005)
        # Save the returns (np.save appends '.npy' to the file name)
        np.save(f'{self.save_path}/returns.pkl', returns)

    def evaluate(self, args=None):
        if args is None:
            args = self.args
        # Lists to collect the results reported through `info`
        initial_net_positions = []
        net_positions = []
        system_configurations = []
        for episode in range(self.args.evaluate_episodes):
            # Reset the environment in evaluation mode
            s, info = self.env.reset(evaluate=True)
            system_configurations.append(
                {'initial_configurations': copy.deepcopy(info)})
            initial_net_positions.append(info['net_position'])
            # Roll the learned policies out for a fixed number of steps
            for time_step in range(self.args.evaluate_episode_len):
                actions = []
                # Disable gradient tracking; actions are selected greedily
                # (no exploration noise) during evaluation
                with torch.no_grad():
                    for agent_id, agent in enumerate(self.agents):
                        action = agent.select_action(s[agent_id], 0, 0)
                        actions.append(action)
                # Establish a baseline by doing nothing
                if self.args.do_nothing:
                    actions = np.zeros((self.args.n_banks, self.args.n_banks))
                # Take the next action
                s_next, rewards, done, info = self.env.step(actions)
                # Update the state
                s = s_next
            # Record the last action taken
            system_configurations[-1]['action'] = copy.deepcopy(actions)
            # Record the episode's final net position
            net_positions.append(info['net_position'])
            system_configurations[-1]['final_configurations'] = copy.deepcopy(info)
        print(f'Average starting net position: {np.mean(initial_net_positions)}')
        print(f'Average ending net position: {np.mean(net_positions)}')
        save_path = (f"./data/{args.reward_type}/"
                     f"disable-default-actions-{args.disable_default_actions}")
        if not os.path.exists(save_path):
            # makedirs creates the nested ./data/... directories as needed
            os.makedirs(save_path)
        np.save(f"{save_path}/evaluation-data", system_configurations)
        return np.mean(net_positions)
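# A small post-hoc analysis sketch, not part of the original source: reading
# back the artifacts evaluate() writes. np.save pickles the list of dicts and
# appends '.npy' to both file names, so allow_pickle=True is required on load.
# The concrete paths ('./saved_models', reward_type 'net_position',
# disable_default_actions False) are hypothetical placeholders.
import numpy as np

returns = np.load('./saved_models/returns.pkl.npy', allow_pickle=True)
data = np.load('./data/net_position/disable-default-actions-False/'
               'evaluation-data.npy', allow_pickle=True)
for episode in data:
    # each entry holds the info dicts captured at reset and after the rollout
    start = episode['initial_configurations']['net_position']
    end = episode['final_configurations']['net_position']
    print(f'net position: {start:.3f} -> {end:.3f}')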