Example No. 1
def __init__(self, args, env):
    self.noise = args.noise_eps
    self.epsilon = args.epsilon
    self.env = env
    self.agent = Agent(args)
    self.her_module = HerSampler(args.replay_strategy, args.replay_k,
                                 env.compute_reward)
    self.buffer = Buffer(args, self.her_module.sample_her_transitions)
    self.worker = RolloutWorker(self.env, self.agent, args)
    self.args = args
Example No. 2
def __init__(self, args, env):
    self.args = args
    self.noise = args.noise_rate
    self.epsilon = args.epsilon
    self.episode_limit = args.max_episode_len
    self.env = env
    self.agents = self._init_agents()
    self.buffer = Buffer(args)
    self.save_path = self.args.save_dir + '/' + self.args.scenario_name
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
Example No. 3
class Runner:
    def __init__(self, args, env):
        self.noise = args.noise_eps
        self.epsilon = args.epsilon
        self.env = env
        self.agent = Agent(args)
        self.her_module = HerSampler(args.replay_strategy, args.replay_k,
                                     env.compute_reward)
        self.buffer = Buffer(args, self.her_module.sample_her_transitions)
        self.worker = RolloutWorker(self.env, self.agent, args)
        self.args = args

    def run(self):
        success_rates = []
        for epoch in tqdm(range(self.args.n_epochs)):
            for episode_idx in range(self.args.n_cycles):
                episode = self.worker.generate_episode(self.noise,
                                                       self.epsilon)
                # convert the episode's 2D arrays into 3D (batch-major) form
                episode_batch = convert_episode_to_batch_major(episode)
                self.buffer.store_episode(episode_batch)
                episode_batch['o_next'] = episode_batch['o'][:, 1:]
                episode_batch['ag_next'] = episode_batch['ag'][:, 1:]
                transitions = self.her_module.sample_her_transitions(
                    episode_batch, self.args.episode_limit)

                # update the normalizer
                self.agent.update_normalizer(transitions)

            for _ in range(self.args.n_batches):
                transitions = self.buffer.sample(self.args.batch_size)
                self.agent.learn(transitions)
            # self.noise = max(0, self.noise - 0.001)
            # self.epsilon = max(0.05, self.epsilon - 0.001)
            if len(success_rates) > 0 and success_rates[-1] > 0.5:
                success_rate = self.worker.evaluate(render=True)
            else:
                success_rate = self.worker.evaluate()
            success_rates.append(success_rate)
        save_path = self.args.save_dir + '/' + self.args.env_name
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        plt.figure()
        plt.plot(range(self.args.n_epochs), success_rates)
        plt.xlabel('epoch')
        plt.ylabel('success_rate')
        plt.savefig(save_path + '/plt.png', format='png')
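
A minimal driver for the Runner in Example No. 3 might look like the sketch below. It is an assumption, not part of the original example: get_args is a hypothetical argparse wrapper standing in for whatever defines the fields the class reads (n_epochs, n_cycles, n_batches, batch_size, episode_limit, noise_eps, epsilon, replay_strategy, replay_k, env_name, save_dir), and the environment is assumed to be a goal-based gym task exposing compute_reward.

import gym

from arguments import get_args  # hypothetical argparse wrapper, not part of the example

if __name__ == '__main__':
    args = get_args()              # assumed to define n_epochs, n_cycles, env_name, ...
    env = gym.make(args.env_name)  # assumed goal-based env exposing compute_reward
    runner = Runner(args, env)
    runner.run()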
Example No. 4
class Runner:
    def __init__(self, args, env):
        self.args = args
        self.noise = args.noise_rate
        self.epsilon = args.epsilon
        self.episode_limit = args.max_episode_len
        self.env = env
        self.agents = self._init_agents()
        self.buffer = Buffer(args)
        self.save_path = self.args.save_dir + '/' + self.args.scenario_name
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def _init_agents(self):
        agents = []
        for i in range(self.args.n_agents):
            agent = Agent(i, self.args)
            agents.append(agent)
        return agents

    def run(self):
        returns = []
        for time_step in tqdm(range(self.args.time_steps)):
            # reset the environment
            if time_step % self.episode_limit == 0:
                s = self.env.reset()
            u = []
            actions = []
            with torch.no_grad():
                for agent_id, agent in enumerate(self.agents):
                    action = agent.select_action(s[agent_id], self.noise, self.epsilon)
                    u.append(action)
                    actions.append(action)
            for i in range(self.args.n_agents, self.args.n_players):
                actions.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0])
            s_next, r, done, info = self.env.step(actions)
            self.buffer.store_episode(s[:self.args.n_agents], u, r[:self.args.n_agents], s_next[:self.args.n_agents])
            s = s_next
            if self.buffer.current_size >= self.args.batch_size:
                transitions = self.buffer.sample(self.args.batch_size)
                for agent in self.agents:
                    other_agents = self.agents.copy()
                    other_agents.remove(agent)
                    agent.learn(transitions, other_agents)

            if time_step > 0 and time_step % self.args.evaluate_rate == 0:
                returns.append(self.evaluate())
                plt.figure()
                plt.plot(range(len(returns)), returns)
                plt.xlabel('episode * ' + str(self.args.evaluate_rate / self.episode_limit))
                plt.ylabel('average returns')
                plt.savefig(self.save_path + '/plt.png', format='png')
            self.noise = max(0.05, self.noise - 0.0000005)
            self.epsilon = max(0.05, self.epsilon - 0.0000005)
            np.save(self.save_path + '/returns.pkl', returns)

    def evaluate(self):
        returns = []
        for episode in range(self.args.evaluate_episodes):
            # reset the environment
            s = self.env.reset()
            rewards = 0
            for time_step in range(self.args.evaluate_episode_len):
                self.env.render(mode='other')
                actions = []
                with torch.no_grad():
                    for agent_id, agent in enumerate(self.agents):
                        action = agent.select_action(s[agent_id], 0, 0)
                        actions.append(action)
                for i in range(self.args.n_agents, self.args.n_players):
                    actions.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0])
                s_next, r, done, info = self.env.step(actions)
                rewards += r[0]
                s = s_next
            returns.append(rewards)
            print('Returns is', rewards)
        return sum(returns) / self.args.evaluate_episodes
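
A possible entry point for the multi-agent Runner in Example No. 4 is sketched below, for illustration only. Both helpers are assumptions: get_args stands in for an argparse wrapper providing the fields read above (time_steps, max_episode_len, n_agents, n_players, noise_rate, epsilon, batch_size, evaluate_rate, evaluate_episodes, evaluate_episode_len, save_dir, scenario_name), and make_env for a multiagent-particle-envs style environment constructor.

# Hypothetical entry point; get_args and make_env are assumptions here,
# standing in for whatever the surrounding project provides.
from common.arguments import get_args   # hypothetical
from common.utils import make_env       # hypothetical

if __name__ == '__main__':
    args = get_args()
    env, args = make_env(args)   # assumed to also fill observation/action shapes
    runner = Runner(args, env)
    runner.run()                 # or runner.evaluate() for evaluation only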
Example No. 5
class Runner:
    def __init__(self, args, env):
        self.args = args
        self.noise = args.noise_rate
        self.epsilon = args.epsilon
        self.episode_limit = args.max_episode_len
        self.env = env
        self.agents = self._init_agents()
        self.buffer = Buffer(args)
        self.save_path = self.args.save_dir
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def _init_agents(self):
        agents = []
        for i in range(self.args.n_banks):
            agent = Agent(i, self.args)
            agents.append(agent)
        return agents

    def run(self):
        returns = []
        average_net_position = float('-inf')

        # Run this loop repeatedly
        for time_step in tqdm(range(self.args.time_steps)):
            
            # reset the environment and get the first sample
            state, _ = self.env.reset(evaluate=False)

            u = []
            actions = []

            with torch.no_grad():
                # For each agent
                for agent_id, agent in enumerate(self.agents):
                    # select an action
                    action = agent.select_action(state[agent_id], self.noise, self.epsilon)
                    # store the action 
                    u.append(action)
                    actions.append(action)

            # Take the next action; retrieve next state, reward, done, and additional information
            next_state, reward, done, info = self.env.step(actions)

            # Store the episode in the replay buffer
            self.buffer.store_episode(state[:self.args.n_banks], u, reward[:self.args.n_banks], next_state[:self.args.n_banks])

            # Update the state
            state = next_state

            # If there are enough samples in the buffer
            if self.buffer.current_size >= self.args.batch_size:

                # Get a sample from the buffer of (s,a,r,s')
                transitions = self.buffer.sample(self.args.batch_size)

                # Train each agent
                for agent in self.agents:

                    # Get a list of the other agents
                    other_agents = self.agents.copy()
                    other_agents.remove(agent)

                    # Train the current agent on the world transitions
                    agent.learn(transitions, other_agents)

            # Evaluate the learning
            if time_step > 0 and time_step % self.args.evaluate_rate == 0:
                print(f'Timestep {time_step}: Conducting an evaluation:')
                average_net_position = self.evaluate(self.args)
                returns.append(average_net_position)

            # Decay the exploration noise and epsilon toward their floor
            self.noise = max(0.05, self.noise - 0.0000005)
            self.epsilon = max(0.05, self.epsilon - 0.0000005)

            # Save the returns
            np.save(f'{self.save_path}/returns.pkl', returns)


    def evaluate(self, args=None):
        # Fall back to the runner's own args if none are passed in
        if args is None:
            args = self.args

        # Allocate lists to store results from info
        initial_net_positions = []
        net_positions = []
        system_configurations = []


        for episode in range(self.args.evaluate_episodes):

            # reset the environment
            s, info = self.env.reset(evaluate=True)

            system_configurations.append({'initial_configurations':copy.deepcopy(info)})
            initial_net_positions.append(info['net_position'])

            # Obtain the results for a series of trainings
            for time_step in range(self.args.evaluate_episode_len):

                actions = []
                
                # Disable gradient tracking while selecting actions
                with torch.no_grad():
                    for agent_id, agent in enumerate(self.agents):
                        # Select the action for the given agent
                        action = agent.select_action(s[agent_id], 0, 0)
                        actions.append(action)
                
                # Establish a baseline by doing nothing
                if self.args.do_nothing:
                    actions = np.zeros((self.args.n_banks, self.args.n_banks))

                # Take the next action
                s_next, rewards, done, info = self.env.step(actions)

                # Update the state
                s = s_next

                # Store the action taken
                system_configurations[-1]['action'] = copy.deepcopy(actions)
                
            # Store the cumulative rewards
            net_positions.append(info['net_position'])

            system_configurations[-1]['final_configurations'] = copy.deepcopy(info)

        print(f'Average starting net position: {np.mean(initial_net_positions)}')
        print(f'Average ending net position: {np.mean(net_positions)}')

        save_path = f"./data/{args.reward_type}/disable-default-actions-{args.disable_default_actions}"
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        np.save(f"{save_path}/evaluation-data", system_configurations)

        return np.mean(net_positions)
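
The two decay lines near the end of run() in Examples No. 4 and 5 implement a simple linear anneal of the exploration parameters, lowering each value by 0.0000005 per environment step down to a floor of 0.05. A self-contained sketch of that schedule (the initial values and step count are illustrative, not taken from the examples):

def linear_decay(value, step_size=5e-7, floor=0.05):
    # Decrease `value` by `step_size` per step, never dropping below `floor`.
    return max(floor, value - step_size)

noise, epsilon = 0.1, 0.1
for _ in range(2_000_000):
    noise = linear_decay(noise)
    epsilon = linear_decay(epsilon)
print(noise, epsilon)  # both parameters sit at the 0.05 floor after enough steps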