Example #1
    def __init__(self,
                 config,
                 policy,
                 global_episode,
                 n_agent=0,
                 agent_type='exploration',
                 log_dir=''):
        print(f"Initializing agent {n_agent}...")
        self.config = config
        self.n_agent = n_agent
        self.agent_type = agent_type
        self.max_steps = config['max_ep_length']
        self.num_episode_save = config['num_episode_save']
        self.global_episode = global_episode
        self.local_episode = 0
        self.log_dir = log_dir

        # Create environment
        self.env_wrapper = create_env_wrapper(config)
        self.ou_noise = OUNoise(dim=config["action_dim"],
                                low=config["action_low"],
                                high=config["action_high"])
        self.ou_noise.reset()

        self.actor = policy
        print("Agent ", n_agent, self.actor.device)

        # Logger
        log_path = f"{log_dir}/agent-{n_agent}"
        self.logger = Logger(log_path)
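
The OUNoise helper used throughout these examples belongs to the project itself; as a rough reference, a minimal Ornstein-Uhlenbeck noise process with the same interface (OUNoise(dim, low, high), reset(), get_action(action, step)) could look like the sketch below. The internal parameters (mu, theta, the sigma annealing schedule) are assumptions for illustration, and the sketch assumes NumPy actions.

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck action noise (illustrative sketch, not the project's implementation)."""

    def __init__(self, dim, low, high, mu=0.0, theta=0.15,
                 max_sigma=0.3, min_sigma=0.05, decay_period=100_000):
        self.dim, self.low, self.high = dim, low, high
        self.mu, self.theta = mu, theta
        self.max_sigma, self.min_sigma = max_sigma, min_sigma
        self.decay_period = decay_period
        self.reset()

    def reset(self):
        # Restart the process at its mean with the initial noise scale
        self.state = np.ones(self.dim) * self.mu
        self.sigma = self.max_sigma

    def _evolve(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.dim)
        self.state = self.state + dx
        return self.state

    def get_action(self, action, step=0):
        # Anneal sigma over time and clip the noisy action to the valid range
        frac = min(1.0, step / self.decay_period)
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * frac
        return np.clip(action + self._evolve(), self.low, self.high)
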
Example #2
    def __init__(self,
                 config,
                 policy,
                 target_policy,
                 learner_w_queue,
                 log_dir=''):
        """
        Args:
            config (dict): configuration
        """
        self.config = config
        hidden_dim = config['dense_size']
        value_lr = config['critic_learning_rate']
        policy_lr = config['actor_learning_rate']
        state_dim = config['state_dim']
        action_dim = config['action_dim']
        self.num_train_steps = config['num_steps_train']
        self.device = config['device']
        self.max_steps = config['max_ep_length']
        self.frame_idx = 0
        self.batch_size = config['batch_size']
        self.gamma = config['discount_rate']
        self.tau = config['tau']
        self.log_dir = log_dir
        self.logger = Logger(f"{log_dir}/learner")
        self.learner_w_queue = learner_w_queue

        # Noise process
        self.ou_noise = OUNoise(dim=config["action_dim"],
                                low=config["action_low"],
                                high=config["action_high"])

        # Value and policy nets
        self.value_net = ValueNetwork(state_dim,
                                      action_dim,
                                      hidden_dim,
                                      device=self.device)
        self.policy_net = policy  #PolicyNetwork(state_dim, action_dim, hidden_dim, device=self.device)
        self.target_value_net = copy.deepcopy(self.value_net)
        self.target_policy_net = target_policy  #copy.deepcopy(self.policy_net)

        self.value_optimizer = optim.Adam(self.value_net.parameters(),
                                          lr=value_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        self.value_criterion = nn.MSELoss(reduction='none')
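
Example #2 reads tau from the config and creates the target networks with deepcopy, but the target update itself is outside the snippet. In DDPG/D4PG-style learners tau is typically used for a Polyak (soft) target update after each training step; a minimal sketch under that assumption (the method name _soft_update is hypothetical, not taken from the example):

    def _soft_update(self, target_net, source_net):
        # target <- tau * source + (1 - tau) * target (Polyak averaging)
        for target_param, param in zip(target_net.parameters(),
                                       source_net.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1.0 - self.tau) * target_param.data)

    # Called after each optimisation step, e.g.:
    # self._soft_update(self.target_value_net, self.value_net)
    # self._soft_update(self.target_policy_net, self.policy_net)
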
Example #3
    def __init__(self, config, policy_net, target_policy_net, learner_w_queue, log_dir=''):
        hidden_dim = config['dense_size']
        state_dim = config['state_dim']
        action_dim = config['action_dim']
        value_lr = config['critic_learning_rate']
        policy_lr = config['actor_learning_rate']
        self.v_min = config['v_min']
        self.v_max = config['v_max']
        self.num_atoms = config['num_atoms']
        self.device = config['device']
        self.max_steps = config['max_ep_length']
        self.num_train_steps = config['num_steps_train']
        self.batch_size = config['batch_size']
        self.tau = config['tau']
        self.gamma = config['discount_rate']
        self.log_dir = log_dir
        self.prioritized_replay = config['replay_memory_prioritized']
        self.learner_w_queue = learner_w_queue
        self.delta_z = (self.v_max - self.v_min) / (self.num_atoms - 1)

        # self.logger = Logger(f"{log_dir}/learner")

        # Noise process
        self.ou_noise = OUNoise(dim=config["action_dim"], low=config["action_low"], high=config["action_high"])

        # Value and policy nets
        self.value_net = ValueNetwork(state_dim, action_dim, hidden_dim, self.v_min, self.v_max, self.num_atoms, device=self.device)
        self.policy_net = policy_net
        self.target_value_net = ValueNetwork(state_dim, action_dim, hidden_dim, self.v_min, self.v_max, self.num_atoms, device=self.device)
        self.target_policy_net = target_policy_net

        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.target_policy_net.parameters(), self.policy_net.parameters()):
            target_param.data.copy_(param.data)

        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=value_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        self.value_criterion = nn.BCELoss(reduction='none')
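
The delta_z computed in Example #3 is the spacing of the fixed support on which the distributional (C51/D4PG-style) critic places probability mass. A small illustration with assumed values for v_min, v_max and num_atoms (not taken from the project's config):

import numpy as np

v_min, v_max, num_atoms = -10.0, 10.0, 51
delta_z = (v_max - v_min) / (num_atoms - 1)   # 0.4
z = np.linspace(v_min, v_max, num_atoms)      # support atoms z_i = v_min + i * delta_z
# The projected Bellman target distributes probability mass over these atoms;
# delta_z is what the categorical projection divides by when splitting mass
# between neighbouring atoms.
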
Example #4
class Agent(object):
    def __init__(self,
                 config,
                 policy,
                 global_episode,
                 n_agent=0,
                 agent_type='exploration',
                 log_dir=''):
        print(f"Initializing agent {n_agent}...")
        self.config = config
        self.n_agent = n_agent
        self.agent_type = agent_type
        self.max_steps = config['max_ep_length']
        self.num_episode_save = config['num_episode_save']
        self.global_episode = global_episode
        self.local_episode = 0
        self.log_dir = log_dir

        # Create environment
        self.env_wrapper = create_env_wrapper(config)
        self.ou_noise = OUNoise(dim=config["action_dim"],
                                low=config["action_low"],
                                high=config["action_high"])
        self.ou_noise.reset()

        self.actor = policy
        print("Agent ", n_agent, self.actor.device)

        # Logger
        log_path = f"{log_dir}/agent-{n_agent}"
        self.logger = Logger(log_path)

    def update_actor_learner(self, learner_w_queue, training_on):
        """Update local actor to the actor from learner. """
        if not training_on.value:
            return
        try:
            source = learner_w_queue.get_nowait()
        except:
            return
        target = self.actor
        for target_param, source_param in zip(target.parameters(), source):
            w = torch.tensor(source_param).float()
            target_param.data.copy_(w)
        del source

    def run(self, training_on, replay_queue, learner_w_queue, update_step):
        # Initialise deque buffer to store experiences for N-step returns
        self.exp_buffer = deque()

        best_reward = -float("inf")
        rewards = []
        while training_on.value:
            episode_reward = 0
            num_steps = 0
            self.local_episode += 1
            self.global_episode.value += 1
            self.exp_buffer.clear()

            if self.local_episode % 100 == 0:
                print(f"Agent: {self.n_agent}  episode {self.local_episode}")

            ep_start_time = time.time()
            state = self.env_wrapper.reset()
            self.ou_noise.reset()
            done = False
            while not done:
                action = self.actor.get_action(state)
                if self.agent_type == "exploration":
                    action = self.ou_noise.get_action(action, num_steps)
                    action = action.squeeze(0)
                else:
                    action = action.detach().cpu().numpy().flatten()
                next_state, reward, done = self.env_wrapper.step(action)

                episode_reward += reward

                state = self.env_wrapper.normalise_state(state)
                reward = self.env_wrapper.normalise_reward(reward)

                self.exp_buffer.append((state, action, reward))

                # We need at least N steps in the experience buffer before we can compute Bellman
                # rewards and add an N-step experience to replay memory
                if len(self.exp_buffer) >= self.config['n_step_returns']:
                    state_0, action_0, reward_0 = self.exp_buffer.popleft()
                    discounted_reward = reward_0
                    gamma = self.config['discount_rate']
                    for (_, _, r_i) in self.exp_buffer:
                        discounted_reward += r_i * gamma
                        gamma *= self.config['discount_rate']
                    # We only want to fill the replay buffer with experience
                    # collected by the exploration agent
                    if self.agent_type == "exploration":
                        try:
                            replay_queue.put_nowait([
                                state_0, action_0, discounted_reward,
                                next_state, done, gamma
                            ])
                        except:  # replay queue is full, drop this sample
                            pass

                state = next_state

                if done or num_steps == self.max_steps:
                    # add rest of experiences remaining in buffer
                    while len(self.exp_buffer) != 0:
                        state_0, action_0, reward_0 = self.exp_buffer.popleft()
                        discounted_reward = reward_0
                        gamma = self.config['discount_rate']
                        for (_, _, r_i) in self.exp_buffer:
                            discounted_reward += r_i * gamma
                            gamma *= self.config['discount_rate']
                        if self.agent_type == "exploration":
                            try:
                                replay_queue.put_nowait([
                                    state_0, action_0, discounted_reward,
                                    next_state, done, gamma
                                ])
                            except:  # replay queue is full, drop this sample
                                pass
                    break

                num_steps += 1

            # Log metrics
            step = update_step.value
            self.logger.scalar_summary("agent/reward", episode_reward, step)
            self.logger.scalar_summary("agent/episode_timing",
                                       time.time() - ep_start_time, step)

            # Saving agent
            reward_outperformed = episode_reward - best_reward > self.config[
                "save_reward_threshold"]
            time_to_save = self.local_episode % self.num_episode_save == 0
            if self.n_agent == 0 and (time_to_save or reward_outperformed):
                if episode_reward > best_reward:
                    best_reward = episode_reward
                self.save(
                    f"local_episode_{self.local_episode}_reward_{best_reward:4f}"
                )

            rewards.append(episode_reward)
            if self.agent_type == "exploration" and self.local_episode % self.config[
                    'update_agent_ep'] == 0:
                self.update_actor_learner(learner_w_queue, training_on)

        empty_torch_queue(replay_queue)
        print(f"Agent {self.n_agent} done.")

    def save(self, checkpoint_name):
        process_dir = f"{self.log_dir}/agent_{self.n_agent}"
        if not os.path.exists(process_dir):
            os.makedirs(process_dir)
        model_fn = f"{process_dir}/{checkpoint_name}.pt"
        torch.save(self.actor, model_fn)

    def save_replay_gif(self, output_dir_name):
        import matplotlib.pyplot as plt

        dir_name = output_dir_name
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)

        state = self.env_wrapper.reset()
        for step in range(self.max_steps):
            action = self.actor.get_action(state)
            action = action.cpu().detach().numpy()
            next_state, reward, done = self.env_wrapper.step(action)
            img = self.env_wrapper.render()
            plt.imsave(fname=f"{dir_name}/{step}.png", arr=img)
            state = next_state
            if done:
                break

        fn = f"{self.config['env']}-{self.config['model']}-{step}.gif"
        make_gif(dir_name, f"{self.log_dir}/{fn}")
        shutil.rmtree(dir_name, ignore_errors=False, onerror=None)
        print("fig saved to ", f"{self.log_dir}/{fn}")
Example #5
    def __init__(self,
                 config,
                 policy_net,
                 target_policy_net,
                 learner_w_queue,
                 log_dir=''):
        hidden_dim = config['dense_size']
        state_dim = config['state_dims']
        action_dim = config['action_dims']
        value_lr = config['critic_learning_rate']
        policy_lr = config['actor_learning_rate']
        self.best_policy_loss = 10000
        self.best_value_loss = 10000
        v_min = config['v_min']
        v_max = config['v_max']
        self.path_weight_value = config['value_weights']
        self.path_weight_policy = config['policy_weights']
        self.run_name = config['run_name']
        num_atoms = config['num_atoms']
        self.counter = 0
        self.device = config['device']
        self.max_steps = config['max_ep_length']
        self.num_train_steps = config['num_steps_train']
        self.batch_size = config['batch_size']
        self.tau = config['tau']
        self.gamma = config['discount_rate']
        self.log_dir = log_dir
        self.prioritized_replay = config['replay_memory_prioritized']
        self.learner_w_queue = learner_w_queue

        self.logger = Logger(f"{log_dir}/learner",
                             name="{}/learner".format(self.run_name),
                             project_name=config["project_name"])
        self.path_weight_run = self.logger.get_log_dir()
        if not os.path.exists(self.path_weight_run):
            os.makedirs(self.path_weight_run)

        # Noise process
        self.ou_noise = OUNoise(dim=config["action_dim"],
                                low=config["action_low"],
                                high=config["action_high"])

        # Value and policy nets
        self.value_net = ValueNetwork(state_dim,
                                      action_dim,
                                      hidden_dim,
                                      v_min,
                                      v_max,
                                      num_atoms,
                                      device=self.device)
        self.target_value_net = ValueNetwork(state_dim,
                                             action_dim,
                                             hidden_dim,
                                             v_min,
                                             v_max,
                                             num_atoms,
                                             device=self.device)
        if os.path.exists(config['value_weights_best']):
            self.value_net.load_state_dict(
                torch.load(config['value_weights_best']))
            self.target_value_net = copy.deepcopy(self.value_net)
        else:
            print("cannot load value_net: {}".format(
                config['value_weights_best']))

        self.policy_net = policy_net  #PolicyNetwork(state_dim, action_dim, hidden_dim, device=self.device)
        self.target_policy_net = target_policy_net

        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.target_policy_net.parameters(),
                                       self.policy_net.parameters()):
            target_param.data.copy_(param.data)

        self.value_optimizer = optim.Adam(self.value_net.parameters(),
                                          lr=value_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        self.value_criterion = nn.BCELoss(reduction='none')
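
The reduction='none' above is usually chosen so that per-sample losses can be re-weighted with importance-sampling weights from prioritized replay (replay_memory_prioritized in the config) before the final reduction. A hedged sketch of that pattern; the function and tensor names are assumptions, not the project's API:

def weighted_value_loss(criterion, predicted, target, is_weights):
    # With reduction='none' the criterion returns one loss per element;
    # average over the distribution axis, weight each sample by its
    # importance-sampling weight, then reduce to a scalar.
    per_sample = criterion(predicted, target).mean(dim=1)
    return (per_sample * is_weights).mean()
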
Example #6
class Agent(object):
    def __init__(self,
                 config,
                 policy,
                 global_episode,
                 n_agent=0,
                 agent_type='exploration',
                 log_dir=''):
        print(f"Initializing agent {n_agent}...")
        self.config = config
        self.n_agent = n_agent
        self.agent_type = agent_type
        self.max_steps = config['max_ep_length']
        self.num_episode_save = config['num_episode_save']
        self.global_episode = global_episode
        self.local_episode = 0
        self.log_dir = log_dir

        # Create environment
        self.env_wrapper = create_env_wrapper(config)
        self.env_wrapper.env.set_agent(self.n_agent)
        self.ou_noise = OUNoise(dim=config["action_dim"],
                                low=config["action_low"],
                                high=config["action_high"])
        self.ou_noise.reset()

        self.actor = policy

        # Logger
        log_path = f"{log_dir}/agent-{n_agent}"
        self.logger = Logger(log_path)

    def update_actor_learner(self, learner_w_queue):
        """Update local actor to the actor from learner. """
        if learner_w_queue.empty():
            return
        source = learner_w_queue.get()
        target = self.actor
        for target_param, source_param in zip(target.parameters(), source):
            w = torch.tensor(source_param).float()
            target_param.data.copy_(w)

    def run(self, training_on, replay_queue, learner_w_queue, update_step):
        # Initialise deque buffer to store experiences for N-step returns
        self.exp_buffer = deque()

        best_reward = -float("inf")
        rewards = []
        while training_on.value:
            episode_reward = 0
            num_steps = 0
            self.local_episode += 1
            self.global_episode.value += 1
            self.exp_buffer.clear()
            if self.local_episode % 100 == 0:
                print(f"Agent: {self.n_agent}  episode {self.local_episode}")

            ep_start_time = time.time()
            print("call reset on agent {}".format(self.n_agent))
            state = self.env_wrapper.reset()
            print(state.shape)
            print("called reset on agent {}".format(self.n_agent))
            self.ou_noise.reset()
            self.env_wrapper.env.resume_simulator()
            done = False
            angle_avg = []
            distance_avg = []
            while not done:
                action = self.actor.get_action(state)
                if self.agent_type == "supervisor":
                    action = self.env_wrapper.env.get_supervised_action()
                elif self.agent_type == "exploration":
                    action = self.ou_noise.get_action(action, num_steps)
                    action = action.squeeze(0)
                else:
                    action = action.detach().cpu().numpy().flatten()
                next_state, reward, done = self.env_wrapper.step(action)
                angle_avg.append(state[0])
                distance_avg.append(math.hypot(state[1], state[2]))
                episode_reward += reward

                state = self.env_wrapper.normalise_state(state)
                reward = self.env_wrapper.normalise_reward(reward)

                self.exp_buffer.append((state, action, reward))

                # We need at least N steps in the experience buffer before we can compute Bellman
                # rewards and add an N-step experience to replay memory
                if len(self.exp_buffer) >= self.config['n_step_returns']:
                    state_0, action_0, reward_0 = self.exp_buffer.popleft()
                    discounted_reward = reward_0
                    gamma = self.config['discount_rate']
                    for (_, _, r_i) in self.exp_buffer:
                        discounted_reward += r_i * gamma
                        gamma *= self.config['discount_rate']
                    if not replay_queue.full():
                        replay_queue.put([
                            state_0, action_0, discounted_reward, next_state,
                            done, gamma
                        ])

                state = next_state

                if done or num_steps == self.max_steps:
                    print("agent {} done steps: {}/{}".format(
                        self.n_agent, num_steps, self.max_steps))
                    # add rest of experiences remaining in buffer
                    while len(self.exp_buffer) != 0:
                        #print("agent {} exp_buffer_len {}".format(self.n_agent, len(self.exp_buffer)))
                        state_0, action_0, reward_0 = self.exp_buffer.popleft()
                        discounted_reward = reward_0
                        gamma = self.config['discount_rate']
                        for (_, _, r_i) in self.exp_buffer:
                            #print("agent {} exp_buffer_len {}".format(self.n_agent, len(self.exp_buffer)))
                            discounted_reward += r_i * gamma
                            gamma *= self.config['discount_rate']
                        replay_queue.put([
                            state_0, action_0, discounted_reward, next_state,
                            done, gamma
                        ])
                    break

                num_steps += 1

            #print("agent {} finished if".format(self.n_agent))
            # Log metrics
            step = update_step.value
            if self.agent_type == "exploitation":
                self.logger.scalar_summary("agent/angle",
                                           np.rad2deg(np.mean(angle_avg)),
                                           step)
                self.logger.scalar_summary("agent/angle_var",
                                           np.rad2deg(np.var(angle_avg)), step)
                self.logger.scalar_summary("agent/distance",
                                           np.mean(distance_avg), step)
                self.logger.scalar_summary("agent/distance_var",
                                           np.var(distance_avg), step)
                observation_image = self.env_wrapper.env.get_current_observation_image(
                )
                if num_steps == self.max_steps:
                    self.logger.image_summary("agent/observation_end",
                                              observation_image, num_steps)
                else:
                    self.logger.image_summary(
                        "agent/observation_p_{:2.3f}".format(
                            discounted_reward), observation_image, num_steps)

            self.logger.scalar_summary("agent/reward", episode_reward, step)
            self.logger.scalar_summary("agent/episode_timing",
                                       time.time() - ep_start_time, step)

            # Saving agent
            if self.local_episode % self.num_episode_save == 0 or episode_reward > best_reward:
                if episode_reward > best_reward:
                    best_reward = episode_reward
                self.save(
                    f"local_episode_{self.local_episode}_reward_{best_reward:4f}"
                )
                print("reward is: {} step: {} ".format(episode_reward, step))

            rewards.append(episode_reward)
            if (self.agent_type == "exploration"
                    or self.agent_type == "supervisor"
                ) and self.local_episode % self.config['update_agent_ep'] == 0:
                self.update_actor_learner(learner_w_queue)

        # while not replay_queue.empty():
        #     replay_queue.get()

        # Save replay from the first agent only
        # if self.n_agent == 0:
        #    self.save_replay_gif()

        #print(f"Agent {self.n_agent} done.")

    def save(self, checkpoint_name):
        last_path = f"{self.log_dir}"
        process_dir = f"{self.log_dir}/agent_{self.n_agent}"
        if not os.path.exists(process_dir):
            os.makedirs(process_dir)
        if not os.path.exists(last_path):
            os.makedirs(last_path)
        model_fn = f"{process_dir}/{checkpoint_name}.pt"
        torch.save(self.actor, model_fn)
        model_fn = f"{last_path}/best.pt"
        torch.save(self.actor, model_fn)

    def save_replay_gif(self):
        dir_name = "replay_render"
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)

        state = self.env_wrapper.reset()
        self.env_wrapper.env.resume_simulator()
        for step in range(self.max_steps):
            action = self.actor.get_action(state)
            action = action.cpu().detach().numpy()
            next_state, reward, done = self.env_wrapper.step(action)
            img = self.env_wrapper.render()
            plt.imsave(fname=f"{dir_name}/{step}.png", arr=img)
            state = next_state
            if done:
                break

        fn = f"{self.config['env']}-{self.config['model']}-{step}.gif"
        make_gif(dir_name, f"{self.log_dir}/{fn}")
        shutil.rmtree(dir_name, ignore_errors=False, onerror=None)
        print("fig saved to ", f"{self.log_dir}/{fn}")