Example #1
    def __init__(self, env, n_hidden, log, tb_writer, name, args):
        super(Agent, self).__init__(
            env=env, n_hidden=n_hidden, log=log, tb_writer=tb_writer,
            name=name, args=args)

        self.set_dim()
        self.set_policy()
        self.memory = ReplayBuffer()
Example #2
    def set_policy(self):
        self.policy = NormalMLP(actor_input_dim=self.actor_input_dim,
                                actor_output_dim=self.actor_output_dim,
                                n_hidden=self.args.opponent_n_hidden,
                                max_action=self.max_action,
                                name=self.name + "_actor",
                                args=self.args)

        self.memory = ReplayBuffer()
Example #3
File: agent.py  Project: EcustBoy/dc2g
    def __init__(self, env, tb_writer, log, args, name):
        super(Agent, self).__init__(env=env,
                                    log=log,
                                    tb_writer=tb_writer,
                                    args=args,
                                    name=name)

        self.set_dim()
        self.set_policy()
        self.memory = ReplayBuffer()
        self.epsilon = 1.0  # For exploration
Example #4
    def set_policy(self):
        self.policy = TD3(actor_input_dim=self.actor_input_dim,
                          actor_output_dim=self.actor_output_dim,
                          critic_input_dim=self.critic_input_dim,
                          n_hidden=self.args.prey_n_hidden,
                          max_action=self.max_action,
                          name=self.name,
                          args=self.args,
                          i_agent=self.i_agent)

        self.memory = ReplayBuffer()
Example #5
    def set_policy(self):
        self.policy = DDPG(actor_input_dim=self.actor_input_dim,
                           actor_output_dim=self.actor_output_dim,
                           critic_input_dim=self.critic_input_dim,
                           max_action=self.max_action,
                           min_action=self.min_action,
                           name=self.name,
                           args=self.args,
                           i_agent=self.i_agent,
                           env=self.env)

        self.memory = ReplayBuffer()
Example #6
    def __init__(self, env, tb_writer, log, args, name, i_agent):
        super(Manager, self).__init__(env=env,
                                      log=log,
                                      tb_writer=tb_writer,
                                      args=args,
                                      name=name,
                                      i_agent=i_agent)

        self.set_dim()
        self.set_policy()
        self.memory = ReplayBuffer()

        assert "manager" in self.name
Example #7
    def __init__(self, log, tb_writer, args):
        super(self.__class__, self).__init__()
        self.log = log
        self.tb_writer = tb_writer
        self.args = args
        self.loss_fn = MSELoss()

        self.net = OmniglotNet(self.loss_fn, args).to(device)

        self.fast_net = InnerLoop(self.loss_fn, args).to(device)

        self.opt = Adam(self.net.parameters(), lr=args.meta_lr)
        self.sampler = BatchSampler(args)
        self.memory = ReplayBuffer()
Example #8
    def __init__(self, env, tb_writer, log, args, name, i_agent):
        super(Teacher, self).__init__(env=env,
                                      log=log,
                                      tb_writer=tb_writer,
                                      args=args,
                                      name=name,
                                      i_agent=i_agent)

        self.set_dim()
        self.set_policy()
        self.memory = ReplayBuffer()
        self.tmp_memory = []
        self.n_advice = args.session / float(1 + args.n_eval)

        assert "teacher" in self.name
Example #9
class Agent(PolicyBase):
    def __init__(self, env, n_hidden, log, tb_writer, name, args):
        super(Agent, self).__init__(
            env=env, n_hidden=n_hidden, log=log, tb_writer=tb_writer,
            name=name, args=args)

        self.set_dim()
        self.set_policy()
        self.memory = ReplayBuffer()

    def set_dim(self):
        self.input_dim = self.env.observation_space.shape[0]
        self.output_dim = self.env.action_space.n

        self.log[self.args.log_name].info("[{}] actor input dim: {}".format(
            self.name, self.input_dim))
        self.log[self.args.log_name].info("[{}] actor output dim: {}".format(
            self.name, self.output_dim))
        self.log[self.args.log_name].info("[{}] number of hidden neurons: {}"
                                          .format(self.name, self.n_hidden))

    def select_stochastic_action(self, obs):
        # Get probabilities for different actions
        action, log_prob = self.policy.select_action(obs)
        assert not np.isnan(action).any()
        return action, log_prob

    def clear_memory(self):
        self.memory.clear()

    def add_memory(self, reward, log_prob):
        self.memory.add(reward, log_prob)

    def update_policy(self, total_eps):
        debug = self.policy.train(
            replay_buffer=self.memory,
            discount=self.args.discount)

        self.log[self.args.log_name].info(
            "Training loss: {}".format(debug['loss']))
        self.tb_writer.add_scalar(
            "loss", debug['loss'], total_eps)
Example #10
File: agent.py  Project: EcustBoy/dc2g
class Agent(PolicyBase):
    def __init__(self, env, tb_writer, log, args, name):
        super(Agent, self).__init__(env=env,
                                    log=log,
                                    tb_writer=tb_writer,
                                    args=args,
                                    name=name)

        self.set_dim()
        self.set_policy()
        self.memory = ReplayBuffer()
        self.epsilon = 1.0  # For exploration

    def set_dim(self):
        self.input_dim = self.env.observation_space.spaces[
            "semantic_gridmap"].shape
        self.output_dim = self.env.action_space.n

        self.log[self.args.log_name].info("[{}] Input dim: {}".format(
            self.name, self.input_dim))
        self.log[self.args.log_name].info("[{}] Output dim: {}".format(
            self.name, self.output_dim))

    def select_deterministic_action(self, obs):
        action = self.policy.select_action(obs)
        assert not np.isnan(action).any()

        return action

    def select_stochastic_action(self, obs, total_timesteps):
        if np.random.rand() > self.epsilon:
            # Exploitation
            action = self.policy.select_action(obs)
        else:
            # Exploration
            action = np.random.randint(low=0, high=self.output_dim, size=(1, ))

            if self.epsilon > 0.1:
                self.epsilon *= 0.99999  # Reduce epsilon over time

        assert not np.isnan(action).any()

        self.tb_writer.add_scalar("debug/epsilon", self.epsilon,
                                  total_timesteps)

        return action

    def add_memory(self, obs, new_obs, action, reward, done):
        self.memory.add((obs, new_obs, action, reward, done))

    def clear_tmp_memory(self):
        self.tmp_memory.clear()

    def update_policy(self, total_timesteps):
        debug = self.policy.train(replay_buffer=self.memory, iterations=50)

        self.tb_writer.add_scalars("loss/critic",
                                   {self.name: debug["critic_loss"]},
                                   total_timesteps)

    def save(self, episode):
        self.policy.save("critic_" + str(episode), "./pytorch_models")

    def load(self, episode):
        self.policy.load("critic_" + str(episode), "./pytorch_models")
Example #11
class Prey(object):
    def __init__(self, env, log, tb_writer, args, name, i_agent):
        self.env = env
        self.log = log
        self.tb_writer = tb_writer
        self.args = args
        self.name = name + str(i_agent)
        self.i_agent = i_agent

        self.set_dim()
        self.set_policy()

        assert "prey" in self.name

    def set_dim(self):
        """
        NOTE that env.observation_space returns observation space for
        both predator and prey but with the order:
        [predator_1, predator_2, ..., prey_1]
        Thus the index of -1 is used
        """
        self.actor_input_dim = self.env.observation_space[-1].shape[0]
        self.actor_output_dim = self.env.action_space[0].shape[0]
        self.critic_input_dim = (self.actor_input_dim + self.actor_output_dim)
        self.max_action = float(self.env.action_space[0].high[0])

        self.log[self.args.log_name].info("[{0}] Actor input dim: {1}".format(
            self.name, self.actor_input_dim))
        self.log[self.args.log_name].info("[{0}] Actor output dim: {1}".format(
            self.name, self.actor_output_dim))
        self.log[self.args.log_name].info("[{0}] Critic input dim: {1}".format(
            self.name, self.critic_input_dim))
        self.log[self.args.log_name].info("[{0}] Max action: {1}".format(
            self.name, self.max_action))

    def set_policy(self):
        self.policy = TD3(actor_input_dim=self.actor_input_dim,
                          actor_output_dim=self.actor_output_dim,
                          critic_input_dim=self.critic_input_dim,
                          n_hidden=self.args.prey_n_hidden,
                          max_action=self.max_action,
                          name=self.name,
                          args=self.args,
                          i_agent=self.i_agent)

        self.memory = ReplayBuffer()

    def select_stochastic_action(self, obs, total_timesteps):
        if total_timesteps < self.args.start_timesteps:
            action = self.env.action_space[0].sample()
            assert not np.isnan(action).any()
        else:
            action = self.policy.select_action(obs)
            assert not np.isnan(action).any()
            if self.args.expl_noise != 0:
                noise = np.random.normal(
                    0,
                    self.args.expl_noise,
                    size=self.env.action_space[0].shape[0])
                action = (action + noise).clip(self.env.action_space[0].low,
                                               self.env.action_space[0].high)

        return action

    def select_deterministic_action(self, obs):
        action = self.policy.select_action(obs)
        assert not np.isnan(action).any()

        return action

    def add_memory(self, obs, new_obs, action, reward, done):
        self.memory.add((obs, new_obs, action, reward, done))

    def clear_memory(self):
        self.memory.clear()

    def update_policy(self, total_eps):
        debug = self.policy.train(replay_buffer=self.memory,
                                  iterations=self.args.ep_max_timesteps,
                                  batch_size=self.args.batch_size,
                                  discount=self.args.discount,
                                  tau=self.args.tau,
                                  policy_noise=self.args.policy_noise,
                                  noise_clip=self.args.noise_clip,
                                  policy_freq=self.args.policy_freq)

        self.tb_writer.add_scalars("loss/actor_loss",
                                   {self.name: debug["actor_loss"]}, total_eps)
        self.tb_writer.add_scalars("loss/critic_loss",
                                   {self.name: debug["critic_loss"]},
                                   total_eps)

        return debug

    def fix_name(self, weight):
        weight_fixed = OrderedDict()
        for k, v in weight.items():
            name_fixed = self.name
            for i_name, name in enumerate(k.split("_")):
                if i_name > 0:
                    name_fixed += "_" + name
            weight_fixed[name_fixed] = v

        return weight_fixed

    def sync(self, target_agent):
        self.log[self.args.log_name].info("[{}] Synced weight".format(
            self.name))

        actor = self.fix_name(target_agent.policy.actor.state_dict())
        self.policy.actor.load_state_dict(actor)

        actor_target = self.fix_name(
            target_agent.policy.actor_target.state_dict())
        self.policy.actor_target.load_state_dict(actor_target)

        critic = self.fix_name(target_agent.policy.critic.state_dict())
        self.policy.critic.load_state_dict(critic)

        critic_target = self.fix_name(
            target_agent.policy.critic_target.state_dict())
        self.policy.critic_target.load_state_dict(critic_target)

        self.policy.actor_optimizer = torch.optim.Adam(
            self.policy.actor.parameters(), lr=self.args.actor_lr)
        self.policy.critic_optimizer = torch.optim.Adam(
            self.policy.critic.parameters(), lr=self.args.critic_lr)

    def get_q_value(self, obs, action):
        obs = torch.FloatTensor(obs.reshape(1, -1)).to(device)
        action = torch.FloatTensor(action.reshape(1, -1)).to(device)

        return self.policy.critic.Q1(obs, action).cpu().data.numpy().flatten()

    def reset(self):
        self.log[self.args.log_name].info("[{}] Reset".format(self.name))
        self.set_policy()
        self.actor_loss_n = []
        self.critic_loss_n = []

    def save_weight(self, filename, directory):
        self.log[self.args.log_name].info("[{}] Saved weight".format(
            self.name))
        self.policy.save(filename, directory)

    def load_weight(self, filename, directory):
        self.log[self.args.log_name].info("[{}] Loaded weight".format(
            self.name))
        self.policy.load(filename, directory)

    def load_model(self, filename, directory):
        self.load_weight(filename, directory)

    def set_eval_mode(self):
        self.log[self.args.log_name].info("[{}] Set eval mode".format(
            self.name))

        self.policy.actor.eval()
        self.policy.actor_target.eval()
        self.policy.critic.eval()
        self.policy.critic_target.eval()

    def save_model(self, avg_eval_reward, total_ep_count):
        import pickle

        def save_pickle(obj, filename):
            with open(filename, "wb") as output:
                pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

        # Build the filename from the evaluation reward, seed, and episode count
        filename = \
            self.name + \
            "_reward" + "{:.3f}".format(avg_eval_reward) + \
            "_seed" + str(self.args.seed) + \
            "_ep" + str(total_ep_count)

        # Save loss history & memory
        snapshot = {}
        snapshot["actor_loss_n"] = self.actor_loss_n
        snapshot["critic_loss_n"] = self.critic_loss_n
        snapshot["memory"] = self.memory

        save_pickle(obj=snapshot, filename=filename + ".pkl")

        # Save weight
        self.save_weight(filename=filename, directory="./pytorch_models")
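The stochastic action selection above mirrors TD3's exploration recipe: purely random actions until start_timesteps, then policy actions perturbed with zero-mean Gaussian noise and clipped to the action bounds. The perturbation step in isolation, using plain NumPy arrays for the bounds (an illustrative sketch, not the project's code):

import numpy as np

def perturb_and_clip(action, expl_noise, low, high):
    """Add zero-mean Gaussian exploration noise and clip to the action box."""
    if expl_noise != 0:
        noise = np.random.normal(0, expl_noise, size=action.shape)
        action = (action + noise).clip(low, high)
    return action

# e.g. a 2-D action bounded in [-1, 1]:
a = perturb_and_clip(np.array([0.9, -0.2]), expl_noise=0.1,
                     low=np.array([-1.0, -1.0]), high=np.array([1.0, 1.0]))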
Example #12
class Teacher(PolicyBase):
    def __init__(self, env, tb_writer, log, args, name, i_agent):
        super(Teacher, self).__init__(env=env,
                                      log=log,
                                      tb_writer=tb_writer,
                                      args=args,
                                      name=name,
                                      i_agent=i_agent)

        self.set_dim()
        self.set_policy()
        self.memory = ReplayBuffer()
        self.tmp_memory = []
        self.n_advice = args.session / float(1 + args.n_eval)

        assert "teacher" in self.name

    def set_actor_input_dim(self):
        input_dim = 0

        # Add state (teacher obs, student obs)
        input_dim += self.env.observation_space[0].shape[0] * 2
        if self.args.manager_done:
            input_dim += 1 * 2

        # Add action (teacher action, student action, teacher action at)
        input_dim += self.env.action_space[0].shape[0] * 3

        # Add Q-values (teacher joint Q-value, student joint Q-value)
        input_dim += 2 * 2

        # Add reward mean
        input_dim += 2

        # Add teacher remain time within session
        input_dim += 1

        return input_dim

    def set_dim(self):
        self.actor_input_dim = self.set_actor_input_dim()
        # +2 for when to advise (one-hot encoding)
        self.actor_output_dim = self.env.action_space[0].shape[0] + 2
        self.critic_input_dim = (self.actor_input_dim +
                                 self.actor_output_dim) * self.args.n_teacher
        self.max_action = float(self.env.action_space[0].high[0])
        self.n_hidden = self.args.teacher_n_hidden
        self.action_space = spaces.Box(
            low=-1,
            high=+1,
            shape=(self.env.action_space[0].shape[0], ),
            dtype=np.float32)

        self.log[self.args.log_name].info("[{}] Actor input dim: {}".format(
            self.name, self.actor_input_dim))
        self.log[self.args.log_name].info("[{}] Actor output dim: {}".format(
            self.name, self.actor_output_dim))
        self.log[self.args.log_name].info("[{}] Critic input dim: {}".format(
            self.name, self.critic_input_dim))
        self.log[self.args.log_name].info("[{}] Max action: {}".format(
            self.name, self.max_action))

    def select_stochastic_action(self, obs, total_timesteps):
        """Return stochastic action with added noise
        As in TD3, purely random noise is applied followed by Gaussian noise
        """
        if total_timesteps < self.args.teacher_start_timesteps:
            action_what = self.action_space.sample()
            action_when = np.zeros((2, ), dtype=np.float32)
            action_when[np.random.randint(low=0, high=2, size=(1, ))] = 1
        else:
            action_what, action_when = self.policy.select_action(obs)
            if self.args.expl_noise != 0:
                noise = np.random.normal(0,
                                         self.args.expl_noise,
                                         size=self.action_space.shape[0])
                action_what = (action_what + noise).clip(
                    self.action_space.low, self.action_space.high)

                if np.random.uniform() < 0.03:
                    action_when = np.zeros((2, ), dtype=np.float32)
                    action_when[np.random.randint(low=0, high=2,
                                                  size=(1, ))] = 1

        action = np.concatenate([action_what, action_when])
        assert not np.isnan(action).any()

        return action

    def update_memory(self, teacher_reward, temp_managers, train_rewards,
                      teacher_rewards):
        """Update memory
        The next observation is updated by replacing student Q-values with its updated temporary policy.
        Average rewards and remaining timestep are also updated.
        The measured teacher_reward is also updated.
        """
        # corrected_memory holds 5 lists: obs, new_obs, action, reward, done
        self.corrected_memory = [[] for _ in range(5)]

        i_student = 1
        for i_exp, exp in enumerate(self.tmp_memory):
            # Retrieve the observation dict stored with this transition
            obs_dict = exp[-1]

            # Recompute the Q-values using the updated student critic
            q_values = get_q_values(
                temp_managers, obs_dict["manager_observations"],
                [obs_dict["manager_actions"][0], obs_dict["student_action"]])
            q_values = np.clip(q_values,
                               a_min=self.args.q_min,
                               a_max=self.args.q_max)

            obs_dict["q_with_student_critic"] = np.array([
                normalize(value=q_values[i_student],
                          min_value=self.args.q_min,
                          max_value=self.args.q_max)
            ])

            q_values = get_q_values(temp_managers,
                                    obs_dict["manager_observations"], [
                                        obs_dict["manager_actions"][0],
                                        obs_dict["teacher_action_at"]
                                    ])
            q_values = np.clip(q_values,
                               a_min=self.args.q_min,
                               a_max=self.args.q_max)

            obs_dict["q_at_with_student_critic"] = np.array([
                normalize(value=q_values[i_student],
                          min_value=self.args.q_min,
                          max_value=self.args.q_max)
            ])

            # Update avg_reward
            # Note that avg_train_reward = R_{Phase I}
            # Note that avg_teacher_reward = R_{Phase II}
            avg_train_reward, avg_teacher_reward = get_avg_reward(
                train_rewards=train_rewards,
                teacher_rewards=teacher_rewards,
                args=self.args)
            obs_dict["avg_train_reward"] = np.array([avg_train_reward])
            obs_dict["avg_teacher_reward"] = np.array([avg_teacher_reward])

            # Update teacher remain timestep
            obs_dict["remain_time"] = np.array([
                normalize(value=(self.n_advice -
                                 (obs_dict["session_advices"] + 1)),
                          min_value=0.,
                          max_value=float(self.n_advice))
            ])

            new_obs = concat_in_order(obs_dict, self.args)
            self.corrected_memory[0].append(exp[0])
            self.corrected_memory[1].append(new_obs)
            self.corrected_memory[2].append(exp[2])
            self.corrected_memory[3].append(teacher_reward)
            self.corrected_memory[4].append(exp[4])

        self.add_memory()
        self.clear_tmp_memory()

    def add_memory(self):
        self.memory.add(self.corrected_memory)

    def keep_memory(self, obs, new_obs, action, reward, done, obs_dict):
        self.tmp_memory.append([obs, new_obs, action, reward, done, obs_dict])

    def clear_tmp_memory(self):
        self.tmp_memory.clear()

    def update_policy(self, batch_size, total_timesteps):
        if len(self.memory) > self.args.ep_max_timesteps:
            debug = self.policy.train_teacher(
                replay_buffer=self.memory,
                iterations=self.args.ep_max_timesteps,
                batch_size=batch_size,
                discount=self.args.teacher_discount,
                tau=self.args.tau,
                policy_noise=self.args.policy_noise,
                noise_clip=self.args.noise_clip,
                policy_freq=self.args.policy_freq)

            self.log[self.args.log_name].info(
                "[{0}] Teacher actor loss {1} at {2}".format(
                    self.name, debug["actor_loss"], total_timesteps))
            self.tb_writer.add_scalars("loss/actor",
                                       {self.name: debug["actor_loss"]},
                                       total_timesteps)

            self.log[self.args.log_name].info(
                "[{0}] Teacher critic loss {1} at {2}".format(
                    self.name, debug["critic_loss"], total_timesteps))
            self.tb_writer.add_scalars("loss/critic",
                                       {self.name: debug["critic_loss"]},
                                       total_timesteps)
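normalize, get_q_values, get_avg_reward, and concat_in_order are project helpers that do not appear in these snippets. Judging only from how it is called, normalize most likely rescales a value from [min_value, max_value] into a fixed range; a plausible min-max version is sketched below purely as an assumption, not a confirmed implementation:

def normalize(value, min_value, max_value):
    # Assumed behaviour: map value from [min_value, max_value] to [0, 1].
    return (value - min_value) / (max_value - min_value)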
Example #13
class Agent(PolicyBase):
    def __init__(self, env, tb_writer, log, args, name):
        super(Agent, self).__init__(env=env,
                                    log=log,
                                    tb_writer=tb_writer,
                                    args=args,
                                    name=name)

        self.set_dim()
        self.set_policy()
        self.memory = ReplayBuffer()
        self.epsilon = 1  # For exploration

    def set_dim(self):
        self.actor_input_dim = self.env.observation_space.shape[0]
        self.actor_output_dim = self.env.action_space.n
        self.critic_input_dim = self.actor_input_dim + self.actor_output_dim
        self.n_hidden = self.args.n_hidden

        self.log[self.args.log_name].info("[{}] Actor input dim: {}".format(
            self.name, self.actor_input_dim))
        self.log[self.args.log_name].info("[{}] Actor output dim: {}".format(
            self.name, self.actor_output_dim))
        self.log[self.args.log_name].info("[{}] Critic input dim: {}".format(
            self.name, self.critic_input_dim))

    def select_deterministic_action(self, obs):
        action = self.policy.select_action(obs)
        assert not np.isnan(action).any()

        return action

    def select_stochastic_action(self, obs, total_timesteps):
        if np.random.rand() > self.epsilon:
            action = self.policy.select_action(obs)
        else:
            action = np.zeros((self.args.n_action, ), dtype=np.float32)
            action[np.random.randint(low=0,
                                     high=self.args.n_action,
                                     size=(1, ))] = 1

            if self.epsilon > 0.05:
                self.epsilon *= 0.9999  # Reduce epsilon over time

        assert not np.isnan(action).any()

        self.tb_writer.add_scalar("debug/epsilon", self.epsilon,
                                  total_timesteps)

        return action

    def add_memory(self, obs, new_obs, action, reward, done):
        self.memory.add((obs, new_obs, action, reward, done))

    def clear_tmp_memory(self):
        self.tmp_memory.clear()

    def update_policy(self, total_timesteps):
        if len(self.memory) > self.args.ep_max_timesteps:
            debug = self.policy.train(replay_buffer=self.memory,
                                      iterations=self.args.ep_max_timesteps,
                                      batch_size=self.args.batch_size,
                                      discount=self.args.discount,
                                      tau=self.args.tau,
                                      policy_freq=self.args.policy_freq)

            self.tb_writer.add_scalars("loss/actor",
                                       {self.name: debug["actor_loss"]},
                                       total_timesteps)
            self.tb_writer.add_scalars("loss/critic",
                                       {self.name: debug["critic_loss"]},
                                       total_timesteps)
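The ReplayBuffer class itself never appears in these snippets, and its interface varies between the projects. A minimal FIFO buffer consistent with the add((obs, new_obs, action, reward, done)), clear(), and len() calls used above might look like the following sketch; the sample() signature is an assumption based on the batch_size argument passed to train():

import random
import numpy as np

class ReplayBuffer(object):
    """Minimal sketch of a transition buffer; not the projects' actual class."""

    def __init__(self, max_size=int(1e6)):
        self.storage = []
        self.max_size = max_size

    def add(self, transition):
        # transition is expected to be (obs, new_obs, action, reward, done)
        if len(self.storage) >= self.max_size:
            self.storage.pop(0)
        self.storage.append(transition)

    def clear(self):
        self.storage = []

    def __len__(self):
        return len(self.storage)

    def sample(self, batch_size):
        # Assumes batch_size <= len(self.storage)
        batch = random.sample(self.storage, batch_size)
        obs, new_obs, action, reward, done = map(np.array, zip(*batch))
        return obs, new_obs, action, reward, done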
Example #14
class Student(object):
    def __init__(self, env, log, args, name, i_agent):
        self.env = env
        self.log = log
        self.args = args
        self.name = name + str(i_agent)
        self.i_agent = i_agent

        self.set_dim()
        self.set_policy()
        self.set_noise()

        assert "student" in self.name

    def set_dim(self):
        self.actor_input_dim = len(self.env.agents[0].state)
        if self.args.student_done:
            self.actor_input_dim += 1
        self.actor_output_dim = 2
        self.critic_input_dim = self.actor_input_dim + self.actor_output_dim
        if self.args.student_train_type == "centralized":
            self.critic_input_dim += self.actor_output_dim
            self.critic_input_dim += (len(self.env.agents) -
                                      1) * self.actor_input_dim

        self.max_action = float(1.0)
        self.min_action = float(0.0)

        self.log[self.args.log_name].info("[{}] Actor input dim: {}".format(
            self.name, self.actor_input_dim))
        self.log[self.args.log_name].info("[{}] Actor output dim: {}".format(
            self.name, self.actor_output_dim))
        self.log[self.args.log_name].info("[{}] Critic input dim: {}".format(
            self.name, self.critic_input_dim))
        self.log[self.args.log_name].info("[{}] Max action: {}".format(
            self.name, self.max_action))

    def set_policy(self):
        self.policy = DDPG(actor_input_dim=self.actor_input_dim,
                           actor_output_dim=self.actor_output_dim,
                           critic_input_dim=self.critic_input_dim,
                           max_action=self.max_action,
                           min_action=self.min_action,
                           name=self.name,
                           args=self.args,
                           i_agent=self.i_agent,
                           env=self.env)

        self.memory = ReplayBuffer()

    def set_noise(self):
        if self.args.student_noise_type == "ou":
            self.exploration = OUNoise(action_dimension=self.actor_output_dim,
                                       theta=self.args.ou_theta,
                                       sigma=self.args.ou_sigma)
        elif self.args.student_noise_type == "gauss":
            self.exploration = GaussNoise(
                action_dimension=self.actor_output_dim,
                mu=0.,
                std=self.args.gauss_std)
        else:
            raise ValueError()

    def select_stochastic_action(self, obs, total_ep_count):
        action = self.policy.select_action(obs)
        action = action.cpu().data.numpy().flatten()
        assert not np.isnan(action).any()
        noise = self.exploration.noise()
        action = (action + noise).clip(0.0, 1.0)
        return action

    def select_deterministic_action(self, obs):
        action = self.policy.select_action(obs)
        action = action.cpu().data.numpy().flatten()
        assert not np.isnan(action).any()
        action = action.clip(0.0, 1.0)
        return action

    def add_memory(self, obs, new_obs, action, reward, done):
        if self.args.student_train_type == "centralized":
            self.memory.add((obs, new_obs, action, reward, done))
        elif self.args.student_train_type == "independent":
            self.memory.add((obs[self.i_agent], new_obs[self.i_agent],
                             action[self.i_agent], reward[self.i_agent],
                             done[self.i_agent]))
        else:
            raise ValueError()

    def clear_memory(self):
        self.memory.clear()

    def update_policy(self, total_ep_count, agent_n, index):
        if self.args.student_train_type == "centralized":
            assert agent_n is not None
            return self.policy.centralized_train(
                total_ep_count,
                agent_n=agent_n,
                index=index,
                replay_buffer=self.memory,
                iterations=self.args.ep_max_timesteps,
                batch_size=self.args.batch_size,
                discount=self.args.discount,
                tau=self.args.tau,
                policy_freq=self.args.policy_freq)
        elif self.args.student_train_type == "independent":
            return self.policy.train(replay_buffer=self.memory,
                                     iterations=self.args.ep_max_timesteps,
                                     batch_size=self.args.batch_size,
                                     discount=self.args.discount,
                                     tau=self.args.tau,
                                     policy_freq=self.args.policy_freq)
        else:
            raise ValueError()

    def fix_name(self, weight):
        weight_fixed = OrderedDict()
        for k, v in weight.items():
            name_fixed = self.name
            for i_name, name in enumerate(k.split("_")):
                if i_name > 0:
                    name_fixed += "_" + name
            weight_fixed[name_fixed] = v

        return weight_fixed

    def sync(self, target_agent):
        self.log[self.args.log_name].info("[{}] Synced weight".format(
            self.name))

        actor = self.fix_name(target_agent.policy.actor.state_dict())
        self.policy.actor.load_state_dict(actor)

        actor_target = self.fix_name(
            target_agent.policy.actor_target.state_dict())
        self.policy.actor_target.load_state_dict(actor_target)

        critic = self.fix_name(target_agent.policy.critic.state_dict())
        self.policy.critic.load_state_dict(critic)

        critic_target = self.fix_name(
            target_agent.policy.critic_target.state_dict())
        self.policy.critic_target.load_state_dict(critic_target)

        self.policy.actor_optimizer = torch.optim.Adam(
            self.policy.actor.parameters(), lr=self.args.actor_lr)
        self.policy.critic_optimizer = torch.optim.Adam(
            self.policy.critic.parameters(), lr=self.args.critic_lr)

    def get_q_value(self, obs, action):
        obs = torch.FloatTensor(obs.reshape(1, -1)).to(device)
        action = torch.FloatTensor(action.reshape(1, -1)).to(device)

        return self.policy.critic.Q1(obs, action).cpu().data.numpy().flatten()

    def reset(self):
        self.log[self.args.log_name].info("[{}] Reset".format(self.name))
        self.set_policy()
        self.actor_loss_n = []
        self.critic_loss_n = []

    def save_weight(self, filename, directory):
        self.log[self.args.log_name].info("[{}] Saved weight".format(
            self.name))
        self.policy.save(filename, directory)

    def load_weight(self, filename, directory):
        self.log[self.args.log_name].info("[{}] Loaded weight".format(
            self.name))
        self.policy.load(filename, directory)

    def load_model(self, filename, directory):
        self.reset()

        if self.args.load_student_memory:
            self.log[self.args.log_name].info("[{}] Loaded memory".format(
                self.name))
            import pickle

            with open(directory + "/" + filename + ".pkl", "rb") as input_file:
                saved_model = pickle.load(input_file)

            self.actor_loss_n = saved_model["actor_loss_n"]
            self.critic_loss_n = saved_model["critic_loss_n"]
            self.memory.sync(saved_model["memory"])

        self.load_weight(filename, directory)

    def set_eval_mode(self):
        self.log[self.args.log_name].info("[{}] Set eval mode".format(
            self.name))

        self.policy.actor.eval()
        self.policy.actor_target.eval()
        self.policy.critic.eval()
        self.policy.critic_target.eval()

    def save_model(self, avg_eval_reward, total_ep_count):
        import pickle

        def save_pickle(obj, filename):
            with open(filename, "wb") as output:
                pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

        # Build the filename from the evaluation reward, seed, and episode count
        filename = \
            self.name + \
            "_reward" + "{:.3f}".format(avg_eval_reward) + \
            "_seed" + str(self.args.seed) + \
            "_ep" + str(total_ep_count)

        # Save loss history & memory
        snapshot = {}
        snapshot["actor_loss_n"] = self.actor_loss_n
        snapshot["critic_loss_n"] = self.critic_loss_n
        snapshot["memory"] = self.memory

        save_pickle(obj=snapshot, filename=filename + ".pkl")

        # Save weight
        self.save_weight(filename=filename, directory="./pytorch_models")
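OUNoise and GaussNoise are likewise external to the snippet. A common Ornstein-Uhlenbeck formulation matching the constructor arguments (action_dimension, theta, sigma), plus a simple Gaussian counterpart, is sketched below; treat it as an assumption about the interface, not the projects' code:

import numpy as np

class OUNoise(object):
    """Ornstein-Uhlenbeck process; a common DDPG exploration noise."""

    def __init__(self, action_dimension, theta, sigma, mu=0.0):
        self.action_dimension = action_dimension
        self.theta, self.sigma, self.mu = theta, sigma, mu
        self.state = np.ones(action_dimension) * mu

    def reset(self):
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(self.action_dimension)
        self.state = self.state + dx
        return self.state


class GaussNoise(object):
    """Independent Gaussian noise per action dimension."""

    def __init__(self, action_dimension, mu, std):
        self.action_dimension, self.mu, self.std = action_dimension, mu, std

    def noise(self):
        return np.random.normal(self.mu, self.std, size=self.action_dimension)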
Example #15
class Manager(PolicyBase):
    def __init__(self, env, tb_writer, log, args, name, i_agent):
        super(Manager, self).__init__(env=env,
                                      log=log,
                                      tb_writer=tb_writer,
                                      args=args,
                                      name=name,
                                      i_agent=i_agent)

        self.set_dim()
        self.set_policy()
        self.memory = ReplayBuffer()

        assert "manager" in self.name

    def set_dim(self):
        self.actor_input_dim = self.env.observation_space[0].shape[0]
        if self.args.manager_done:
            self.actor_input_dim += 1  # +1 for remaining time in current episode
        self.actor_output_dim = self.env.action_space[0].shape[0]
        self.critic_input_dim = (self.actor_input_dim +
                                 self.actor_output_dim) * self.args.n_manager
        self.max_action = float(self.env.action_space[0].high[0])
        self.n_hidden = self.args.manager_n_hidden

        self.log[self.args.log_name].info("[{}] Actor input dim: {}".format(
            self.name, self.actor_input_dim))
        self.log[self.args.log_name].info("[{}] Actor output dim: {}".format(
            self.name, self.actor_output_dim))
        self.log[self.args.log_name].info("[{}] Critic input dim: {}".format(
            self.name, self.critic_input_dim))
        self.log[self.args.log_name].info("[{}] Max action: {}".format(
            self.name, self.max_action))

    def add_memory(self, obs, new_obs, action, reward, done):
        self.memory.add((obs, new_obs, action, reward, done))

    def select_stochastic_action(self, obs, session_timesteps):
        """Return stochastic action with added noise
        As in TD3, purely random noise is applied followed by Gaussian noise
        Empirically, we found that adding the purely random noise improves 
        stability of the algorithm
        """
        if session_timesteps < self.args.manager_start_timesteps:
            action = self.env.action_space[0].sample()
            assert not np.isnan(action).any()
        else:
            action = self.policy.select_action(obs)
            assert not np.isnan(action).any()
            if self.args.expl_noise != 0:
                noise = np.random.normal(
                    0,
                    self.args.expl_noise,
                    size=self.env.action_space[0].shape[0])
                action = (action + noise).clip(self.env.action_space[0].low,
                                               self.env.action_space[0].high)

        return action

    def update_policy(self, agents, iterations, batch_size, total_timesteps):
        debug = self.policy.centralized_train(
            agents=agents,
            replay_buffer=self.memory,
            iterations=iterations,
            batch_size=batch_size,
            discount=self.args.manager_discount,
            tau=self.args.tau,
            policy_noise=self.args.policy_noise,
            noise_clip=self.args.noise_clip,
            policy_freq=self.args.policy_freq)

        self.tb_writer.add_scalars("loss/actor",
                                   {self.name: debug["actor_loss"]},
                                   total_timesteps)
        self.tb_writer.add_scalars("loss/critic",
                                   {self.name: debug["critic_loss"]},
                                   total_timesteps)

        return debug

    def fix_name(self, weight):
        weight_fixed = OrderedDict()
        for k, v in weight.items():
            name_fixed = self.name
            for i_name, name in enumerate(k.split("_")):
                if i_name > 0:
                    name_fixed += "_" + name
            weight_fixed[name_fixed] = v

        return weight_fixed

    def sync(self, target_agent):
        actor = self.fix_name(target_agent.policy.actor.state_dict())
        self.policy.actor.load_state_dict(actor)

        actor_target = self.fix_name(
            target_agent.policy.actor_target.state_dict())
        self.policy.actor_target.load_state_dict(actor_target)

        critic = self.fix_name(target_agent.policy.critic.state_dict())
        self.policy.critic.load_state_dict(critic)

        critic_target = self.fix_name(
            target_agent.policy.critic_target.state_dict())
        self.policy.critic_target.load_state_dict(critic_target)
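fix_name simply re-prefixes every state_dict key with this agent's own name so that weights copied from target_agent load without key mismatches. A quick standalone check of the key rewriting (strings stand in for tensors, agent names are made up for illustration):

from collections import OrderedDict

def fix_name(weight, own_name):
    # Same logic as above: keep everything after the first "_" token,
    # but replace the leading token with this agent's name.
    weight_fixed = OrderedDict()
    for k, v in weight.items():
        name_fixed = own_name
        for i_name, name in enumerate(k.split("_")):
            if i_name > 0:
                name_fixed += "_" + name
        weight_fixed[name_fixed] = v
    return weight_fixed

weights = OrderedDict([("manager0_l1.weight", "w1"), ("manager0_l2.bias", "b2")])
print(fix_name(weights, "manager1"))
# OrderedDict([('manager1_l1.weight', 'w1'), ('manager1_l2.bias', 'b2')])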
Example #16
class MetaLearner(object):
    def __init__(self, log, tb_writer, args):
        super(self.__class__, self).__init__()
        self.log = log
        self.tb_writer = tb_writer
        self.args = args
        self.loss_fn = MSELoss()

        self.net = OmniglotNet(self.loss_fn, args).to(device)

        self.fast_net = InnerLoop(self.loss_fn, args).to(device)

        self.opt = Adam(self.net.parameters(), lr=args.meta_lr)
        self.sampler = BatchSampler(args)
        self.memory = ReplayBuffer()

    def meta_update(self, episode_i, ls):
        in_ = episode_i.observations[:, :, 0]
        target = episode_i.rewards[:, :, 0]

        # We use a dummy forward / backward pass to get the correct grads into self.net
        loss, out = forward_pass(self.net, in_, target)

        # Sum the per-parameter gradients across the meta-batch of grad dicts
        gradients = {k: sum(d[k] for d in ls) for k in ls[0].keys()}

        # Register a hook on each parameter in the net that replaces the current dummy grad
        # with our grads accumulated across the meta-batch
        hooks = []
        for (k, v) in self.net.named_parameters():

            def get_closure():
                key = k

                def replace_grad(grad):
                    return gradients[key]

                return replace_grad

            hooks.append(v.register_hook(get_closure()))

        # Compute grads for current step, replace with summed gradients as defined by hook
        self.opt.zero_grad()
        loss.backward()

        # Update the net parameters with the accumulated gradient according to optimizer
        self.opt.step()

        # Remove the hooks before next training phase
        for h in hooks:
            h.remove()

    def test(self, i_task, episode_i_):
        predictions_ = []
        for i_agent in range(self.args.n_agent):
            test_net = OmniglotNet(self.loss_fn, self.args).to(device)

            # Make a test net with same parameters as our current net
            test_net.copy_weights(self.net)
            test_opt = SGD(test_net.parameters(), lr=self.args.fast_lr)

            episode_i = self.memory.storage[i_task - 1]

            # Train on the train examples, using the same number of updates as in training
            for i in range(self.args.fast_num_update):
                in_ = episode_i.observations[:, :, i_agent]
                target = episode_i.rewards[:, :, i_agent]
                loss, _ = forward_pass(test_net, in_, target)
                print("loss {} at {}".format(loss, i_task))
                test_opt.zero_grad()
                loss.backward()
                test_opt.step()

            # Evaluate the trained model on train and val examples
            tloss, _ = evaluate(test_net, episode_i, i_agent)
            vloss, prediction_ = evaluate(test_net, episode_i_, i_agent)
            mtr_loss = tloss / 10.
            mval_loss = vloss / 10.

            print('-------------------------')
            print('Meta train:', mtr_loss)
            print('Meta val:', mval_loss)
            print('-------------------------')
            del test_net

            predictions_.append(prediction_)

        visualize(episode_i, episode_i_, predictions_, i_task, self.args)

    def train(self):
        for i_task in range(10000):
            # Sample episode from current task
            self.sampler.reset_task(i_task)
            episodes = self.sampler.sample()

            # Add to memory
            self.memory.add(i_task, episodes)

            # Evaluate on test tasks
            if len(self.memory) > 1:
                self.test(i_task, episodes)

            # Collect a meta batch update
            if len(self.memory) > 2:
                meta_grads = []
                for i in range(self.args.meta_batch_size):
                    if i == 0:
                        episodes_i = self.memory.storage[i_task - 1]
                        episodes_i_ = self.memory.storage[i_task]
                    else:
                        episodes_i, episodes_i_ = self.memory.sample()

                    self.fast_net.copy_weights(self.net)
                    for i_agent in range(self.args.n_agent):
                        meta_grad = self.fast_net.forward(
                            episodes_i, episodes_i_, i_agent)
                        meta_grads.append(meta_grad)

                # Perform the meta update
                self.meta_update(episodes_i, meta_grads)
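The hook trick in meta_update is standard PyTorch: Tensor.register_hook lets a callback replace a gradient before the optimizer sees it, which is how the dummy backward pass ends up applying the meta-batch gradients. A self-contained illustration of just that mechanism (not the project's code):

import torch

net = torch.nn.Linear(3, 1)
opt = torch.optim.SGD(net.parameters(), lr=0.1)

# Pretend these gradients were accumulated across a meta-batch.
meta_grads = {k: torch.ones_like(v) for k, v in net.named_parameters()}

# The default argument pins `k` at definition time (same effect as the
# get_closure() wrapper above).
hooks = [v.register_hook(lambda grad, key=k: meta_grads[key])
         for k, v in net.named_parameters()]

loss = net(torch.randn(4, 3)).pow(2).mean()  # dummy forward pass
opt.zero_grad()
loss.backward()                              # hooks replace the grads here
opt.step()                                   # steps with the meta-batch grads

for h in hooks:                              # remove hooks before further training
    h.remove()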