def set_policy(self):
    self.policy = NormalMLP(
        actor_input_dim=self.actor_input_dim,
        actor_output_dim=self.actor_output_dim,
        n_hidden=self.args.opponent_n_hidden,
        max_action=self.max_action,
        name=self.name + "_actor",
        args=self.args)
    self.memory = ReplayBuffer()
class Agent(PolicyBase):
    def __init__(self, env, n_hidden, log, tb_writer, name, args):
        super(Agent, self).__init__(
            env=env, n_hidden=n_hidden, log=log, tb_writer=tb_writer,
            name=name, args=args)

        self.set_dim()
        self.set_policy()
        self.memory = ReplayBuffer()

    def set_dim(self):
        self.input_dim = self.env.observation_space.shape[0]
        self.output_dim = self.env.action_space.n

        self.log[self.args.log_name].info("[{}] actor input dim: {}".format(
            self.name, self.input_dim))
        self.log[self.args.log_name].info("[{}] actor output dim: {}".format(
            self.name, self.output_dim))
        self.log[self.args.log_name].info("[{}] number of hidden neurons: {}".format(
            self.name, self.n_hidden))

    def select_stochastic_action(self, obs):
        # Get probabilities for different actions
        action, log_prob = self.policy.select_action(obs)
        assert not np.isnan(action).any()

        return action, log_prob

    def clear_memory(self):
        self.memory.clear()

    def add_memory(self, reward, log_prob):
        self.memory.add(reward, log_prob)

    def update_policy(self, total_eps):
        debug = self.policy.train(
            replay_buffer=self.memory, discount=self.args.discount)

        self.log[self.args.log_name].info(
            "Training loss: {}".format(debug['loss']))
        self.tb_writer.add_scalar("loss", debug['loss'], total_eps)
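# Usage sketch (not from the repo): one way to drive the on-policy Agent above for a
# single episode. The gym-style env API (reset/step) and the calling function below
# are assumptions; the method names come from the class above.
def run_policy_gradient_episode(agent, env, total_eps):
    """Collect one on-policy episode, then update the policy (hedged sketch)."""
    obs = env.reset()
    agent.clear_memory()
    done = False
    while not done:
        # select_stochastic_action returns (action, log_prob) for this Agent
        action, log_prob = agent.select_stochastic_action(obs)
        obs, reward, done, _ = env.step(action)
        agent.add_memory(reward, log_prob)
    agent.update_policy(total_eps)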
class Agent(PolicyBase):
    def __init__(self, env, tb_writer, log, args, name):
        super(Agent, self).__init__(
            env=env, log=log, tb_writer=tb_writer, args=args, name=name)

        self.set_dim()
        self.set_policy()
        self.memory = ReplayBuffer()
        self.epsilon = 1.0  # For exploration

    def set_dim(self):
        self.input_dim = self.env.observation_space.spaces["semantic_gridmap"].shape
        self.output_dim = self.env.action_space.n

        self.log[self.args.log_name].info("[{}] Input dim: {}".format(
            self.name, self.input_dim))
        self.log[self.args.log_name].info("[{}] Output dim: {}".format(
            self.name, self.output_dim))

    def select_deterministic_action(self, obs):
        action = self.policy.select_action(obs)
        assert not np.isnan(action).any()

        return action

    def select_stochastic_action(self, obs, total_timesteps):
        if np.random.rand() > self.epsilon:
            # Exploitation
            action = self.policy.select_action(obs)
        else:
            # Exploration
            action = np.random.randint(low=0, high=self.output_dim, size=(1,))

        if self.epsilon > 0.1:
            self.epsilon *= 0.99999  # Reduce epsilon over time

        assert not np.isnan(action).any()
        self.tb_writer.add_scalar("debug/epsilon", self.epsilon, total_timesteps)

        return action

    def add_memory(self, obs, new_obs, action, reward, done):
        self.memory.add((obs, new_obs, action, reward, done))

    def clear_tmp_memory(self):
        self.tmp_memory.clear()

    def update_policy(self, total_timesteps):
        debug = self.policy.train(replay_buffer=self.memory, iterations=50)
        self.tb_writer.add_scalars(
            "loss/critic", {self.name: debug["critic_loss"]}, total_timesteps)

    def save(self, episode):
        self.policy.save("critic_" + str(episode), "./pytorch_models")

    def load(self, episode):
        self.policy.load("critic_" + str(episode), "./pytorch_models")
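# Usage sketch (not from the repo): how the epsilon-greedy Agent above can be driven
# off-policy with its replay buffer. The gym-style env API, the step counter, and the
# function below are assumptions; method names come from the class above. Note that
# the schedule epsilon *= 0.99999 per stochastic action reaches its 0.1 floor after
# roughly log(0.1) / log(0.99999), i.e. about 230k action selections.
def collect_and_train(agent, env, obs, total_timesteps, n_steps):
    """Collect n_steps transitions into the replay buffer, then update (hedged sketch)."""
    for _ in range(n_steps):
        action = agent.select_stochastic_action(obs, total_timesteps)
        new_obs, reward, done, _ = env.step(action)
        agent.add_memory(obs, new_obs, action, reward, done)
        obs = env.reset() if done else new_obs
        total_timesteps += 1
    agent.update_policy(total_timesteps)
    return obs, total_timesteps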
class Prey(object):
    def __init__(self, env, log, tb_writer, args, name, i_agent):
        self.env = env
        self.log = log
        self.tb_writer = tb_writer
        self.args = args
        self.name = name + str(i_agent)
        self.i_agent = i_agent

        self.set_dim()
        self.set_policy()

        assert "prey" in self.name

    def set_dim(self):
        """Note that env.observation_space returns observation spaces for both
        predators and prey, ordered as [predator_1, predator_2, ..., prey_1];
        thus index -1 is used for the prey
        """
        self.actor_input_dim = self.env.observation_space[-1].shape[0]
        self.actor_output_dim = self.env.action_space[0].shape[0]
        self.critic_input_dim = self.actor_input_dim + self.actor_output_dim
        self.max_action = float(self.env.action_space[0].high[0])

        self.log[self.args.log_name].info("[{0}] Actor input dim: {1}".format(
            self.name, self.actor_input_dim))
        self.log[self.args.log_name].info("[{0}] Actor output dim: {1}".format(
            self.name, self.actor_output_dim))
        self.log[self.args.log_name].info("[{0}] Critic input dim: {1}".format(
            self.name, self.critic_input_dim))
        self.log[self.args.log_name].info("[{0}] Max action: {1}".format(
            self.name, self.max_action))

    def set_policy(self):
        self.policy = TD3(
            actor_input_dim=self.actor_input_dim,
            actor_output_dim=self.actor_output_dim,
            critic_input_dim=self.critic_input_dim,
            n_hidden=self.args.prey_n_hidden,
            max_action=self.max_action,
            name=self.name,
            args=self.args,
            i_agent=self.i_agent)
        self.memory = ReplayBuffer()

    def select_stochastic_action(self, obs, total_timesteps):
        if total_timesteps < self.args.start_timesteps:
            action = self.env.action_space[0].sample()
            assert not np.isnan(action).any()
        else:
            action = self.policy.select_action(obs)
            assert not np.isnan(action).any()
            if self.args.expl_noise != 0:
                noise = np.random.normal(
                    0, self.args.expl_noise,
                    size=self.env.action_space[0].shape[0])
                action = (action + noise).clip(
                    self.env.action_space[0].low, self.env.action_space[0].high)

        return action

    def select_deterministic_action(self, obs):
        action = self.policy.select_action(obs)
        assert not np.isnan(action).any()

        return action

    def add_memory(self, obs, new_obs, action, reward, done):
        self.memory.add((obs, new_obs, action, reward, done))

    def clear_memory(self):
        self.memory.clear()

    def update_policy(self, total_eps):
        debug = self.policy.train(
            replay_buffer=self.memory,
            iterations=self.args.ep_max_timesteps,
            batch_size=self.args.batch_size,
            discount=self.args.discount,
            tau=self.args.tau,
            policy_noise=self.args.policy_noise,
            noise_clip=self.args.noise_clip,
            policy_freq=self.args.policy_freq)

        self.tb_writer.add_scalars(
            "loss/actor_loss", {self.name: debug["actor_loss"]}, total_eps)
        self.tb_writer.add_scalars(
            "loss/critic_loss", {self.name: debug["critic_loss"]}, total_eps)

        return debug

    def fix_name(self, weight):
        weight_fixed = OrderedDict()
        for k, v in weight.items():
            name_fixed = self.name
            for i_name, name in enumerate(k.split("_")):
                if i_name > 0:
                    name_fixed += "_" + name
            weight_fixed[name_fixed] = v

        return weight_fixed

    def sync(self, target_agent):
        self.log[self.args.log_name].info("[{}] Synced weight".format(self.name))

        actor = self.fix_name(target_agent.policy.actor.state_dict())
        self.policy.actor.load_state_dict(actor)

        actor_target = self.fix_name(target_agent.policy.actor_target.state_dict())
        self.policy.actor_target.load_state_dict(actor_target)

        critic = self.fix_name(target_agent.policy.critic.state_dict())
        self.policy.critic.load_state_dict(critic)

        critic_target = self.fix_name(target_agent.policy.critic_target.state_dict())
        self.policy.critic_target.load_state_dict(critic_target)

        self.policy.actor_optimizer = torch.optim.Adam(
            self.policy.actor.parameters(), lr=self.args.actor_lr)
        self.policy.critic_optimizer = torch.optim.Adam(
            self.policy.critic.parameters(), lr=self.args.critic_lr)

    def get_q_value(self, obs, action):
        obs = torch.FloatTensor(obs.reshape(1, -1)).to(device)
        action = torch.FloatTensor(action.reshape(1, -1)).to(device)

        return self.policy.critic.Q1(obs, action).cpu().data.numpy().flatten()

    def reset(self):
        self.log[self.args.log_name].info("[{}] Reset".format(self.name))
        self.set_policy()
        self.actor_loss_n = []
        self.critic_loss_n = []

    def save_weight(self, filename, directory):
        self.log[self.args.log_name].info("[{}] Saved weight".format(self.name))
        self.policy.save(filename, directory)

    def load_weight(self, filename, directory):
        self.log[self.args.log_name].info("[{}] Loaded weight".format(self.name))
        self.policy.load(filename, directory)

    def load_model(self, filename, directory):
        self.load_weight(filename, directory)

    def set_eval_mode(self):
        self.log[self.args.log_name].info("[{}] Set eval mode".format(self.name))
        self.policy.actor.eval()
        self.policy.actor_target.eval()
        self.policy.critic.eval()
        self.policy.critic_target.eval()

    def save_model(self, avg_eval_reward, total_ep_count):
        import pickle

        def save_pickle(obj, filename):
            with open(filename, "wb") as output:
                pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

        # Build filename from evaluation reward, seed, and episode count
        filename = \
            self.name + \
            "_reward" + "{:.3f}".format(avg_eval_reward) + \
            "_seed" + str(self.args.seed) + \
            "_ep" + str(total_ep_count)

        # Save loss history & memory
        snapshot = {}
        snapshot["actor_loss_n"] = self.actor_loss_n
        snapshot["critic_loss_n"] = self.critic_loss_n
        snapshot["memory"] = self.memory
        save_pickle(obj=snapshot, filename=filename + ".pkl")

        # Save weight
        self.save_weight(filename=filename, directory="./pytorch_models")
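# Illustration (not from the repo): a standalone version of the key-renaming rule that
# fix_name/sync above apply when copying weights between differently named agents.
# The example key below is made up; only the renaming rule is taken from fix_name.
from collections import OrderedDict


def rename_state_dict_keys(state_dict, new_name):
    """Replace everything before the first "_" in each key with new_name."""
    renamed = OrderedDict()
    for key, value in state_dict.items():
        parts = key.split("_")
        renamed[new_name + "".join("_" + p for p in parts[1:])] = value
    return renamed


# Example: a weight saved under "manager0_l1.weight" is loaded by the agent named
# "prey0" as "prey0_l1.weight".
assert list(rename_state_dict_keys(
    OrderedDict([("manager0_l1.weight", 0)]), "prey0")) == ["prey0_l1.weight"]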
class Teacher(PolicyBase):
    def __init__(self, env, tb_writer, log, args, name, i_agent):
        super(Teacher, self).__init__(
            env=env, log=log, tb_writer=tb_writer, args=args,
            name=name, i_agent=i_agent)

        self.set_dim()
        self.set_policy()
        self.memory = ReplayBuffer()
        self.tmp_memory = []
        self.n_advice = args.session / float(1 + args.n_eval)

        assert "teacher" in self.name

    def set_actor_input_dim(self):
        input_dim = 0

        # Add state (teacher obs, student obs)
        input_dim += self.env.observation_space[0].shape[0] * 2
        if self.args.manager_done:
            input_dim += 1 * 2

        # Add action (teacher action, student action, teacher action at)
        input_dim += self.env.action_space[0].shape[0] * 3

        # Add Q-values (teacher joint Q-value, student joint Q-value)
        input_dim += 2 * 2

        # Add reward mean
        input_dim += 2

        # Add teacher remaining time within session
        input_dim += 1

        return input_dim

    def set_dim(self):
        self.actor_input_dim = self.set_actor_input_dim()
        self.actor_output_dim = self.env.action_space[0].shape[0] + 2  # +2 for when to advise (one-hot encoding)
        self.critic_input_dim = (self.actor_input_dim + self.actor_output_dim) * self.args.n_teacher
        self.max_action = float(self.env.action_space[0].high[0])
        self.n_hidden = self.args.teacher_n_hidden
        self.action_space = spaces.Box(
            low=-1, high=+1,
            shape=(self.env.action_space[0].shape[0],),
            dtype=np.float32)

        self.log[self.args.log_name].info("[{}] Actor input dim: {}".format(
            self.name, self.actor_input_dim))
        self.log[self.args.log_name].info("[{}] Actor output dim: {}".format(
            self.name, self.actor_output_dim))
        self.log[self.args.log_name].info("[{}] Critic input dim: {}".format(
            self.name, self.critic_input_dim))
        self.log[self.args.log_name].info("[{}] Max action: {}".format(
            self.name, self.max_action))

    def select_stochastic_action(self, obs, total_timesteps):
        """Return stochastic action with added noise

        As in TD3, purely random actions are used at first,
        followed by Gaussian noise on the policy's action
        """
        if total_timesteps < self.args.teacher_start_timesteps:
            action_what = self.action_space.sample()
            action_when = np.zeros((2,), dtype=np.float32)
            action_when[np.random.randint(low=0, high=2, size=(1,))] = 1
        else:
            action_what, action_when = self.policy.select_action(obs)
            if self.args.expl_noise != 0:
                noise = np.random.normal(
                    0, self.args.expl_noise, size=self.action_space.shape[0])
                action_what = (action_what + noise).clip(
                    self.action_space.low, self.action_space.high)

            if np.random.uniform() < 0.03:
                action_when = np.zeros((2,), dtype=np.float32)
                action_when[np.random.randint(low=0, high=2, size=(1,))] = 1

        action = np.concatenate([action_what, action_when])
        assert not np.isnan(action).any()

        return action

    def update_memory(self, teacher_reward, temp_managers, train_rewards, teacher_rewards):
        """Update memory

        The next observation is updated by replacing the student's Q-values
        with those of its updated temporary policy. Average rewards and the
        remaining timestep are also updated, as is the measured teacher_reward.
        """
        self.corrected_memory = [[] for _ in range(5)]  # 5: obs, new_obs, action, reward, done
        i_student = 1

        for i_exp, exp in enumerate(self.tmp_memory):
            obs_dict = exp[-1]

            # Update Q-values measured using the updated student critic
            q_values = get_q_values(
                temp_managers,
                obs_dict["manager_observations"],
                [obs_dict["manager_actions"][0], obs_dict["student_action"]])
            q_values = np.clip(q_values, a_min=self.args.q_min, a_max=self.args.q_max)
            obs_dict["q_with_student_critic"] = np.array([
                normalize(
                    value=q_values[i_student],
                    min_value=self.args.q_min,
                    max_value=self.args.q_max)])

            q_values = get_q_values(
                temp_managers,
                obs_dict["manager_observations"],
                [obs_dict["manager_actions"][0], obs_dict["teacher_action_at"]])
            q_values = np.clip(q_values, a_min=self.args.q_min, a_max=self.args.q_max)
            obs_dict["q_at_with_student_critic"] = np.array([
                normalize(
                    value=q_values[i_student],
                    min_value=self.args.q_min,
                    max_value=self.args.q_max)])

            # Update avg_reward
            # Note that avg_train_reward = R_{Phase I}
            # Note that avg_teacher_reward = R_{Phase II}
            avg_train_reward, avg_teacher_reward = get_avg_reward(
                train_rewards=train_rewards,
                teacher_rewards=teacher_rewards,
                args=self.args)
            obs_dict["avg_train_reward"] = np.array([avg_train_reward])
            obs_dict["avg_teacher_reward"] = np.array([avg_teacher_reward])

            # Update teacher remaining timestep
            obs_dict["remain_time"] = np.array([
                normalize(
                    value=(self.n_advice - (obs_dict["session_advices"] + 1)),
                    min_value=0.,
                    max_value=float(self.n_advice))])

            new_obs = concat_in_order(obs_dict, self.args)

            self.corrected_memory[0].append(exp[0])
            self.corrected_memory[1].append(new_obs)
            self.corrected_memory[2].append(exp[2])
            self.corrected_memory[3].append(teacher_reward)
            self.corrected_memory[4].append(exp[4])

        self.add_memory()
        self.clear_tmp_memory()

    def add_memory(self):
        self.memory.add(self.corrected_memory)

    def keep_memory(self, obs, new_obs, action, reward, done, obs_dict):
        self.tmp_memory.append([obs, new_obs, action, reward, done, obs_dict])

    def clear_tmp_memory(self):
        self.tmp_memory.clear()

    def update_policy(self, batch_size, total_timesteps):
        if len(self.memory) > self.args.ep_max_timesteps:
            debug = self.policy.train_teacher(
                replay_buffer=self.memory,
                iterations=self.args.ep_max_timesteps,
                batch_size=batch_size,
                discount=self.args.teacher_discount,
                tau=self.args.tau,
                policy_noise=self.args.policy_noise,
                noise_clip=self.args.noise_clip,
                policy_freq=self.args.policy_freq)

            self.log[self.args.log_name].info(
                "[{0}] Teacher actor loss {1} at {2}".format(
                    self.name, debug["actor_loss"], total_timesteps))
            self.tb_writer.add_scalars(
                "loss/actor", {self.name: debug["actor_loss"]}, total_timesteps)

            self.log[self.args.log_name].info(
                "[{0}] Teacher critic loss {1} at {2}".format(
                    self.name, debug["critic_loss"], total_timesteps))
            self.tb_writer.add_scalars(
                "loss/critic", {self.name: debug["critic_loss"]}, total_timesteps)
class Agent(PolicyBase):
    def __init__(self, env, tb_writer, log, args, name):
        super(Agent, self).__init__(
            env=env, log=log, tb_writer=tb_writer, args=args, name=name)

        self.set_dim()
        self.set_policy()
        self.memory = ReplayBuffer()
        self.epsilon = 1  # For exploration

    def set_dim(self):
        self.actor_input_dim = self.env.observation_space.shape[0]
        self.actor_output_dim = self.env.action_space.n
        self.critic_input_dim = self.actor_input_dim + self.actor_output_dim
        self.n_hidden = self.args.n_hidden

        self.log[self.args.log_name].info("[{}] Actor input dim: {}".format(
            self.name, self.actor_input_dim))
        self.log[self.args.log_name].info("[{}] Actor output dim: {}".format(
            self.name, self.actor_output_dim))
        self.log[self.args.log_name].info("[{}] Critic input dim: {}".format(
            self.name, self.critic_input_dim))

    def select_deterministic_action(self, obs):
        action = self.policy.select_action(obs)
        assert not np.isnan(action).any()

        return action

    def select_stochastic_action(self, obs, total_timesteps):
        if np.random.rand() > self.epsilon:
            # Exploitation
            action = self.policy.select_action(obs)
        else:
            # Exploration: random one-hot action
            action = np.zeros((self.args.n_action,), dtype=np.float32)
            action[np.random.randint(low=0, high=self.args.n_action, size=(1,))] = 1

        if self.epsilon > 0.05:
            self.epsilon *= 0.9999  # Reduce epsilon over time

        assert not np.isnan(action).any()
        self.tb_writer.add_scalar("debug/epsilon", self.epsilon, total_timesteps)

        return action

    def add_memory(self, obs, new_obs, action, reward, done):
        self.memory.add((obs, new_obs, action, reward, done))

    def clear_tmp_memory(self):
        self.tmp_memory.clear()

    def update_policy(self, total_timesteps):
        if len(self.memory) > self.args.ep_max_timesteps:
            debug = self.policy.train(
                replay_buffer=self.memory,
                iterations=self.args.ep_max_timesteps,
                batch_size=self.args.batch_size,
                discount=self.args.discount,
                tau=self.args.tau,
                policy_freq=self.args.policy_freq)

            self.tb_writer.add_scalars(
                "loss/actor", {self.name: debug["actor_loss"]}, total_timesteps)
            self.tb_writer.add_scalars(
                "loss/critic", {self.name: debug["critic_loss"]}, total_timesteps)
class Student(object):
    def __init__(self, env, log, args, name, i_agent):
        self.env = env
        self.log = log
        self.args = args
        self.name = name + str(i_agent)
        self.i_agent = i_agent

        self.set_dim()
        self.set_policy()
        self.set_noise()

        assert "student" in self.name

    def set_dim(self):
        self.actor_input_dim = len(self.env.agents[0].state)
        if self.args.student_done:
            self.actor_input_dim += 1
        self.actor_output_dim = 2
        self.critic_input_dim = self.actor_input_dim + self.actor_output_dim
        if self.args.student_train_type == "centralized":
            self.critic_input_dim += self.actor_output_dim
            self.critic_input_dim += (len(self.env.agents) - 1) * self.actor_input_dim
        self.max_action = float(1.0)
        self.min_action = float(0.0)

        self.log[self.args.log_name].info("[{}] Actor input dim: {}".format(
            self.name, self.actor_input_dim))
        self.log[self.args.log_name].info("[{}] Actor output dim: {}".format(
            self.name, self.actor_output_dim))
        self.log[self.args.log_name].info("[{}] Critic input dim: {}".format(
            self.name, self.critic_input_dim))
        self.log[self.args.log_name].info("[{}] Max action: {}".format(
            self.name, self.max_action))

    def set_policy(self):
        self.policy = DDPG(
            actor_input_dim=self.actor_input_dim,
            actor_output_dim=self.actor_output_dim,
            critic_input_dim=self.critic_input_dim,
            max_action=self.max_action,
            min_action=self.min_action,
            name=self.name,
            args=self.args,
            i_agent=self.i_agent,
            env=self.env)
        self.memory = ReplayBuffer()

    def set_noise(self):
        if self.args.student_noise_type == "ou":
            self.exploration = OUNoise(
                action_dimension=self.actor_output_dim,
                theta=self.args.ou_theta,
                sigma=self.args.ou_sigma)
        elif self.args.student_noise_type == "gauss":
            self.exploration = GaussNoise(
                action_dimension=self.actor_output_dim,
                mu=0.,
                std=self.args.gauss_std)
        else:
            raise ValueError()

    def select_stochastic_action(self, obs, total_ep_count):
        action = self.policy.select_action(obs)
        action = action.cpu().data.numpy().flatten()
        assert not np.isnan(action).any()

        noise = self.exploration.noise()
        action = action + noise
        action = action.clip(0.0, 1.0)

        return action

    def select_deterministic_action(self, obs):
        action = self.policy.select_action(obs)
        action = action.cpu().data.numpy().flatten()
        assert not np.isnan(action).any()
        action = action.clip(0.0, 1.0)

        return action

    def add_memory(self, obs, new_obs, action, reward, done):
        if self.args.student_train_type == "centralized":
            self.memory.add((obs, new_obs, action, reward, done))
        elif self.args.student_train_type == "independent":
            self.memory.add((
                obs[self.i_agent], new_obs[self.i_agent], action[self.i_agent],
                reward[self.i_agent], done[self.i_agent]))
        else:
            raise ValueError()

    def clear_memory(self):
        self.memory.clear()

    def update_policy(self, total_ep_count, agent_n, index):
        if self.args.student_train_type == "centralized":
            assert agent_n is not None
            return self.policy.centralized_train(
                total_ep_count,
                agent_n=agent_n,
                index=index,
                replay_buffer=self.memory,
                iterations=self.args.ep_max_timesteps,
                batch_size=self.args.batch_size,
                discount=self.args.discount,
                tau=self.args.tau,
                policy_freq=self.args.policy_freq)
        elif self.args.student_train_type == "independent":
            return self.policy.train(
                replay_buffer=self.memory,
                iterations=self.args.ep_max_timesteps,
                batch_size=self.args.batch_size,
                discount=self.args.discount,
                tau=self.args.tau,
                policy_freq=self.args.policy_freq)
        else:
            raise ValueError()

    def fix_name(self, weight):
        weight_fixed = OrderedDict()
        for k, v in weight.items():
            name_fixed = self.name
            for i_name, name in enumerate(k.split("_")):
                if i_name > 0:
                    name_fixed += "_" + name
            weight_fixed[name_fixed] = v

        return weight_fixed

    def sync(self, target_agent):
        self.log[self.args.log_name].info("[{}] Synced weight".format(self.name))

        actor = self.fix_name(target_agent.policy.actor.state_dict())
        self.policy.actor.load_state_dict(actor)

        actor_target = self.fix_name(target_agent.policy.actor_target.state_dict())
        self.policy.actor_target.load_state_dict(actor_target)

        critic = self.fix_name(target_agent.policy.critic.state_dict())
        self.policy.critic.load_state_dict(critic)

        critic_target = self.fix_name(target_agent.policy.critic_target.state_dict())
        self.policy.critic_target.load_state_dict(critic_target)

        self.policy.actor_optimizer = torch.optim.Adam(
            self.policy.actor.parameters(), lr=self.args.actor_lr)
        self.policy.critic_optimizer = torch.optim.Adam(
            self.policy.critic.parameters(), lr=self.args.critic_lr)

    def get_q_value(self, obs, action):
        obs = torch.FloatTensor(obs.reshape(1, -1)).to(device)
        action = torch.FloatTensor(action.reshape(1, -1)).to(device)

        return self.policy.critic.Q1(obs, action).cpu().data.numpy().flatten()

    def reset(self):
        self.log[self.args.log_name].info("[{}] Reset".format(self.name))
        self.set_policy()
        self.actor_loss_n = []
        self.critic_loss_n = []

    def save_weight(self, filename, directory):
        self.log[self.args.log_name].info("[{}] Saved weight".format(self.name))
        self.policy.save(filename, directory)

    def load_weight(self, filename, directory):
        self.log[self.args.log_name].info("[{}] Loaded weight".format(self.name))
        self.policy.load(filename, directory)

    def load_model(self, filename, directory):
        self.reset()

        if self.args.load_student_memory:
            self.log[self.args.log_name].info("[{}] Loaded memory".format(self.name))
            import pickle
            with open(directory + "/" + filename + ".pkl", "rb") as input_file:
                saved_model = pickle.load(input_file)
            self.actor_loss_n = saved_model["actor_loss_n"]
            self.critic_loss_n = saved_model["critic_loss_n"]
            self.memory.sync(saved_model["memory"])

        self.load_weight(filename, directory)

    def set_eval_mode(self):
        self.log[self.args.log_name].info("[{}] Set eval mode".format(self.name))
        self.policy.actor.eval()
        self.policy.actor_target.eval()
        self.policy.critic.eval()
        self.policy.critic_target.eval()

    def save_model(self, avg_eval_reward, total_ep_count):
        import pickle

        def save_pickle(obj, filename):
            with open(filename, "wb") as output:
                pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

        # Build filename from evaluation reward, seed, and episode count
        filename = \
            self.name + \
            "_reward" + "{:.3f}".format(avg_eval_reward) + \
            "_seed" + str(self.args.seed) + \
            "_ep" + str(total_ep_count)

        # Save loss history & memory
        snapshot = {}
        snapshot["actor_loss_n"] = self.actor_loss_n
        snapshot["critic_loss_n"] = self.critic_loss_n
        snapshot["memory"] = self.memory
        save_pickle(obj=snapshot, filename=filename + ".pkl")

        # Save weight
        self.save_weight(filename=filename, directory="./pytorch_models")
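# Minimal sketches of the exploration-noise classes assumed by Student.set_noise above.
# Constructor arguments match how they are called there; the internals are standard
# Gaussian / Ornstein-Uhlenbeck noise and may differ from the repo's actual versions.
import numpy as np


class GaussNoise(object):
    def __init__(self, action_dimension, mu=0., std=0.1):
        self.action_dimension = action_dimension
        self.mu = mu
        self.std = std

    def noise(self):
        # i.i.d. Gaussian noise per action dimension
        return np.random.normal(self.mu, self.std, size=self.action_dimension)


class OUNoise(object):
    def __init__(self, action_dimension, theta=0.15, sigma=0.2, mu=0.):
        self.action_dimension = action_dimension
        self.theta = theta
        self.sigma = sigma
        self.mu = mu
        self.state = np.ones(self.action_dimension) * self.mu

    def reset(self):
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        # Temporally correlated noise: a mean-reverting random walk
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(self.action_dimension)
        self.state = self.state + dx
        return self.state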
class Manager(PolicyBase):
    def __init__(self, env, tb_writer, log, args, name, i_agent):
        super(Manager, self).__init__(
            env=env, log=log, tb_writer=tb_writer, args=args,
            name=name, i_agent=i_agent)

        self.set_dim()
        self.set_policy()
        self.memory = ReplayBuffer()

        assert "manager" in self.name

    def set_dim(self):
        self.actor_input_dim = self.env.observation_space[0].shape[0]
        if self.args.manager_done:
            self.actor_input_dim += 1  # +1 for remaining time in current episode
        self.actor_output_dim = self.env.action_space[0].shape[0]
        self.critic_input_dim = (self.actor_input_dim + self.actor_output_dim) * self.args.n_manager
        self.max_action = float(self.env.action_space[0].high[0])
        self.n_hidden = self.args.manager_n_hidden

        self.log[self.args.log_name].info("[{}] Actor input dim: {}".format(
            self.name, self.actor_input_dim))
        self.log[self.args.log_name].info("[{}] Actor output dim: {}".format(
            self.name, self.actor_output_dim))
        self.log[self.args.log_name].info("[{}] Critic input dim: {}".format(
            self.name, self.critic_input_dim))
        self.log[self.args.log_name].info("[{}] Max action: {}".format(
            self.name, self.max_action))

    def add_memory(self, obs, new_obs, action, reward, done):
        self.memory.add((obs, new_obs, action, reward, done))

    def select_stochastic_action(self, obs, session_timesteps):
        """Return stochastic action with added noise

        As in TD3, purely random noise is applied first, followed by
        Gaussian noise. Empirically, we found that adding the purely
        random noise improves the stability of the algorithm.
        """
        if session_timesteps < self.args.manager_start_timesteps:
            action = self.env.action_space[0].sample()
            assert not np.isnan(action).any()
        else:
            action = self.policy.select_action(obs)
            assert not np.isnan(action).any()
            if self.args.expl_noise != 0:
                noise = np.random.normal(
                    0, self.args.expl_noise,
                    size=self.env.action_space[0].shape[0])
                action = (action + noise).clip(
                    self.env.action_space[0].low, self.env.action_space[0].high)

        return action

    def update_policy(self, agents, iterations, batch_size, total_timesteps):
        debug = self.policy.centralized_train(
            agents=agents,
            replay_buffer=self.memory,
            iterations=iterations,
            batch_size=batch_size,
            discount=self.args.manager_discount,
            tau=self.args.tau,
            policy_noise=self.args.policy_noise,
            noise_clip=self.args.noise_clip,
            policy_freq=self.args.policy_freq)

        self.tb_writer.add_scalars(
            "loss/actor", {self.name: debug["actor_loss"]}, total_timesteps)
        self.tb_writer.add_scalars(
            "loss/critic", {self.name: debug["critic_loss"]}, total_timesteps)

        return debug

    def fix_name(self, weight):
        weight_fixed = OrderedDict()
        for k, v in weight.items():
            name_fixed = self.name
            for i_name, name in enumerate(k.split("_")):
                if i_name > 0:
                    name_fixed += "_" + name
            weight_fixed[name_fixed] = v

        return weight_fixed

    def sync(self, target_agent):
        actor = self.fix_name(target_agent.policy.actor.state_dict())
        self.policy.actor.load_state_dict(actor)

        actor_target = self.fix_name(target_agent.policy.actor_target.state_dict())
        self.policy.actor_target.load_state_dict(actor_target)

        critic = self.fix_name(target_agent.policy.critic.state_dict())
        self.policy.critic.load_state_dict(critic)

        critic_target = self.fix_name(target_agent.policy.critic_target.state_dict())
        self.policy.critic_target.load_state_dict(critic_target)
class MetaLearner(object):
    def __init__(self, log, tb_writer, args):
        super(self.__class__, self).__init__()

        self.log = log
        self.tb_writer = tb_writer
        self.args = args

        self.loss_fn = MSELoss()
        self.net = OmniglotNet(self.loss_fn, args).to(device)
        self.fast_net = InnerLoop(self.loss_fn, args).to(device)
        self.opt = Adam(self.net.parameters(), lr=args.meta_lr)
        self.sampler = BatchSampler(args)
        self.memory = ReplayBuffer()

    def meta_update(self, episode_i, ls):
        in_ = episode_i.observations[:, :, 0]
        target = episode_i.rewards[:, :, 0]

        # We use a dummy forward / backward pass to get the correct grads into self.net
        loss, out = forward_pass(self.net, in_, target)

        # Unpack the list of grad dicts
        gradients = {k: sum(d[k] for d in ls) for k in ls[0].keys()}

        # Register a hook on each parameter in the net that replaces the current dummy grad
        # with our grads accumulated across the meta-batch
        hooks = []
        for (k, v) in self.net.named_parameters():
            def get_closure():
                key = k

                def replace_grad(grad):
                    return gradients[key]

                return replace_grad

            hooks.append(v.register_hook(get_closure()))

        # Compute grads for current step, replace with summed gradients as defined by hook
        self.opt.zero_grad()
        loss.backward()

        # Update the net parameters with the accumulated gradient according to optimizer
        self.opt.step()

        # Remove the hooks before next training phase
        for h in hooks:
            h.remove()

    def test(self, i_task, episode_i_):
        predictions_ = []

        for i_agent in range(self.args.n_agent):
            test_net = OmniglotNet(self.loss_fn, self.args).to(device)

            # Make a test net with same parameters as our current net
            test_net.copy_weights(self.net)
            test_opt = SGD(test_net.parameters(), lr=self.args.fast_lr)

            episode_i = self.memory.storage[i_task - 1]

            # Train on the train examples, using the same number of updates as in training
            for i in range(self.args.fast_num_update):
                in_ = episode_i.observations[:, :, i_agent]
                target = episode_i.rewards[:, :, i_agent]
                loss, _ = forward_pass(test_net, in_, target)
                print("loss {} at {}".format(loss, i_task))
                test_opt.zero_grad()
                loss.backward()
                test_opt.step()

            # Evaluate the trained model on train and val examples
            tloss, _ = evaluate(test_net, episode_i, i_agent)
            vloss, prediction_ = evaluate(test_net, episode_i_, i_agent)
            mtr_loss = tloss / 10.
            mval_loss = vloss / 10.

            print('-------------------------')
            print('Meta train:', mtr_loss)
            print('Meta val:', mval_loss)
            print('-------------------------')

            del test_net
            predictions_.append(prediction_)

        visualize(episode_i, episode_i_, predictions_, i_task, self.args)

    def train(self):
        for i_task in range(10000):
            # Sample episode from current task
            self.sampler.reset_task(i_task)
            episodes = self.sampler.sample()

            # Add to memory
            self.memory.add(i_task, episodes)

            # Evaluate on test tasks
            if len(self.memory) > 1:
                self.test(i_task, episodes)

            # Collect a meta batch update
            if len(self.memory) > 2:
                meta_grads = []
                for i in range(self.args.meta_batch_size):
                    if i == 0:
                        episodes_i = self.memory.storage[i_task - 1]
                        episodes_i_ = self.memory.storage[i_task]
                    else:
                        episodes_i, episodes_i_ = self.memory.sample()

                    self.fast_net.copy_weights(self.net)
                    for i_agent in range(self.args.n_agent):
                        meta_grad = self.fast_net.forward(
                            episodes_i, episodes_i_, i_agent)
                        meta_grads.append(meta_grad)

                # Perform the meta update
                self.meta_update(episodes_i, meta_grads)
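# Sketch (assumption) of the `forward_pass` helper used by MetaLearner above: a single
# forward pass through the network, scored with the loss function handed to the net's
# constructor. The attribute name `loss_fn` on the net is an assumption; the repo's
# actual helper may differ.
def forward_pass(net, in_, target):
    out = net(in_)                    # network prediction on the sampled inputs
    loss = net.loss_fn(out, target)   # e.g. the MSELoss set in MetaLearner.__init__
    return loss, out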