class BaseAlgo(ABC): """The base class for RL algorithms.""" def __init__(self, envs, acmodel, num_frames_per_proc, discount, lr, gae_lambda, entropy_coef, value_loss_coef, max_grad_norm, recurrence, preprocess_obss, reshape_reward, aux_info): """ Initializes a `BaseAlgo` instance. Parameters: ---------- envs : list a list of environments that will be run in parallel acmodel : torch.Module the model num_frames_per_proc : int the number of frames collected by every process for an update discount : float the discount for future rewards lr : float the learning rate for optimizers gae_lambda : float the lambda coefficient in the GAE formula ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438)) entropy_coef : float the weight of the entropy cost in the final objective value_loss_coef : float the weight of the value loss in the final objective max_grad_norm : float gradient will be clipped to be at most this value recurrence : int the number of steps the gradient is propagated back in time preprocess_obss : function a function that takes observations returned by the environment and converts them into the format that the model can handle reshape_reward : function a function that shapes the reward, takes an (observation, action, reward, done) tuple as an input aux_info : list a list of strings corresponding to the name of the extra information retrieved from the environment for supervised auxiliary losses """ # Store parameters self.env = ParallelEnv(envs) self.acmodel = acmodel self.acmodel.train() self.num_frames_per_proc = num_frames_per_proc self.discount = discount self.lr = lr self.gae_lambda = gae_lambda self.entropy_coef = entropy_coef self.value_loss_coef = value_loss_coef self.max_grad_norm = max_grad_norm self.recurrence = recurrence self.preprocess_obss = preprocess_obss or default_preprocess_obss self.reshape_reward = reshape_reward self.aux_info = aux_info # Store helpers values self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.num_procs = len(envs) self.num_frames = self.num_frames_per_proc * self.num_procs assert self.num_frames_per_proc % self.recurrence == 0 # Initialize experience values shape = (self.num_frames_per_proc, self.num_procs) self.obs = self.env.reset() self.obss = [None]*(shape[0]) self.memory = torch.zeros(shape[1], self.acmodel.memory_size, device=self.device) self.memories = torch.zeros(*shape, self.acmodel.memory_size, device=self.device) self.mask = torch.ones(shape[1], device=self.device) self.masks = torch.zeros(*shape, device=self.device) self.actions = torch.zeros(*shape, device=self.device, dtype=torch.int) self.values = torch.zeros(*shape, device=self.device) self.rewards = torch.zeros(*shape, device=self.device) self.advantages = torch.zeros(*shape, device=self.device) self.log_probs = torch.zeros(*shape, device=self.device) if self.aux_info: self.aux_info_collector = ExtraInfoCollector(self.aux_info, shape, self.device) # Initialize log values self.log_episode_return = torch.zeros(self.num_procs, device=self.device) self.log_episode_reshaped_return = torch.zeros(self.num_procs, device=self.device) self.log_episode_num_frames = torch.zeros(self.num_procs, device=self.device) self.log_done_counter = 0 self.log_return = [0] * self.num_procs self.log_reshaped_return = [0] * self.num_procs self.log_num_frames = [0] * self.num_procs self.found_true = False def collect_experiences(self): """Collects rollouts and computes advantages. Runs several environments concurrently. 
The next actions are computed in a batch mode for all environments at the same time. The rollouts and advantages from all environments are concatenated together. Returns ------- exps : DictList Contains actions, rewards, advantages etc as attributes. Each attribute, e.g. `exps.reward` has a shape (self.num_frames_per_proc * num_envs, ...). k-th block of consecutive `self.num_frames_per_proc` frames contains data obtained from the k-th environment. Be careful not to mix data from different environments! logs : dict Useful stats about the training process, including the average reward, policy loss, value loss, etc. """ for i in range(self.num_frames_per_proc): # Do one agent-environment interaction preprocessed_obs = self.preprocess_obss(self.obs, device=self.device) with torch.no_grad(): model_results = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1)) dist = model_results['dist'] value = model_results['value'] memory = model_results['memory'] extra_predictions = model_results['extra_predictions'] action = dist.sample() obs, reward, done, env_info = self.env.step(action.cpu().numpy()) if self.aux_info: env_info = self.aux_info_collector.process(env_info) # env_info = self.process_aux_info(env_info) # Update experiences values self.obss[i] = self.obs self.obs = obs self.memories[i] = self.memory self.memory = memory self.masks[i] = self.mask self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float) self.actions[i] = action self.values[i] = value if self.reshape_reward is not None: self.rewards[i] = torch.tensor([ self.reshape_reward(obs_, action_, reward_, done_) for obs_, action_, reward_, done_ in zip(obs, action, reward, done) ], device=self.device) else: self.rewards[i] = torch.tensor(reward, device=self.device) self.log_probs[i] = dist.log_prob(action) if self.aux_info: self.aux_info_collector.fill_dictionaries(i, env_info, extra_predictions) # Update log values self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float) self.log_episode_reshaped_return += self.rewards[i] self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device) for i, done_ in enumerate(done): if done_: self.log_done_counter += 1 self.log_return.append(self.log_episode_return[i].item()) self.log_reshaped_return.append(self.log_episode_reshaped_return[i].item()) self.log_num_frames.append(self.log_episode_num_frames[i].item()) self.log_episode_return *= self.mask self.log_episode_reshaped_return *= self.mask self.log_episode_num_frames *= self.mask # Add advantage and return to experiences preprocessed_obs = self.preprocess_obss(self.obs, device=self.device) with torch.no_grad(): next_value = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1))['value'] for i in reversed(range(self.num_frames_per_proc)): next_mask = self.masks[i+1] if i < self.num_frames_per_proc - 1 else self.mask next_value = self.values[i+1] if i < self.num_frames_per_proc - 1 else next_value next_advantage = self.advantages[i+1] if i < self.num_frames_per_proc - 1 else 0 delta = self.rewards[i] + self.discount * next_value * next_mask - self.values[i] self.advantages[i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask # Flatten the data correctly, making sure that # each episode's data is a continuous chunk exps = DictList() exps.obs = [self.obss[i][j] for j in range(self.num_procs) for i in range(self.num_frames_per_proc)] # In comments below T is
self.num_frames_per_proc, P is self.num_procs, # D is the dimensionality # T x P x D -> P x T x D -> (P * T) x D exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:]) # T x P -> P x T -> (P * T) x 1 exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1) # for all tensors below, T x P -> P x T -> P * T exps.action = self.actions.transpose(0, 1).reshape(-1) exps.value = self.values.transpose(0, 1).reshape(-1) exps.reward = self.rewards.transpose(0, 1).reshape(-1) exps.advantage = self.advantages.transpose(0, 1).reshape(-1) exps.returnn = exps.value + exps.advantage exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1) if self.aux_info: exps = self.aux_info_collector.end_collection(exps) # Preprocess experiences exps.obs = self.preprocess_obss(exps.obs, device=self.device) # Log some values keep = max(self.log_done_counter, self.num_procs) log = { "return_per_episode": self.log_return[-keep:], "reshaped_return_per_episode": self.log_reshaped_return[-keep:], "num_frames_per_episode": self.log_num_frames[-keep:], "num_frames": self.num_frames, "episodes_done": self.log_done_counter, } self.log_done_counter = 0 self.log_return = self.log_return[-self.num_procs:] self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:] self.log_num_frames = self.log_num_frames[-self.num_procs:] return exps, log @abstractmethod def update_parameters(self): pass
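# --- Illustrative sketch (not part of the training code above) ---
# The GAE backward pass in collect_experiences() is easy to misread when it is
# interleaved with the buffer bookkeeping, so the toy function below repeats the
# same recursion for a single process. The function name and the `next_masks`
# argument are invented for this example; in the class above the equivalent
# values come from self.masks / self.mask and the bootstrap value of acmodel.
import torch

def gae_sketch(rewards, values, next_masks, last_value,
               discount=0.99, gae_lambda=0.95):
    """rewards, values, next_masks: 1-D tensors of length T for one process.
    next_masks[i] is 0 if the episode ended right after frame i, else 1.
    last_value is the critic's bootstrap value for the state after frame T-1."""
    T = rewards.shape[0]
    advantages = torch.zeros(T)
    next_value = last_value
    next_advantage = torch.tensor(0.0)
    for i in reversed(range(T)):
        delta = rewards[i] + discount * next_value * next_masks[i] - values[i]
        advantages[i] = delta + discount * gae_lambda * next_advantage * next_masks[i]
        next_value = values[i]
        next_advantage = advantages[i]
    return advantages

# Example: three frames, episode ends after the second frame.
# advs = gae_sketch(torch.tensor([0., 1., 0.]), torch.tensor([0.5, 0.6, 0.4]),
#                   torch.tensor([1., 0., 1.]), last_value=torch.tensor(0.3))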
class BaseAlgo(ABC): """The base class for RL algorithms.""" def __init__(self, envs0, envs1, acmodel0, acmodel1, num_frames_per_proc, discount, lr, gae_lambda, entropy_coef, value_loss_coef, max_grad_norm, recurrence, preprocess_obss, reshape_reward, aux_info): """ Initializes a `BaseAlgo` instance. Parameters: ---------- envs : list a list of environments that will be run in parallel acmodel : torch.Module the model num_frames_per_proc : int the number of frames collected by every process for an update discount : float the discount for future rewards lr : float the learning rate for optimizers gae_lambda : float the lambda coefficient in the GAE formula ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438)) entropy_coef : float the weight of the entropy cost in the final objective value_loss_coef : float the weight of the value loss in the final objective max_grad_norm : float gradient will be clipped to be at most this value recurrence : int the number of steps the gradient is propagated back in time preprocess_obss : function a function that takes observations returned by the environment and converts them into the format that the model can handle reshape_reward : function a function that shapes the reward, takes an (observation, action, reward, done) tuple as an input aux_info : list a list of strings corresponding to the name of the extra information retrieved from the environment for supervised auxiliary losses """ # Store parameters self.env0 = ParallelEnv(envs0) self.acmodel0 = acmodel0 self.acmodel0.train() self.env1 = ParallelEnv(envs1) self.acmodel1 = acmodel1 self.acmodel1.train() self.num_frames_per_proc = num_frames_per_proc self.discount = discount self.lr = lr self.gae_lambda = gae_lambda self.entropy_coef = entropy_coef self.value_loss_coef = value_loss_coef self.max_grad_norm = max_grad_norm self.recurrence = recurrence self.preprocess_obss = preprocess_obss or default_preprocess_obss self.reshape_reward = reshape_reward self.aux_info = aux_info # Store helpers values self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.num_procs = len(envs0) self.num_frames = self.num_frames_per_proc * self.num_procs assert self.num_frames_per_proc % self.recurrence == 0 # Initialize experience values shape = (self.num_frames_per_proc, self.num_procs) self.obs0 = self.env0.reset() self.obss0 = [None] * (shape[0]) self.obs1 = self.env1.reset() self.obss1 = [None] * (shape[0]) self.memory0 = torch.zeros(shape[1], self.acmodel0.memory_size, device=self.device) self.memories0 = torch.zeros(*shape, self.acmodel0.memory_size, device=self.device) self.memory1 = torch.zeros(shape[1], self.acmodel1.memory_size, device=self.device) self.memories1 = torch.zeros(*shape, self.acmodel1.memory_size, device=self.device) self.msg0 = torch.zeros(self.acmodel0.max_len_msg, shape[1], self.acmodel0.num_symbols, device=self.device) self.msgs0 = torch.zeros(shape[0], self.acmodel0.max_len_msg, shape[1], self.acmodel0.num_symbols, device=self.device) self.msg1 = torch.zeros(self.acmodel1.max_len_msg, shape[1], self.acmodel1.num_symbols, device=self.device) self.msgs1 = torch.zeros(shape[0], self.acmodel1.max_len_msg, shape[1], self.acmodel1.num_symbols, device=self.device) self.msgs_out0 = torch.zeros(shape[0], self.acmodel0.max_len_msg, shape[1], self.acmodel0.num_symbols, device=self.device) self.msgs_out1 = torch.zeros(shape[0], self.acmodel1.max_len_msg, shape[1], self.acmodel1.num_symbols, device=self.device) #self.rng_states0 = torch.zeros(*shape, 
*torch.get_rng_state().shape, dtype=torch.uint8) #if torch.cuda.is_available(): # self.cuda_rng_states0 = torch.zeros(*shape, *torch.cuda.get_rng_state().shape, dtype=torch.uint8) #self.rng_states1 = torch.zeros(*shape, *torch.get_rng_state().shape, dtype=torch.uint8) #if torch.cuda.is_available(): # self.cuda_rng_states1 = torch.zeros(*shape, *torch.cuda.get_rng_state().shape, dtype=torch.uint8) self.mask0 = torch.ones(shape[1], device=self.device) self.masks0 = torch.zeros(*shape, device=self.device) self.actions0 = torch.zeros(*shape, device=self.device, dtype=torch.int) self.values0 = torch.zeros(*shape, device=self.device) self.rewards0 = torch.zeros(*shape, device=self.device) self.advantages0 = torch.zeros(*shape, device=self.device) self.log_probs0 = torch.zeros(*shape, device=self.device) self.speaker_log_probs0 = torch.zeros(*shape, device=self.device) self.mask1 = torch.ones(shape[1], device=self.device) self.masks1 = torch.zeros(*shape, device=self.device) self.actions1 = torch.zeros(*shape, device=self.device, dtype=torch.int) self.values1 = torch.zeros(*shape, device=self.device) self.rewards1 = torch.zeros(*shape, device=self.device) self.advantages1 = torch.zeros(*shape, device=self.device) self.log_probs1 = torch.zeros(*shape, device=self.device) self.speaker_log_probs1 = torch.zeros(*shape, device=self.device) if self.aux_info: self.aux_info_collector0 = ExtraInfoCollector( self.aux_info, shape, self.device) self.aux_info_collector1 = ExtraInfoCollector( self.aux_info, shape, self.device) # Initialize log values self.log_episode_return0 = torch.zeros(self.num_procs, device=self.device) self.log_episode_reshaped_return0 = torch.zeros(self.num_procs, device=self.device) self.log_episode_return1 = torch.zeros(self.num_procs, device=self.device) self.log_episode_reshaped_return1 = torch.zeros(self.num_procs, device=self.device) self.log_episode_num_frames0 = torch.zeros(self.num_procs, device=self.device) self.log_episode_num_frames1 = torch.zeros(self.num_procs, device=self.device) self.log_done_counter0 = 0 self.log_return0 = [0] * self.num_procs self.log_reshaped_return0 = [0] * self.num_procs self.log_num_frames0 = [0] * self.num_procs self.log_done_counter1 = 0 self.log_return1 = [0] * self.num_procs self.log_reshaped_return1 = [0] * self.num_procs self.log_num_frames1 = [0] * self.num_procs self.been_done0 = torch.zeros(self.num_procs, device=self.device) self.been_done1 = torch.zeros(self.num_procs, device=self.device) def collect_experiences(self): """Collects rollouts and computes advantages. Runs several environments concurrently. The next actions are computed in a batch mode for all environments at the same time. The rollouts and advantages from all environments are concatenated together. Returns ------- exps : DictList Contains actions, rewards, advantages etc as attributes. Each attribute, e.g. `exps.reward` has a shape (self.num_frames_per_proc * num_envs, ...). k-th block of consecutive `self.num_frames_per_proc` frames contains data obtained from the k-th environment. Be careful not to mix data from different environments! logs : dict Useful stats about the training process, including the average reward, policy loss, value loss, etc. 
""" for i in range(self.num_frames_per_proc): # Do one agent-environment interaction preprocessed_obs0 = self.preprocess_obss(self.obs0, device=self.device) preprocessed_obs1 = self.preprocess_obss(self.obs1, device=self.device) with torch.no_grad(): model_results0 = self.acmodel0( preprocessed_obs1, self.memory0 * self.mask0.unsqueeze(1)) ### NOTE dist0 = model_results0['dist'] ### NOTE value0 = model_results0['value'] memory0 = model_results0['memory'] msg0 = model_results0['message'] dists_speaker0 = model_results0['dists_speaker'] extra_predictions0 = model_results0['extra_predictions'] #self.rng_states0[i] = model_results0['rng_states'] #if torch.cuda.is_available(): # self.cuda_rng_states0[i] = model_results0['cuda_rng_states'] preprocessed_obs0.instr *= 0 preprocessed_obs0.image *= 0 model_results1 = self.acmodel1( preprocessed_obs0, self.memory1 * self.mask1.unsqueeze(1), msg=(msg0.transpose(0, 1) * self.mask1.unsqueeze(1).unsqueeze(2)).transpose( 0, 1)) ### NOTE dist1 = model_results1['dist'] value1 = model_results1['value'] memory1 = model_results1['memory'] msg1 = model_results1['message'] dists_speaker1 = model_results1['dists_speaker'] extra_predictions1 = model_results1['extra_predictions'] #self.rng_states1[i] = model_results1['rng_states'] #if torch.cuda.is_available(): # self.cuda_rng_states1[i] = model_results1['cuda_rng_states'] #state = torch.get_rng_state() action0 = dist0.sample() #torch.set_rng_state(state) action1 = dist1.sample() obs0, reward0, done0, env_info0 = self.env0.step( action0.cpu().numpy()) obs1, reward1, done1, env_info1 = self.env1.step( action1.cpu().numpy()) # mask any rewards based on (previous) been_done rewardos0 = [0] * self.num_procs rewardos1 = [0] * self.num_procs for j in range(self.num_procs): rewardos0[j] = reward0[j] * (1 - self.been_done0[j].item()) rewardos1[j] = reward1[j] * (1 - self.been_done1[j].item()) reward0 = tuple(rewardos0) reward1 = tuple(rewardos1) #reward0 = tuple(0.5*r0 + 0.5*r1 for r0, r1 in zip(reward0, reward1)) ### NOTE #reward1 = reward0 # reward sender agent (0) equally for success of receiver agent (1) ### NOTE reward0 = reward1 self.been_done0 = (1 - (1 - self.been_done0) * (1 - torch.tensor( done0, device=self.device, dtype=torch.float))) self.been_done1 = (1 - (1 - self.been_done1) * (1 - torch.tensor( done1, device=self.device, dtype=torch.float))) both_done = self.been_done0 * self.been_done1 # reset if receiver agent (1) is done ### NOTE both_done = self.been_done1 obs0 = self.env0.sync_reset(both_done, obs0) obs1 = self.env1.sync_reset(both_done, obs1) if self.aux_info: env_info0 = self.aux_info_collector0.process(env_info0) # env_info0 = self.process_aux_info0(env_info0) env_info1 = self.aux_info_collector1.process(env_info1) # env_info1 = self.process_aux_info1(env_info1) # Update experiences values self.obss0[i] = self.obs0 self.obs0 = obs0 self.obss1[i] = self.obs1 self.obs1 = obs1 self.memories0[i] = self.memory0 self.memory0 = memory0 self.memories1[i] = self.memory1 self.memory1 = memory1 self.msgs0[i] = self.msg0 self.msg0 = msg0 self.msgs1[i] = self.msg1 self.msg1 = msg1 self.msgs_out0[i] = msg0 self.msgs_out1[i] = msg1 self.masks0[i] = self.mask0 #self.mask0 = 1 - torch.tensor(done0, device=self.device, dtype=torch.float) self.mask0 = 1 - both_done self.actions0[i] = action0 self.values0[i] = value0 if self.reshape_reward is not None: self.rewards0[i] = torch.tensor([ self.reshape_reward(obs_, action_, reward_, done_) for obs_, action_, reward_, done_ in zip( obs0, action0, reward0, done0) ], 
device=self.device) else: self.rewards0[i] = torch.tensor(reward0, device=self.device) self.log_probs0[i] = dist0.log_prob(action0) self.speaker_log_probs0[i] = self.acmodel0.speaker_log_prob( dists_speaker0, msg0) self.masks1[i] = self.mask1 #self.mask1 = 1 - torch.tensor(done1, device=self.device, dtype=torch.float) self.mask1 = 1 - both_done self.actions1[i] = action1 self.values1[i] = value1 if self.reshape_reward is not None: self.rewards1[i] = torch.tensor([ self.reshape_reward(obs_, action_, reward_, done_) for obs_, action_, reward_, done_ in zip( obs1, action1, reward1, done1) ], device=self.device) else: self.rewards1[i] = torch.tensor(reward1, device=self.device) self.log_probs1[i] = dist1.log_prob(action1) self.speaker_log_probs1[i] = self.acmodel1.speaker_log_prob( dists_speaker1, msg1) if self.aux_info: self.aux_info_collector0.fill_dictionaries( i, env_info0, extra_predictions0) self.aux_info_collector1.fill_dictionaries( i, env_info1, extra_predictions1) # Update log values self.log_episode_return0 += torch.tensor(reward0, device=self.device, dtype=torch.float) self.log_episode_reshaped_return0 += self.rewards0[i] self.log_episode_return1 += torch.tensor(reward1, device=self.device, dtype=torch.float) self.log_episode_reshaped_return1 += self.rewards1[i] self.log_episode_num_frames0 += torch.ones(self.num_procs, device=self.device) self.log_episode_num_frames1 += torch.ones(self.num_procs, device=self.device) #for i, done_ in enumerate(done0): for i in range(self.num_procs): #if done_: if both_done[i]: self.log_done_counter0 += 1 self.log_return0.append(self.log_episode_return0[i].item()) self.log_reshaped_return0.append( self.log_episode_reshaped_return0[i].item()) self.log_num_frames0.append( self.log_episode_num_frames0[i].item()) #for i, done_ in enumerate(done1): #if done_: self.log_done_counter1 += 1 self.log_return1.append(self.log_episode_return1[i].item()) self.log_reshaped_return1.append( self.log_episode_reshaped_return1[i].item()) self.log_num_frames1.append( self.log_episode_num_frames1[i].item()) # if both are done, reset both to not done self.been_done0 *= (1 - both_done) self.been_done1 *= (1 - both_done) self.log_episode_return0 *= self.mask0 self.log_episode_reshaped_return0 *= self.mask0 self.log_episode_num_frames0 *= self.mask0 self.log_episode_return1 *= self.mask1 self.log_episode_reshaped_return1 *= self.mask1 self.log_episode_num_frames1 *= self.mask1 # Add advantage and return to experiences preprocessed_obs0 = self.preprocess_obss(self.obs0, device=self.device) preprocessed_obs1 = self.preprocess_obss(self.obs1, device=self.device) with torch.no_grad(): tmp = self.acmodel0(preprocessed_obs1, self.memory0 * self.mask0.unsqueeze(1)) ### NOTE next_value0 = tmp['value'] preprocessed_obs0.instr *= 0 preprocessed_obs0.image *= 0 next_value1 = self.acmodel1( preprocessed_obs0, self.memory1 * self.mask1.unsqueeze(1), msg=(tmp['message'].transpose(0, 1) * self.mask1.unsqueeze(1).unsqueeze(2)).transpose( 0, 1))['value'] ### NOTE for i in reversed(range(self.num_frames_per_proc)): next_mask0 = self.masks0[ i + 1] if i < self.num_frames_per_proc - 1 else self.mask0 next_value0 = self.values0[ i + 1] if i < self.num_frames_per_proc - 1 else next_value0 next_advantage0 = self.advantages0[ i + 1] if i < self.num_frames_per_proc - 1 else 0 next_mask1 = self.masks1[ i + 1] if i < self.num_frames_per_proc - 1 else self.mask1 next_value1 = self.values1[ i + 1] if i < self.num_frames_per_proc - 1 else next_value1 next_advantage1 = self.advantages1[ i + 1] if i < 
self.num_frames_per_proc - 1 else 0 delta0 = self.rewards0[ i] + self.discount * next_value0 * next_mask0 - self.values0[i] self.advantages0[ i] = delta0 + self.discount * self.gae_lambda * next_advantage0 * next_mask0 delta1 = self.rewards1[ i] + self.discount * next_value1 * next_mask1 - self.values1[i] self.advantages1[ i] = delta1 + self.discount * self.gae_lambda * next_advantage1 * next_mask1 # Flatten the data correctly, making sure that # each episode's data is a continuous chunk exps0 = DictList() exps0.obs = [ self.obss0[i][j] for j in range(self.num_procs) for i in range(self.num_frames_per_proc) ] exps1 = DictList() exps1.obs = [ self.obss1[i][j] for j in range(self.num_procs) for i in range(self.num_frames_per_proc) ] # In commments below T is self.num_frames_per_proc, P is self.num_procs, # D is the dimensionality # T x P x D -> P x T x D -> (P * T) x D exps0.memory = self.memories0.transpose(0, 1).reshape( -1, *self.memories0.shape[2:]) exps1.memory = self.memories1.transpose(0, 1).reshape( -1, *self.memories1.shape[2:]) exps0.message = self.msgs0.transpose(1, 2).transpose(0, 1).reshape( -1, self.acmodel0.max_len_msg, self.acmodel0.num_symbols) exps1.message = self.msgs1.transpose(1, 2).transpose(0, 1).reshape( -1, self.acmodel1.max_len_msg, self.acmodel1.num_symbols) exps0.message_out = self.msgs_out0.transpose(1, 2).transpose( 0, 1).reshape(-1, self.acmodel0.max_len_msg, self.acmodel0.num_symbols) exps1.message_out = self.msgs_out1.transpose(1, 2).transpose( 0, 1).reshape(-1, self.acmodel1.max_len_msg, self.acmodel1.num_symbols) #exps0.rng_states = self.rng_states0.transpose(0, 1).reshape(-1, *self.rng_states0.shape[2:]) #if torch.cuda.is_available(): # exps0.cuda_rng_states = self.cuda_rng_states0.transpose(0, 1).reshape(-1, *self.cuda_rng_states0.shape[2:]) #exps1.rng_states = self.rng_states1.transpose(0, 1).reshape(-1, *self.rng_states1.shape[2:]) #if torch.cuda.is_available(): # exps1.cuda_rng_states = self.cuda_rng_states1.transpose(0, 1).reshape(-1, *self.cuda_rng_states1.shape[2:]) # T x P -> P x T -> (P * T) x 1 exps0.mask = self.masks0.transpose(0, 1).reshape(-1).unsqueeze(1) exps1.mask = self.masks1.transpose(0, 1).reshape(-1).unsqueeze(1) # for all tensors below, T x P -> P x T -> P * T exps0.action = self.actions0.transpose(0, 1).reshape(-1) exps0.value = self.values0.transpose(0, 1).reshape(-1) exps0.reward = self.rewards0.transpose(0, 1).reshape(-1) exps0.advantage = self.advantages0.transpose(0, 1).reshape(-1) exps0.returnn = exps0.value + exps0.advantage exps0.log_prob = self.log_probs0.transpose(0, 1).reshape(-1) exps0.speaker_log_prob = self.speaker_log_probs0.transpose( 0, 1).reshape(-1) exps1.action = self.actions1.transpose(0, 1).reshape(-1) exps1.value = self.values1.transpose(0, 1).reshape(-1) exps1.reward = self.rewards1.transpose(0, 1).reshape(-1) exps1.advantage = self.advantages1.transpose(0, 1).reshape(-1) exps1.returnn = exps1.value + exps1.advantage exps1.log_prob = self.log_probs1.transpose(0, 1).reshape(-1) exps1.speaker_log_prob = self.speaker_log_probs1.transpose( 0, 1).reshape(-1) if self.aux_info: exps0 = self.aux_info_collector0.end_collection(exps0) exps1 = self.aux_info_collector1.end_collection(exps1) # Preprocess experiences exps0.obs = self.preprocess_obss(exps0.obs, device=self.device) exps1.obs = self.preprocess_obss(exps1.obs, device=self.device) # Log some values keep0 = max(self.log_done_counter0, self.num_procs) keep1 = max(self.log_done_counter1, self.num_procs) log0 = { "return_per_episode": self.log_return0[-keep0:], 
"reshaped_return_per_episode": self.log_reshaped_return0[-keep0:], "num_frames_per_episode": self.log_num_frames0[-keep0:], "num_frames": self.num_frames, "episodes_done": self.log_done_counter0, } log1 = { "return_per_episode": self.log_return1[-keep1:], "reshaped_return_per_episode": self.log_reshaped_return1[-keep1:], "num_frames_per_episode": self.log_num_frames1[-keep1:], "num_frames": self.num_frames, "episodes_done": self.log_done_counter1, } self.log_done_counter0 = 0 self.log_return0 = self.log_return0[-self.num_procs:] self.log_reshaped_return0 = self.log_reshaped_return0[-self.num_procs:] self.log_num_frames0 = self.log_num_frames0[-self.num_procs:] self.log_done_counter1 = 0 self.log_return1 = self.log_return1[-self.num_procs:] self.log_reshaped_return1 = self.log_reshaped_return1[-self.num_procs:] self.log_num_frames1 = self.log_num_frames1[-self.num_procs:] return exps0, log0, exps1, log1 @abstractmethod def update_parameters(self): pass
def train_model(): # Create command line argument parser parser = init_argparser() opt = parser.parse_args() # Start logger first init_logging(opt.log_level) # validate chosen options opt = validate_options(parser, opt) # Prepare logging and environment envs = [] for i in range(opt.num_processes): env = gym.make(opt.env_name) env.seed(100 * opt.seed + i) envs.append(env) from babyai.rl.utils import ParallelEnv p_envs = ParallelEnv(envs) # Create model name model_name = get_model_name(opt) # Observation preprocessor obss_preprocessor = ObssPreprocessor(model_name, envs[0].observation_space, load_vocab_from=opt.vocab_file, segment_level=opt.segment_level) obss_preprocessor.vocab.save() def reshape_reward(_0, _1, reward, _2): return opt.reward_scale * reward algo = 'ppo' if opt.resume: if opt.reasoning: if opt.diag_targets == 18: model = machine.util.RLCheckpoint.load_partial_model( opt.load_checkpoint) else: model = machine.util.RLCheckpoint.load_partial_model( opt.load_checkpoint, diag_targets=opt.diag_targets, drop_diag=opt.drop_diag) model.detach = opt.detach_hidden else: model = machine.util.RLCheckpoint.load_model(opt.load_checkpoint) model.train() else: if opt.reasoning: if opt.min_model: model = MinModel(obss_preprocessor.obs_space, envs[0].action_space, opt.image_dim, opt.memory_dim, opt.instr_dim) else: model = IACModel(obss_preprocessor.obs_space, envs[0].action_space, opt.image_dim, opt.memory_dim, opt.instr_dim, not opt.no_instr, opt.instr_arch, not opt.no_mem, opt.arch, opt.diag_targets, detach=opt.detach_hidden) else: model = ACModel(obss_preprocessor.obs_space, envs[0].action_space, opt.image_dim, opt.memory_dim, opt.instr_dim, not opt.no_instr, opt.instr_arch, not opt.no_mem, opt.arch) model.train() if torch.cuda.is_available(): model.cuda() trainer = ReinforcementTrainer(p_envs, opt, model, model_name, obss_preprocessor, reshape_reward, algo, opt.reasoning) # Start training trainer.train()
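# --- Usage sketch (assumptions noted below) ---
# train_model() seeds each parallel environment with 100 * opt.seed + i so that
# every worker sees a different but reproducible sequence of levels. The helper
# below isolates just that pattern; the env id, process count and seed value are
# placeholders, and ParallelEnv is the same wrapper imported above.
import gym
from babyai.rl.utils import ParallelEnv

def make_seeded_envs(env_name="BabyAI-GoToLocal-v0", num_processes=4, seed=1):
    envs = []
    for i in range(num_processes):
        env = gym.make(env_name)
        env.seed(100 * seed + i)   # distinct, reproducible seed per process
        envs.append(env)
    return ParallelEnv(envs)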
class RCPPOAlgo(PPOAlgo): """ The class containing an application of Reverse Curriculum learning from https://arxiv.org/pdf/1707.05300.pdf to Proximal Policy Optimization """ def __init__(self, env_name, n_env, acmodel, demo_loc, version, es_method=2, update_frequency=10, transfer_ratio=0.15, random_walk_length=1, curr_method='one', num_frames_per_proc=None, discount=0.99, lr=7e-4, beta1=0.9, beta2=0.999, gae_lambda=0.95, entropy_coef=0.01, value_loss_coef=0.5, max_grad_norm=0.5, recurrence=4, adam_eps=1e-5, clip_eps=0.2, epochs=4, batch_size=256, preprocess_obss=None, reshape_reward=None, aux_info=None): self.n_env = n_env self.env_name = env_name self.transfer_ratio = transfer_ratio self.random_walk_length = random_walk_length self.version = version self.update_frequency = update_frequency self.es_method = es_method super().__init__([gym.make(env_name) for _ in range(n_env)], acmodel, num_frames_per_proc, discount, lr, beta1, beta2, gae_lambda, entropy_coef, value_loss_coef, max_grad_norm, recurrence, adam_eps, clip_eps, epochs, batch_size, preprocess_obss, reshape_reward, aux_info) if version == "v1": self.good_start_states = self.read_good_start_states( env_name, demo_loc) elif version == "v2" or version == "v3": self.read_good_start_states_v2(env_name, demo_loc, curr_method) self.env = None self.env = RCParallelEnv(self.env_name, self.n_env, demo_loc, curr_method) self.obs = self.env.reset() self.update = 0 self.curr_update = 1 self.log_history = [] self.es_max = -1 self.es_pat = 0 self.curr_done = False self.curr_really_done = False def early_stopping_check(self, method, bound): ''' if len(self.log_history) < patience: return False else: for i in range(patience-1): if self.log_history[-1-i]-self.log_history[-2-i] >= min_delta: return False return True ''' ''' if len(self.log_history) ==0 : return False else: for i in range(patience): if self.log_history[-1-i] >= 0.9: continue else: return False return True ''' if self.log_history[-1] >= bound: return True else: return False ''' if self.log_history[-1] - self.es_max > min_delta: self.es_max = self.log_history[-1] self.es_pat = 0 self.best_weights = self.acmodel.state_dict() ans = False no = 0 else: self.es_pat += 1 if self.es_pat >= patience: self.es_max = -1 self.es_pat = 0 self.acmodel.load_state_dict(self.best_weights) ans = True no = 1 else: ans = False no = 1 #print(ans,no,self.es_pat,patience) return ans ''' def update_parameters(self): logs = super().update_parameters() '''logs = { "entropy":0,"value":0,"policy_loss":0,"value_loss":0,"grad_norm":0,"loss":0,"return_per_episode": [0],"reshaped_return_per_episode": [0],"num_frames_per_episode": [0],"num_frames": 0,"episodes_done": 0 }''' self.update += 1 if self.version == "v1": if self.update % self.update_frequency == 0 and self.update // self.update_frequency < 15: self.good_start_states = self.update_good_start_states( self.good_start_states, self.random_walk_length, self.transfer_ratio) self.env.update_good_start_states() for state in self.good_start_states[-3:]: s1 = copy.copy(state) s1.render() input() elif self.version == "v2": logger = logging.getLogger(__name__) if self.update % self.update_frequency == 0 and self.update // self.update_frequency < self.curriculum_length: """self.env.print() print(sum([state.count for state in self.env.good_start_states])/len(self.env.good_start_states))""" self.env.update_good_start_states() logger.info('Start state Update Number {}/{}'.format( self.update // self.update_frequency, self.curriculum_length)) if self.update % 
self.update_frequency == 0 and self.update // self.update_frequency == self.curriculum_length: logger.info('Start State Updates Done') self.env = ParallelEnv( [gym.make(self.env_name) for _ in range(self.n_env)]) elif self.version == "v3": if self.update % self.update_frequency == 0 and not self.curr_really_done: success_rate = np.mean( [1 if r > 0 else 0 for r in logs['return_per_episode']]) self.log_history.append(success_rate) logger = logging.getLogger(__name__) min_delta = 0.025 patience = 1 if self.es_method == 1: bound = 0.9 elif self.es_method == 2: bound = 0.7 + (self.curr_update / self.curriculum_length) * (0.99 - 0.7) if not self.curr_done: #if self.early_stopping_check(patience+(self.curr_update),min_delta): if self.early_stopping_check(self.es_method, bound): self.curr_update += 1 self.log_history = [] self.curr_done = self.env.update_good_start_states() logger.info('Start state Update Number {}'.format( self.curr_update)) else: if self.early_stopping_check(self.es_method, bound): self.curr_update += 1 self.log_history = [] logger.info('Start State Updates Done') self.env = ParallelEnv([ gym.make(self.env_name) for _ in range(self.n_env) ]) self.curr_really_done = True #self.obs = self.env.reset() return logs def update_good_start_states(self, good_start_states, random_walk_length, transfer_ratio): new_starts = [] #new_starts.extend(copy.deepcopy(self.good_start_states)) #""" for state in good_start_states: s1 = state for i in range(random_walk_length): s1 = copy.deepcopy(s1) action = s1.action_space.sample() s1.step(action) s1.count += 1 s1.step_count = 0 new_starts.append(s1) """ #n_threads = self.n_env n_threads = 64 for start in range(0,len(self.good_start_states),n_threads): end = min(start+n_threads,len(self.good_start_states)) good_start_states = ParallelEnv(self.good_start_states[start:end]) for i in range(n_explore): action = [good_start_states.action_space.sample() for _ in range(len(good_start_states.envs))] good_start_states.step(action) new_starts.extend(copy.deepcopy(good_start_states.envs)) """ n_old = int(transfer_ratio * len(good_start_states)) l = len(good_start_states) good_start_states = random.sample(good_start_states, n_old) good_start_states.extend(random.sample(new_starts, l - n_old)) return good_start_states def read_good_start_states(self, env_name, demo_loc): demos = babyai.utils.demos.load_demos(demo_loc) seed = 0 start_states = [] for i, demo in enumerate(demos): actions = demo[3] env = gym.make(env_name) babyai.utils.seed(seed) env.seed(seed + i) env.reset() for j in range(len(actions) - 1): _, _, done, _ = env.step(actions[j].value) env.step_count = 0 env.count = 1 start_states.append(env) return start_states[:500] def read_good_start_states_v2(self, env_name, demo_loc, curr_method): demos = babyai.utils.demos.load_demos(demo_loc) seed = 0 max_len = max([len(demo[3]) for demo in demos]) - 1 self.pos = 0 if curr_method == 'log': self.curriculum_length = math.floor(math.log2(max_len)) + 1 else: combining_factor = int(curr_method) self.curriculum_length = math.ceil(max_len / combining_factor) return self.start_states = [[] for _ in range(max_len)] for i, demo in enumerate(demos): actions = demo[3] env = gym.make(env_name) env.seed(seed + i) env.reset() env.count = len(actions) n_steps = len(actions) - 1 for j in range(max_len - 1, n_steps - 1, -1): self.start_states[j].append(copy.deepcopy(env)) for j in range(n_steps): _, _, done, _ = env.step(actions[j].value) env.count -= 1 env.step_count = 0 self.start_states[n_steps - j - 
1].append(copy.deepcopy(env)) def update_good_start_states_v2(self): self.pos += 1 new_starts = self.start_states[self.pos] l = len(self.good_start_states) n_old = int(self.transfer_ratio * l) good_start_states = random.sample(self.good_start_states, n_old) good_start_states.extend(random.sample(new_starts, l - n_old)) return good_start_states
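# --- Illustrative sketch (not part of the class above) ---
# update_good_start_states_v2() advances the reverse curriculum by keeping a
# transfer_ratio fraction of the current start-state pool and refilling the
# rest from the next (earlier-in-the-demo) stage. The list-based toy below
# repeats the same sampling on plain integers; the names are invented for the
# example.
import random

def advance_pool(current_pool, next_stage, transfer_ratio=0.15):
    pool_size = len(current_pool)
    n_old = int(transfer_ratio * pool_size)
    new_pool = random.sample(current_pool, n_old)
    new_pool.extend(random.sample(next_stage, pool_size - n_old))
    return new_pool

# e.g. advance_pool(list(range(100)), list(range(100, 300))) keeps ~15 old
# start states and draws the remaining ~85 from the next curriculum stage.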
class BaseAlgo(ABC): """The base class for RL algorithms.""" def __init__(self, envs, acmodel, num_frames_per_proc, discount, lr, gae_lambda, entropy_coef, value_loss_coef, max_grad_norm, recurrence, preprocess_obss, reshape_reward, aux_info, reward_fn): """ Initializes a `BaseAlgo` instance. Parameters: ---------- envs : list a list of environments that will be run in parallel acmodel : torch.Module the model num_frames_per_proc : int the number of frames collected by every process for an update discount : float the discount for future rewards lr : float the learning rate for optimizers gae_lambda : float the lambda coefficient in the GAE formula ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438)) entropy_coef : float the weight of the entropy cost in the final objective value_loss_coef : float the weight of the value loss in the final objective max_grad_norm : float gradient will be clipped to be at most this value recurrence : int the number of steps the gradient is propagated back in time preprocess_obss : function a function that takes observations returned by the environment and converts them into the format that the model can handle reshape_reward : function a function that shapes the reward, takes an (observation, action, reward, done) tuple as an input aux_info : list a list of strings corresponding to the name of the extra information retrieved from the environment for supervised auxiliary losses reward_fn: str [babyai, cpv, both] -- The reward function to use to train the RL agent. """ # Store parameters self.env = ParallelEnv(envs) self.acmodel = acmodel self.acmodel.train() self.num_frames_per_proc = num_frames_per_proc self.discount = discount self.lr = lr self.gae_lambda = gae_lambda self.entropy_coef = entropy_coef self.value_loss_coef = value_loss_coef self.max_grad_norm = max_grad_norm self.recurrence = recurrence self.preprocess_obss = preprocess_obss or default_preprocess_obss self.reshape_reward = reshape_reward self.aux_info = aux_info self.reward_fn = reward_fn # Store helpers values self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.num_procs = len(envs) self.num_frames = self.num_frames_per_proc * self.num_procs assert self.num_frames_per_proc % self.recurrence == 0 # Initialize experience values shape = (self.num_frames_per_proc, self.num_procs) self.obs = self.env.reset() self.obss = [None]*(shape[0]) self.memory = torch.zeros(shape[1], self.acmodel.memory_size, device=self.device) self.memories = torch.zeros(*shape, self.acmodel.memory_size, device=self.device) self.mask = torch.ones(shape[1], device=self.device) self.masks = torch.zeros(*shape, device=self.device) self.actions = torch.zeros(*shape, device=self.device, dtype=torch.int) self.values = torch.zeros(*shape, device=self.device) self.rewards = torch.zeros(*shape, device=self.device) self.advantages = torch.zeros(*shape, device=self.device) self.log_probs = torch.zeros(*shape, device=self.device) if self.aux_info: self.aux_info_collector = ExtraInfoCollector(self.aux_info, shape, self.device) # Initialize log values self.log_episode_return = torch.zeros(self.num_procs, device=self.device) self.log_episode_reshaped_return = torch.zeros(self.num_procs, device=self.device) self.log_episode_num_frames = torch.zeros(self.num_procs, device=self.device) self.log_done_counter = 0 self.log_return = [0] * self.num_procs self.log_reshaped_return = [0] * self.num_procs self.log_num_frames = [0] * self.num_procs # Store reward model if self.reward_fn == 'cpv' or self.reward_fn ==
'both': self.reward_model = CPV(primed_model='babyai/rl/algos/models/cpv_model.pth') # Keep track of observations and mission so that we can compute cpv-based reward. self.reset_cpv_buffer() self.all_rewards = [] # For calculating the std and mean of rewards def reset_cpv_buffer(self): self.cpv_buffer = { 'obs': [], 'mission': [], 'prev_reward': numpy.zeros((self.num_procs,)) } def collect_experiences(self): """Collects rollouts and computes advantages. Runs several environments concurrently. The next actions are computed in a batch mode for all environments at the same time. The rollouts and advantages from all environments are concatenated together. Returns ------- exps : DictList Contains actions, rewards, advantages etc as attributes. Each attribute, e.g. `exps.reward` has a shape (self.num_frames_per_proc * num_envs, ...). k-th block of consecutive `self.num_frames_per_proc` frames contains data obtained from the k-th environment. Be careful not to mix data from different environments! logs : dict Useful stats about the training process, including the average reward, policy loss, value loss, etc. """ # Reset cpv buffer if needed. if self.reward_fn == 'cpv' or self.reward_fn == 'both': self.reset_cpv_buffer() start = time.time() for i in range(self.num_frames_per_proc): # Do one agent-environment interaction preprocessed_obs = self.preprocess_obss(self.obs, device=self.device) with torch.no_grad(): model_results = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1)) dist = model_results['dist'] value = model_results['value'] memory = model_results['memory'] extra_predictions = model_results['extra_predictions'] action = dist.sample() # Take a step in env and process reward if not using default reward function. obs, old_reward, done, env_info = self.env.step(action.cpu().numpy()) if self.reward_fn == 'cpv' or self.reward_fn == 'both': reward = old_reward # TODO Do we even need this if-else block here anymore?
""" unnormalized_reward = self.reward_model.calculate_reward(self.cpv_buffer, self.obs) if self.aux_info: env_info = self.aux_info_collector.process(env_info) env_info = self.process_aux_info(env_info) std = numpy.std(self.all_rewards) if self.all_rewards != [] else numpy.std(unnormalized_reward) mean = numpy.mean(self.all_rewards) if self.all_rewards != [] else numpy.mean(unnormalized_reward) reward = numpy.clip([(r - mean) / std for r in unnormalized_reward], 0, 1) self.all_rewards.extend(unnormalized_reward) if len(self.all_rewards) > 1000: self.all_rewards[-1000:] """ elif self.reward_fn == 'babyai': reward = old_reward if self.aux_info: env_info = self.aux_info_collector.process(env_info) #env_info = self.process_aux_info(env_info) # Update experiences values self.obss[i] = self.obs self.obs = obs self.memories[i] = self.memory self.memory = memory self.masks[i] = self.mask self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float) self.actions[i] = action self.values[i] = value if self.reshape_reward is not None: self.rewards[i] = torch.tensor([ self.reshape_reward(obs_, action_, reward_, done_) for obs_, action_, reward_, done_ in zip(obs, action, reward, done) ], device=self.device) else: self.rewards[i] = torch.tensor(reward, device=self.device) self.log_probs[i] = dist.log_prob(action) if self.aux_info: self.aux_info_collector.fill_dictionaries(i, env_info, extra_predictions) # Update log values self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float) self.log_episode_reshaped_return += self.rewards[i] self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device) for i, done_ in enumerate(done): if done_: self.log_done_counter += 1 self.log_return.append(self.log_episode_return[i].item()) self.log_reshaped_return.append(self.log_episode_reshaped_return[i].item()) self.log_num_frames.append(self.log_episode_num_frames[i].item()) self.log_episode_return *= self.mask self.log_episode_reshaped_return *= self.mask self.log_episode_num_frames *= self.mask # If CPV, recompute reward based on trajectory. if self.reward_fn == 'cpv': # Make single run through CPV model to compute all rewards at once. self.rewards = self.reward_model.calculate_reward(self.obss).permute(1,0) # TODO normalize rewards? 
std, mean = torch.std_mean(self.rewards, dim=1) std = std.view(-1, 1).expand_as(self.rewards) mean = mean.view(-1, 1).expand_as(self.rewards) self.rewards = torch.clamp((self.rewards - mean) / std, 0.0, 1.0) # Add advantage and return to experiences preprocessed_obs = self.preprocess_obss(self.obs, device=self.device) with torch.no_grad(): next_value = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1))['value'] for i in reversed(range(self.num_frames_per_proc)): next_mask = self.masks[i+1] if i < self.num_frames_per_proc - 1 else self.mask next_value = self.values[i+1] if i < self.num_frames_per_proc - 1 else next_value next_advantage = self.advantages[i+1] if i < self.num_frames_per_proc - 1 else 0 delta = self.rewards[i] + self.discount * next_value * next_mask - self.values[i] self.advantages[i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask # Flatten the data correctly, making sure that # each episode's data is a continuous chunk exps = DictList() exps.obs = [self.obss[i][j] for j in range(self.num_procs) for i in range(self.num_frames_per_proc)] # In comments below T is self.num_frames_per_proc, P is self.num_procs, # D is the dimensionality # T x P x D -> P x T x D -> (P * T) x D exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:]) # T x P -> P x T -> (P * T) x 1 exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1) # for all tensors below, T x P -> P x T -> P * T exps.action = self.actions.transpose(0, 1).reshape(-1) exps.value = self.values.transpose(0, 1).reshape(-1) exps.reward = self.rewards.transpose(0, 1).reshape(-1) exps.advantage = self.advantages.transpose(0, 1).reshape(-1) exps.returnn = exps.value + exps.advantage exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1) if self.aux_info: exps = self.aux_info_collector.end_collection(exps) # Preprocess experiences exps.obs = self.preprocess_obss(exps.obs, device=self.device) # Log some values keep = max(self.log_done_counter, self.num_procs) log = { "return_per_episode": self.log_return[-keep:], "reshaped_return_per_episode": self.log_reshaped_return[-keep:], "num_frames_per_episode": self.log_num_frames[-keep:], "num_frames": self.num_frames, "episodes_done": self.log_done_counter, } self.log_done_counter = 0 self.log_return = self.log_return[-self.num_procs:] self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:] self.log_num_frames = self.log_num_frames[-self.num_procs:] return exps, log @abstractmethod def update_parameters(self): pass
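# --- Illustrative sketch (not part of the class above) ---
# The CPV branch normalizes the whole reward tensor at once with
# clamp((r - mean) / std, 0, 1), taking mean/std along dim=1 and broadcasting
# them back via view(-1, 1).expand_as(...). The toy below repeats that
# arithmetic on a small made-up tensor; whether dim=1 is the time or the
# process axis in the original depends on what CPV.calculate_reward returns,
# which is not shown in this file.
import torch

raw = torch.tensor([[0.2, 0.8, 0.5],
                    [1.0, 0.0, 0.4]])
std, mean = torch.std_mean(raw, dim=1)
std = std.view(-1, 1).expand_as(raw)
mean = mean.view(-1, 1).expand_as(raw)
normalized = torch.clamp((raw - mean) / std, 0.0, 1.0)
# Each row is standardized and then clipped into [0, 1], so only above-average
# steps in a row keep a positive reward.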