def run_batch(self, train_summary=False): """Collect trajectories for a single batch and train (if self.train). Args: train_summary: return a Summary of the training step (losses, etc.). Returns: result: None (if not self.train) or the return value of agent.train. """ last_obs = self.last_obs shapes = (self.n_steps, self.envs.n_envs) values = np.zeros(shapes, dtype=np.float32) rewards = np.zeros(shapes, dtype=np.float32) dones = np.zeros(shapes, dtype=np.float32) all_obs, all_actions = [], [] mb_states = self.states # save the initial states at the beginning of each mb for later training. for n in range(self.n_steps): actions, values[n, :], states = self.agent.step( last_obs, self.states) actions = mask_unused_argument_samples(actions) all_obs.append(last_obs) all_actions.append(actions) pysc2_actions = actions_to_pysc2( actions, size=last_obs['screen'].shape[1:3]) obs_raw = self.envs.step(pysc2_actions) last_obs = self.preproc.preprocess_obs(obs_raw) rewards[n, :], dones[n, :] = zip(*[(t.reward, t.last()) for t in obs_raw]) self.states = states for t in obs_raw: if t.last(): self.cumulative_score += self._summarize_episode(t) next_values = self.agent.get_value(last_obs, states) returns, advs = compute_returns_and_advs(rewards, dones, values, next_values, self.discount) actions = stack_and_flatten_actions(all_actions) obs = flatten_first_dims_dict(stack_ndarray_dicts(all_obs)) returns = flatten_first_dims(returns) advs = flatten_first_dims(advs) self.last_obs = last_obs if self.train: return self.agent.train(obs, mb_states, actions, returns, advs, summary=train_summary) else: return None
def stack_and_flatten_actions(lst, axis=0):
  """Stack per-step (fn_id, arg_dict) action samples and flatten the leading (time, env) dims."""
  fn_id_list, arg_dict_list = zip(*lst)
  fn_id = np.stack(fn_id_list, axis=axis)
  fn_id = flatten_first_dims(fn_id)
  arg_ids = stack_ndarray_dicts(arg_dict_list, axis=axis)
  arg_ids = flatten_first_dims_dict(arg_ids)
  return (fn_id, arg_ids)
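The flatten_first_dims, flatten_first_dims_dict and stack_ndarray_dicts helpers are used throughout but not shown. A minimal sketch of the usual behavior of such helpers, assuming a (time, env, ...) layout; these bodies are illustrative, not the repo's actual code:

import numpy as np

def flatten_first_dims(x):
  # Merge the leading (time, env) axes into a single batch axis.
  new_shape = [x.shape[0] * x.shape[1]] + list(x.shape[2:])
  return x.reshape(*new_shape)

def flatten_first_dims_dict(x):
  # Apply flatten_first_dims to every array in a dict.
  return {k: flatten_first_dims(v) for k, v in x.items()}

def stack_ndarray_dicts(lst, axis=0):
  # Stack a list of dicts of arrays into a single dict of stacked arrays.
  return {k: np.stack([d[k] for d in lst], axis=axis) for k in lst[0]}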
def run_batch(self, train_summary):
  """Collect trajectories for a single batch with the manager/worker agent and train (if self.train)."""
  last_obs = self.last_obs
  shapes = (self.n_steps, self.envs.n_envs)
  # First dim of values: index 0 = manager values, index 1 = worker values.
  values = np.zeros(np.concatenate([[2], shapes]), dtype=np.float32)
  rewards = np.zeros(shapes, dtype=np.float32)
  dones = np.zeros(shapes, dtype=np.float32)
  all_obs, all_actions = [], []
  mb_states = self.states  # first: manager state, second: worker state
  s = np.zeros((self.n_steps, self.envs.n_envs, self.d), dtype=np.float32)
  mb_last_c_goals = np.zeros((self.n_steps, self.envs.n_envs, self.c, self.d), dtype=np.float32)
  mb_last_mo = np.zeros((self.n_steps, self.envs.n_envs, self.c, self.d), dtype=np.float32)

  for n in range(self.n_steps):
    (actions, values[:, n, :], states, s[n, :, :],
     self.last_c_goals, self.lc_manager_outputs) = self.agent.step(
        last_obs, self.states, self.last_c_goals, self.lc_manager_outputs)
    actions = mask_unused_argument_samples(actions)

    all_obs.append(last_obs)
    all_actions.append(actions)
    mb_last_c_goals[n, :, :, :] = self.last_c_goals
    mb_last_mo[n, :, :, :] = self.lc_manager_outputs

    pysc2_actions = actions_to_pysc2(actions, size=last_obs['screen'].shape[1:3])
    obs_raw = self.envs.step(pysc2_actions)
    last_obs = self.preproc.preprocess_obs(obs_raw)
    rewards[n, :], dones[n, :] = zip(*[(t.reward, t.last()) for t in obs_raw])
    self.states = states

    for t in obs_raw:
      if t.last():
        self.cumulative_score += self._summarize_episode(t)

  returns, returns_intr, adv_m, adv_w = compute_returns_and_advantages(
      rewards, dones, values, s, mb_last_c_goals[:, :, -1, :],
      self.discount, self.T, self.envs.n_envs, self.c)

  s_diff = compute_sdiff(s, self.c, self.T, self.envs.n_envs, self.d)
  # last_c_goals = compute_last_c_goals(goals, self.envs.n_envs, self.T, self.c, self.d)

  # Only the T steps offset by the goal horizon c are used for training.
  actions = stack_and_flatten_actions(all_actions[self.c:self.c + self.T])
  obs = stack_ndarray_dicts(all_obs)
  obs = {k: obs[k][self.c:self.c + self.T] for k in obs}
  obs = flatten_first_dims_dict(obs)
  returns = flatten_first_dims(returns)
  returns_intr = flatten_first_dims(returns_intr)
  adv_m = flatten_first_dims(adv_m)
  adv_w = flatten_first_dims(adv_w)
  s_diff = flatten_first_dims(s_diff)
  mb_last_c_goals = flatten_first_dims(mb_last_c_goals[self.c:self.c + self.T])
  prep_lc_mo = flatten_first_dims(mb_last_mo[self.c:self.c + self.T])

  self.last_obs = last_obs

  if self.train:
    return self.agent.train(
        obs, mb_states, actions, returns, returns_intr, adv_m, adv_w,
        s_diff, mb_last_c_goals, prep_lc_mo, summary=train_summary)
  else:
    return None
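The compute_sdiff helper is not shown. In manager/worker setups of this kind the manager is typically trained on the change of the latent state over its horizon c, i.e. s_{t+c} - s_t, for the T steps that are actually trained on. The sketch below is an assumption about that helper, including the requirement n_steps >= 2 * c + T; it is not taken from the source.

import numpy as np

def compute_sdiff(s, c, T, n_envs, d):
  # Hypothetical sketch: state differences s_{t+c} - s_t for the trained window.
  # s has shape (n_steps, n_envs, d); the window starts at offset c,
  # so this assumes n_steps >= 2 * c + T.
  s_diff = np.zeros((T, n_envs, d), dtype=np.float32)
  for i in range(T):
    t = c + i
    s_diff[i] = s[t + c] - s[t]
  return s_diff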
def run_batch(self, train_summary=False): """Collect trajectories for a single batch and train (if self.train). Args: train_summary: return a Summary of the training step (losses, etc.). Returns: result: None (if not self.train) or the return value of agent.train. """ nbatch = self.envs.n_envs*self.n_steps assert nbatch % self.nminibatches == 0 nbatch_train = nbatch // self.nminibatches last_obs = self.last_obs shapes = (self.n_steps, self.envs.n_envs) values = np.zeros(shapes, dtype=np.float32) rewards = np.zeros(shapes, dtype=np.float32) dones = np.zeros(shapes, dtype=np.float32) all_actions, all_obs = [], [] mb_states = self.states # save the initial states at the beginning of each mb for later training. for n in range(self.n_steps): actions, values[n,:], states = self.agent.step(last_obs, self.states) # TODO: would be better if we could get the logprobs here instead of having to calls get_log_probs later. actions = mask_unused_argument_samples(actions) all_obs.append(last_obs) all_actions.append(actions) pysc2_actions = actions_to_pysc2(actions, size=last_obs['screen'].shape[1:3]) obs_raw = self.envs.step(pysc2_actions) last_obs = self.preproc.preprocess_obs(obs_raw) rewards[n,:], dones[n,:] = zip(*[(t.reward,t.last()) for t in obs_raw]) self.states = states for t in obs_raw: if t.last(): self.cumulative_score += self._summarize_episode(t) next_values = self.agent.get_value(last_obs, states) returns, advs = compute_returns_and_advs(rewards, dones, values, next_values, self.discount) actions = stack_and_flatten_actions(all_actions) obs = flatten_first_dims_dict(stack_ndarray_dicts(all_obs)) returns = flatten_first_dims(returns) advs = flatten_first_dims(advs) values = flatten_first_dims(values) self.last_obs = last_obs if self.train: mbloss = [] old_log_probs = self.agent.get_log_probs(obs, self.states, actions) if self.states is None: # print('train') inds = np.arange(nbatch) for i in range(self.noptepochs): # print(f'opt {i}') np.random.shuffle(inds) # print('inds', inds) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train # print(f'mb {start}:{end}') mbinds = inds[start:end] mb_obs = { k:obs[k][mbinds] for k in obs } mb_actions = ( actions[0][mbinds], { arg_id : actions[1][arg_id][mbinds] for arg_id in actions[1] } ) # print(returns.shape, advs.shape, old_log_probs.shape) # print(old_log_probs) mbinputs = [a[mbinds] for a in (returns, advs, old_log_probs, values)] _step, _loss, _summary = self.agent.train(mb_obs, mb_states, mb_actions, *mbinputs, summary=train_summary) # print(f'loss {_step}:{_loss}') mbloss.append(_loss) return _step, np.mean(mbloss), _summary else: raise NotImplementedError('No recurrent policy for PPO yet.') else: return None