def stack_and_flatten_actions(lst, axis=0):
    """Stack per-step (fn_id, arg_dict) action samples along the time axis and
    merge the (n_steps, n_envs) leading dimensions into one batch dimension."""
    fn_id_list, arg_dict_list = zip(*lst)
    fn_id = np.stack(fn_id_list, axis=axis)
    fn_id = flatten_first_dims(fn_id)
    arg_ids = stack_ndarray_dicts(arg_dict_list, axis=axis)
    arg_ids = flatten_first_dims_dict(arg_ids)
    return (fn_id, arg_ids)
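# The three helpers below are illustrative sketches, not the project's actual
# implementations: their names are taken from the calls above and their
# behaviour is assumed from how stack_and_flatten_actions uses them
# (stack along the time axis, then merge the (n_steps, n_envs) dims).
# numpy is assumed to be imported as `np`, as in the surrounding module.
def flatten_first_dims(x):
    """Merge the leading two dims of `x` into a single batch dimension."""
    new_shape = (x.shape[0] * x.shape[1],) + x.shape[2:]
    return x.reshape(*new_shape)


def flatten_first_dims_dict(d):
    """Apply flatten_first_dims to every array in a dict."""
    return {k: flatten_first_dims(v) for k, v in d.items()}


def stack_ndarray_dicts(lst, axis=0):
    """Stack a list of dicts of ndarrays key-wise along `axis`."""
    return {k: np.stack([d[k] for d in lst], axis=axis) for k in lst[0]}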
def run_batch(self, train_summary=False):
    """Collect trajectories for a single batch and train (if self.train).

    Args:
      train_summary: return a Summary of the training step (losses, etc.).

    Returns:
      result: None (if not self.train) or the return value of agent.train.
    """
    shapes = (self.n_steps, self.envs.n_envs)
    values = np.zeros(shapes, dtype=np.float32)
    rewards = np.zeros(shapes, dtype=np.float32)
    dones = np.zeros(shapes, dtype=np.float32)
    all_obs = []
    all_actions = []
    all_scores = []

    last_obs = self.last_obs

    for n in range(self.n_steps):
        actions, value_estimate = self.agent.step(last_obs)
        actions = mask_unused_argument_samples(actions)
        size = last_obs['screen'].shape[1:3]

        values[n, :] = value_estimate
        all_obs.append(last_obs)
        all_actions.append(actions)

        pysc2_actions = actions_to_pysc2(actions, size)
        obs_raw = self.envs.step(pysc2_actions)
        last_obs = self.preproc.preprocess_obs(obs_raw)
        rewards[n, :] = [t.reward for t in obs_raw]
        dones[n, :] = [t.last() for t in obs_raw]

        for t in obs_raw:
            if t.last():
                score = self._summarize_episode(t)
                self.cumulative_score += score

    self.last_obs = last_obs
    next_values = self.agent.get_value(last_obs)

    returns, advs = compute_returns_advantages(
        rewards, dones, values, next_values, self.discount)

    actions = stack_and_flatten_actions(all_actions)
    obs = flatten_first_dims_dict(stack_ndarray_dicts(all_obs))
    returns = flatten_first_dims(returns)
    advs = flatten_first_dims(advs)

    if self.train:
        return self.agent.train(
            obs, actions, returns, advs, summary=train_summary)

    return None
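# Sketch of the return/advantage computation run_batch relies on. This is an
# assumption about compute_returns_advantages' contract (plain discounted
# n-step returns bootstrapped from next_values, advantages = returns - values);
# the project's actual version may differ, e.g. by using GAE.
def compute_returns_advantages(rewards, dones, values, next_values, discount):
    """rewards, dones, values: (n_steps, n_envs); next_values: (n_envs,)."""
    returns = np.zeros_like(rewards)
    running = next_values
    for t in reversed(range(rewards.shape[0])):
        # Zero the bootstrap term for envs whose episode ended at step t.
        running = rewards[t] + discount * (1.0 - dones[t]) * running
        returns[t] = running
    advs = returns - values
    return returns, advs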
def run_batch(self, total_frames, train_summary=False, lstm=False):
    """Collect trajectories for a single batch and train (if self.train).

    Args:
      total_frames: running count of environment batch steps; incremented here.
      train_summary: return a Summary of the training step (losses, etc.).
      lstm: if True, carry recurrent state across batches via self.lstm_states.

    Returns:
      result: None (if not self.train) or the return value of agent.train.
      total_frames: the updated frame counter.
    """
    shapes = (self.n_steps, self.envs.n_envs)
    values = np.zeros(shapes, dtype=np.float32)
    rewards = np.zeros(shapes, dtype=np.float32)
    dones = np.zeros(shapes, dtype=np.float32)
    all_obs = []
    all_actions = []
    all_scores = []  # TODO: Unused local var?

    last_obs = self.last_obs
    lstm_states = self.lstm_states if lstm else None  # XXX reset?

    for n in range(self.n_steps):
        actions, value_estimate, lstm_states = self.agent.step(
            last_obs, lstm_states)
        actions, masked_actions = mask_unavailable_samples(actions, last_obs)
        actions = mask_unused_argument_samples(actions)
        size = last_obs['screen'].shape[1:3]

        values[n, :] = value_estimate
        all_obs.append(last_obs)
        all_actions.append(actions)

        # XXX Use masked samples
        pysc2_actions = actions_to_pysc2(masked_actions, size)
        obs_raw = self.envs.step(pysc2_actions)
        last_obs = self.preproc.preprocess_obs(obs_raw)
        rewards[n, :] = [t.reward for t in obs_raw]
        dones[n, :] = [t.last() for t in obs_raw]

        # episode summary
        for i, t in enumerate(obs_raw):
            if t.last():
                score = self._summarize_episode(t, total_frames, worker_id=i)
                self.cumulative_score += score
                self.mean_score += score
                self.episode_last[i] = t.last()

        # mean and best scores summary
        if all(self.episode_last):
            self._summarize_best_and_mean(total_frames)

        total_frames += 1

    self.last_obs = last_obs
    self.lstm_states = lstm_states
    next_values = self.agent.get_value(last_obs, lstm_states)

    returns, advs = compute_returns_advantages(
        rewards, dones, values, next_values, self.discount)

    actions = stack_and_flatten_actions(all_actions)
    obs = flatten_first_dims_dict(stack_ndarray_dicts(all_obs))
    returns = flatten_first_dims(returns)
    advs = flatten_first_dims(advs)

    if self.train:
        result = self.agent.train(
            obs, actions, returns, advs, total_frames,
            summary=train_summary, lstm_states=lstm_states)
        return result, total_frames

    return None, total_frames
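# Hedged usage sketch: one way an outer training loop might drive the LSTM
# variant of run_batch above. Everything except the run_batch call signature
# (the runner construction, max_frames, summary_interval, summary_writer) is a
# hypothetical illustration, not code from the project.
total_frames = 0
while total_frames < max_frames:
    write_summary = total_frames % summary_interval == 0
    result, total_frames = runner.run_batch(
        total_frames, train_summary=write_summary, lstm=True)
    if write_summary and result is not None:
        summary_writer.add_summary(result, total_frames)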