Example #1
def stack_and_flatten_actions(lst, axis=0):
    """Stack per-step (fn_id, arg_dict) action tuples and flatten the leading (step, env) dims."""
    fn_id_list, arg_dict_list = zip(*lst)
    fn_id = np.stack(fn_id_list, axis=axis)
    fn_id = flatten_first_dims(fn_id)
    arg_ids = stack_ndarray_dicts(arg_dict_list, axis=axis)
    arg_ids = flatten_first_dims_dict(arg_ids)
    return (fn_id, arg_ids)
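
Example #1 leans on a few array helpers that are not shown in these snippets. The sketch below is a minimal, hypothetical implementation of what stack_ndarray_dicts, flatten_first_dims, and flatten_first_dims_dict would need to do for the code above to work (stacking per-step dicts of arrays and merging the leading step/env dimensions); the repository's actual versions may differ.

import numpy as np

def stack_ndarray_dicts(lst, axis=0):
    # Stack a list of {key: ndarray} dicts into a single dict of stacked arrays.
    return {k: np.stack([d[k] for d in lst], axis=axis) for k in lst[0]}

def flatten_first_dims(x):
    # Merge the leading (n_steps, n_envs) dims into one batch dimension.
    new_shape = [x.shape[0] * x.shape[1]] + list(x.shape[2:])
    return x.reshape(*new_shape)

def flatten_first_dims_dict(x):
    return {k: flatten_first_dims(v) for k, v in x.items()}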
Example #2
    def run_batch(self, train_summary=False):
        """Collect trajectories for a single batch and train (if self.train).

    Args:
      train_summary: return a Summary of the training step (losses, etc.).

    Returns:
      result: None (if not self.train) or the return value of agent.train.
    """
        shapes = (self.n_steps, self.envs.n_envs)
        values = np.zeros(shapes, dtype=np.float32)
        rewards = np.zeros(shapes, dtype=np.float32)
        dones = np.zeros(shapes, dtype=np.float32)
        all_obs = []
        all_actions = []
        all_scores = []

        last_obs = self.last_obs

        for n in range(self.n_steps):
            actions, value_estimate = self.agent.step(last_obs)
            actions = mask_unused_argument_samples(actions)
            size = last_obs['screen'].shape[1:3]

            values[n, :] = value_estimate
            all_obs.append(last_obs)
            all_actions.append(actions)

            pysc2_actions = actions_to_pysc2(actions, size)
            obs_raw = self.envs.step(pysc2_actions)
            last_obs = self.preproc.preprocess_obs(obs_raw)
            rewards[n, :] = [t.reward for t in obs_raw]
            dones[n, :] = [t.last() for t in obs_raw]

            for t in obs_raw:
                if t.last():
                    score = self._summarize_episode(t)
                    self.cumulative_score += score

        self.last_obs = last_obs

        next_values = self.agent.get_value(last_obs)

        returns, advs = compute_returns_advantages(rewards, dones, values,
                                                   next_values, self.discount)

        actions = stack_and_flatten_actions(all_actions)
        obs = flatten_first_dims_dict(stack_ndarray_dicts(all_obs))
        returns = flatten_first_dims(returns)
        advs = flatten_first_dims(advs)

        if self.train:
            return self.agent.train(obs,
                                    actions,
                                    returns,
                                    advs,
                                    summary=train_summary)

        return None
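
Both runners also call compute_returns_advantages, which is defined elsewhere in the repository. A minimal sketch, assuming it computes standard n-step discounted returns bootstrapped from next_values (with dones cutting the bootstrap at episode boundaries) and advantages as returns minus the value estimates; the actual implementation may differ:

import numpy as np

def compute_returns_advantages(rewards, dones, values, next_values, discount):
    # rewards, dones, values: [n_steps, n_envs]; next_values: [n_envs].
    returns = np.zeros_like(rewards)
    future = next_values
    for t in reversed(range(rewards.shape[0])):
        # Zero the bootstrap where the episode ended at step t.
        future = rewards[t] + discount * (1.0 - dones[t]) * future
        returns[t] = future
    advs = returns - values
    return returns, advs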
Example #3
    def run_batch(self, total_frames, train_summary=False, lstm=False):
        """Collect trajectories for a single batch and train (if self.train).

    Args:
      train_summary: return a Summary of the training step (losses, etc.).

    Returns:
      result: None (if not self.train) or the return value of agent.train.
    """
        shapes = (self.n_steps, self.envs.n_envs)
        values = np.zeros(shapes, dtype=np.float32)
        rewards = np.zeros(shapes, dtype=np.float32)
        dones = np.zeros(shapes, dtype=np.float32)
        all_obs = []
        all_actions = []
        all_scores = []  # TODO: Unused local var?

        last_obs = self.last_obs
        lstm_states = self.lstm_states if lstm else None  # XXX reset?

        for n in range(self.n_steps):
            actions, value_estimate, lstm_states = self.agent.step(
                last_obs, lstm_states)
            actions, masked_actions = mask_unavailable_samples(
                actions, last_obs)
            actions = mask_unused_argument_samples(actions)
            size = last_obs['screen'].shape[1:3]

            values[n, :] = value_estimate
            all_obs.append(last_obs)
            all_actions.append(actions)

            pysc2_actions = actions_to_pysc2(masked_actions,
                                             size)  # XXX Use masked samples
            obs_raw = self.envs.step(pysc2_actions)
            last_obs = self.preproc.preprocess_obs(obs_raw)
            rewards[n, :] = [t.reward for t in obs_raw]
            dones[n, :] = [t.last() for t in obs_raw]

            # episode summary
            for i, t in enumerate(obs_raw):
                if t.last():
                    score = self._summarize_episode(t,
                                                    total_frames,
                                                    worker_id=i)
                    self.cumulative_score += score
                    self.mean_score += score
                    self.episode_last[i] = t.last()

            # mean and best scores summary
            if all(self.episode_last):
                self._summarize_best_and_mean(total_frames)
            total_frames += 1

        self.last_obs = last_obs
        self.lstm_states = lstm_states
        next_values = self.agent.get_value(last_obs, lstm_states)

        returns, advs = compute_returns_advantages(rewards, dones, values,
                                                   next_values, self.discount)

        actions = stack_and_flatten_actions(all_actions)
        obs = flatten_first_dims_dict(stack_ndarray_dicts(all_obs))
        returns = flatten_first_dims(returns)
        advs = flatten_first_dims(advs)

        if self.train:
            return self.agent.train(obs,
                                    actions,
                                    returns,
                                    advs,
                                    total_frames,
                                    summary=train_summary,
                                    lstm_states=lstm_states)

        return None, total_frames
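
For reference, a small worked example of the batching convention both runners rely on: each of the n_steps collected observations is a dict of [n_envs, ...] arrays, which gets stacked to [n_steps, n_envs, ...] and then flattened to a single [n_steps * n_envs, ...] training batch. The shapes and the 'screen' feature layout below are illustrative only, and the helpers come from the sketch after Example #1.

import numpy as np

n_steps, n_envs = 16, 4
# One observation dict per step; the screen shape is made up for illustration.
all_obs = [{'screen': np.zeros((n_envs, 32, 32, 17), dtype=np.float32)}
           for _ in range(n_steps)]

obs = flatten_first_dims_dict(stack_ndarray_dicts(all_obs))
print(obs['screen'].shape)  # (64, 32, 32, 17), i.e. (n_steps * n_envs, ...)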