Example #1
    def run_n_steps(self, num_steps, max_env=None):
        import numpy as np
        import tensorflow as tf

        # optionally cap the environment's episode length
        if max_env is not None:
            self.env.__num_steps = max_env

        # clear previously collected data and reset the environment
        self.reset_data()
        state = self.env.reset()
        step = 0

        while step < num_steps:
            # start a new episode
            done = False
            new_state = self.env.reset()
            episode_rewards = []
            while not done:
                state = new_state
                agent_out = self.agent.act_experience(
                    np.expand_dims(state, axis=0), self.return_log_prob)
                # S: current state
                self.data_agg["state"].append(state)
                if self.return_feature_state:
                    self.data_agg["feature_state"].append(
                        self.agent.get_state())
                # A: action chosen by the agent
                action = agent_out["action"]
                if tf.is_tensor(action):
                    action = action.numpy()
                if self.discrete_env:
                    action = int(action)
                elif action.shape == ():
                    action = np.expand_dims(action, 0)
                new_state, reward, done, _ = self.env.step(action)
                self.data_agg["action"].append(action)
                # R: reward from the environment
                self.data_agg["reward"].append(reward)
                episode_rewards.append(reward)
                # S+1: next state
                self.data_agg["state_new"].append(new_state)
                # not_done flag: 1.0 while the episode continues, 0.0 on the terminal transition
                self.data_agg["not_done"].append(float(not done))

                # append optional per-timestep values to the collected data
                if self.return_log_prob:
                    self.data_agg["log_prob"].append(
                        agent_out["log_probability"])
                if self.return_value_estimate:
                    self.data_agg["value_estimate"].append(
                        agent_out["value_estimate"])

                step += 1
                # stop once the requested number of steps is reached,
                # even if the current episode has not finished
                if step == num_steps:
                    break

            if self.return_monte_carlo:
                # discounted reward-to-go for the episode collected above
                self.data_agg["monte_carlo"].extend(
                    discount_cumsum(episode_rewards, self.gamma))

        return self.data_agg, self.runner_position
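
Both examples call a discount_cumsum helper that is not shown on this page. For reference, below is a minimal sketch of the conventional discounted reward-to-go such a helper usually computes; the project's actual implementation is not shown and may differ (a common variant uses scipy.signal.lfilter instead of an explicit loop).

import numpy as np

def discount_cumsum(rewards, gamma):
    """out[t] = sum_k gamma**k * rewards[t + k] (discounted reward-to-go)."""
    out = np.zeros(len(rewards))
    running = 0.0
    # walk backwards so each entry reuses the already-discounted tail
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out

For example, discount_cumsum([1.0, 1.0, 1.0], 0.5) returns [1.75, 1.5, 1.0].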
Example #2
    def run_n_episodes(self, num_episodes, max_env=None):
        import numpy as np
        import tensorflow as tf

        if max_env is not None:
            self.env.__num_steps = max_env

        state = self.env.reset()
        for e in range(num_episodes):
            done = False
            new_state = self.env.reset()
            while not done:
                state = new_state
                agent_out = self.agent.act_experience(
                    np.expand_dims(state, axis=0), self.return_log_prob)

                # S: current state
                self.data_agg["state"].append(state)
                # A: action chosen by the agent
                action = agent_out["action"]
                if tf.is_tensor(action):
                    action = action.numpy()
                # discrete environments expect plain integer actions
                if self.discrete_env:
                    action = int(action)
                new_state, reward, done, info = self.env.step(action)
                self.data_agg["action"].append(action)
                # R: reward from the environment
                self.data_agg["reward"].append(reward)
                # S+1: next state
                self.data_agg["state_new"].append(new_state)
                # not_done flag: 1 while the episode continues, 0 on the terminal transition
                self.data_agg["not_done"].append(int(not done))

                # append optional per-timestep values to the collected data
                if self.return_log_prob:
                    self.data_agg["log_prob"].append(
                        agent_out["log_probability"])
                if self.return_value_estimate:
                    self.data_agg["value_estimate"].append(
                        agent_out["value_estimate"])

        if self.return_monte_carlo:
            # note: unlike run_n_steps, discounting here runs over the
            # concatenated rewards of all collected episodes
            self.data_agg["monte_carlo"] = discount_cumsum(
                self.data_agg["reward"], self.gamma)

        return self.data_agg, self.runner_position
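
One behavioural difference between the two methods is worth noting: run_n_steps discounts each episode's rewards separately, while run_n_episodes applies discount_cumsum once to the concatenated rewards of all collected episodes, so discounted returns spill across episode boundaries. The small self-contained comparison below (reusing the reward-to-go sketch above, with made-up rewards) illustrates the effect.

import numpy as np

def discount_cumsum(rewards, gamma):
    out, running = np.zeros(len(rewards)), 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out

gamma = 0.9
episode_a = [1.0, 1.0]  # two short, made-up episodes
episode_b = [1.0, 1.0]

# per-episode discounting, as in run_n_steps: returns reset at the boundary
per_episode = np.concatenate(
    [discount_cumsum(episode_a, gamma), discount_cumsum(episode_b, gamma)])

# discounting over the concatenated rewards, as in run_n_episodes
concatenated = discount_cumsum(episode_a + episode_b, gamma)

print(per_episode)   # [1.9, 1.0, 1.9, 1.0]
print(concatenated)  # [3.439, 2.71, 1.9, 1.0]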