Example #1
    def __init__(self, base_folder: str, models_folder_name: str,
                 load_pi_predict_model: bool, load_v_predict_model: bool,
                 save_episodes: bool, save_episodes_folder: str,
                 save_gifs: bool, gifs_folder_name: str,
                 save_pi_predict_models: bool, save_v_predict_models: bool,
                 run_indefinitely: bool, max_nb_episodes: int,
                 use_keras_gym_train_monitor: bool):
        """It makes little sense to have both save_episodes and save_gifs set to True, since episodes can be watched
        (in better quality, even though the files are smaller) using WatchReplay.py."""

        self.base_folder = base_folder
        self.models_folder_name = models_folder_name
        self.save_episodes = save_episodes
        self.save_episodes_folder = save_episodes_folder
        self.should_save_gifs = save_gifs
        self.gifs_folder_name = gifs_folder_name
        self.should_save_pi_predict_models = save_pi_predict_models
        self.should_save_v_predict_models = save_v_predict_models
        self.run_indefinitely = run_indefinitely
        self.max_nb_episodes = max_nb_episodes
        self.use_keras_gym_train_monitor = use_keras_gym_train_monitor

        self.models_folder = os.path.join(self.base_folder, self.models_folder_name)
        self.gifs_folder = os.path.join(self.base_folder, self.gifs_folder_name)

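        # create the output folders up front so later model/GIF saves don't fail on a missing directory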
        if save_episodes_folder and not os.path.exists(save_episodes_folder):
            os.makedirs(save_episodes_folder)
        if models_folder_name and not os.path.exists(self.models_folder):
            os.makedirs(self.models_folder)
        if save_gifs and self.gifs_folder and not os.path.exists(self.gifs_folder):
            os.makedirs(self.gifs_folder)

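        # Riverraid wrapped with keras-gym preprocessing: resized (optionally grayscale) frames, stacked into a short history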
        self.env = gym.make('Riverraid-v0')
        self.env = km.wrappers.ImagePreprocessor(self.env, height=RL_PREPROCESS_HEIGHT, width=RL_PREPROCESS_WIDTH,
                                                 grayscale=RL_PREPROCESS_GRAYSCALE)
        self.env = km.wrappers.FrameStacker(self.env, num_frames=RL_PREPROCESS_NUM_FRAMES)
        if use_keras_gym_train_monitor:
            self.env = km.wrappers.TrainMonitor(self.env)

        # show logs from TrainMonitor
        km.enable_logging()

        # function approximators
        self.func = km.predefined.AtariFunctionApproximator(self.env)
        self.pi = km.SoftmaxPolicy(self.func, update_strategy=RL_PI_UPDATE_STRATEGY)  # PPO

        self.v = km.V(self.func, gamma=RLTrainer.GAMMA,
                      bootstrap_with_target_model=RLTrainer.BOOTSTRAP_WITH_TARGET_MODEL,
                      bootstrap_n=RLTrainer.BOOTSTRAP_N)

        self.actor_critic = km.ActorCritic(self.pi, self.v)

        # we'll use this to temporarily store our experience
        self.buffer = km.caching.ExperienceReplayBuffer.from_value_function(
            value_function=self.v, capacity=RLTrainer.BUFFER_CAPACITY, batch_size=RLTrainer.BUFFER_BATCH_SIZE)

        if load_pi_predict_model:
            self.load_pi_predict_model_weights()
        if load_v_predict_model:
            self.load_v_predict_model_weights()
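A hypothetical call to this constructor; the class name RLTrainer is inferred from the class-level constants referenced above, and every folder name and the episode count below are placeholders:

trainer = RLTrainer(base_folder='runs/riverraid', models_folder_name='models',
                    load_pi_predict_model=False, load_v_predict_model=False,
                    save_episodes=False, save_episodes_folder='',
                    save_gifs=True, gifs_folder_name='gifs',
                    save_pi_predict_models=True, save_v_predict_models=True,
                    run_indefinitely=False, max_nb_episodes=1000,
                    use_keras_gym_train_monitor=True)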
Example #2
def test_atari_ppo():
    # env with preprocessing
    env = gym.make('PongDeterministic-v4')
    env = km.wrappers.ImagePreprocessor(env,
                                        height=105,
                                        width=80,
                                        grayscale=True)
    env = km.wrappers.FrameStacker(env, num_frames=3)
    env = km.wrappers.TrainMonitor(env)

    # show logs from TrainMonitor
    km.enable_logging()

    func = Func(env, lr=0.00025)
    pi = km.SoftmaxPolicy(function_approximator=func, update_strategy='ppo')
    v = km.V(function_approximator=func,
             gamma=0.99,
             bootstrap_n=10,
             bootstrap_with_target_model=True)
    actor_critic = km.ActorCritic(pi, v)

    # we'll use this to temporarily store our experience
    buffer = km.caching.ExperienceReplayBuffer.from_value_function(
        value_function=v, capacity=256, batch_size=64)

    # run episodes
    while env.T < 500000:
        s = env.reset()

        for t in range(env.spec.max_episode_steps):
            a = pi(s, use_target_model=True)  # target_model == pi_old
            s_next, r, done, info = env.step(a)

            buffer.add(s, a, r, done, env.ep)

            if len(buffer) >= buffer.capacity:
                # use 4 epochs per round
                num_batches = int(4 * buffer.capacity / buffer.batch_size)
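                # with capacity=256 and batch_size=64 this amounts to 4 * 256 / 64 = 16 minibatch updates per round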
                for _ in range(num_batches):
                    actor_critic.batch_update(*buffer.sample())
                buffer.clear()

                # soft update (tau=1 would be a hard update)
                actor_critic.sync_target_model(tau=0.1)

            if done:
                break

            s = s_next

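        # stop early once an episode ends with a positive return, i.e. the agent out-scored the opponent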
        if env.G > 0:
            break

    assert env.T < 500000, "test_atari_ppo didn't converge"
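Func here is a km.FunctionApproximator subclass defined elsewhere in the test module (Example #1 uses the library's km.predefined.AtariFunctionApproximator instead). A rough sketch of what such an Atari body could look like, following the body() pattern shown in Example #3 and assuming the same keras/K imports; the layer sizes are purely illustrative:

class Func(km.FunctionApproximator):
    def body(self, X):
        # scale the stacked uint8 frames into [0, 1] floats before the conv stack
        X = keras.layers.Lambda(lambda x: K.cast(x, 'float32') / 255.)(X)
        X = keras.layers.Conv2D(filters=16, kernel_size=8, strides=4, activation='relu')(X)
        X = keras.layers.Conv2D(filters=32, kernel_size=4, strides=2, activation='relu')(X)
        X = keras.layers.Flatten()(X)
        X = keras.layers.Dense(units=256, activation='relu')(X)
        return X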
Example #3
###############################################################################


class MLP(km.FunctionApproximator):
    def body(self, X):
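        # augment the raw state with its elementwise squares before the dense layers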
        X = keras.layers.Lambda(
            lambda x: K.concatenate([x, K.square(x)], axis=1))(X)
        X = keras.layers.Dense(units=6, activation='tanh')(X)
        X = keras.layers.Dense(units=6, activation='tanh')(X)
        return X


mlp = MLP(env, lr=1e-3)
pi = km.GaussianPolicy(mlp, update_strategy='ppo')
v = km.V(mlp, gamma=0.9, bootstrap_n=5)
ac = km.ActorCritic(pi, v)

buffer = km.caching.ExperienceReplayBuffer.from_value_function(
    value_function=v, capacity=512, batch_size=32)

###############################################################################
# run
###############################################################################

while env.T < 1000000:
    s = env.reset()
    for t in range(env.spec.max_episode_steps):
        a = pi(s, use_target_model=True)
        s_next, r, done, info = env.step(a)

        buffer.add(s, a, r, done, env.ep)
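The snippet is cut off at this point; a sketch of how the inner loop presumably continues, mirroring the PPO update pattern of Example #2 (the four-epochs-per-round heuristic and tau=0.1 are carried over from there as assumptions):

        # remainder of the inner loop, assumed to follow Example #2
        if len(buffer) >= buffer.capacity:
            num_batches = int(4 * buffer.capacity / buffer.batch_size)
            for _ in range(num_batches):
                ac.batch_update(*buffer.sample())
            buffer.clear()
            ac.sync_target_model(tau=0.1)

        if done:
            break

        s = s_next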
Example #4
        '''Load a trained model from the given path.'''
        return load_model(path)


if __name__ == "__main__":
    """agent=REINFORCE(env)
    agent.train(100)
    import matplotlib.pyplot as plt
    import math"""

    env = KSPPilot()
    function_approximator = MLP(env, lr=0.1)
    pi = km.SoftmaxPolicy(function_approximator, update_strategy='vanilla')
    v = km.V(function_approximator, gamma=0.9, bootstrap_n=1)
    # combine them into a single actor-critic
    actor_critic = km.ActorCritic(pi, v)
    for ep in range(100):
        s = env.reset()

        for t in range(10000):
            a = pi(s, use_target_model=True)
            s_next, r, done, info = env.step(a)

            # small incentive to keep moving
            if np.array_equal(s_next, s):
                r = -0.1

            actor_critic.update(s, a, r, done)

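            # hard-sync pi_old (the target model) every other step; tau=1.0 copies the weights outright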
            if t % 2 == 0:
                pi.sync_target_model(tau=1.0)