def __init__(self, base_folder: str, models_folder_name: str, load_pi_predict_model: bool,
             load_v_predict_model: bool, save_episodes: bool, save_episodes_folder: str,
             save_gifs: bool, gifs_folder_name: str, save_pi_predict_models: bool,
             save_v_predict_models: bool, run_indefinitely: bool, max_nb_episodes: int,
             use_keras_gym_train_monitor: bool):
    """It makes little sense to have both save_episodes and save_gifs set to True,
    since saved episodes can be watched in better quality (and with smaller files)
    using WatchReplay.py."""
    self.base_folder = base_folder
    self.models_folder_name = models_folder_name
    self.save_episodes = save_episodes
    self.save_episodes_folder = save_episodes_folder
    self.should_save_gifs = save_gifs
    self.gifs_folder_name = gifs_folder_name
    self.should_save_pi_predict_models = save_pi_predict_models
    self.should_save_v_predict_models = save_v_predict_models
    self.run_indefinitely = run_indefinitely
    self.max_nb_episodes = max_nb_episodes
    self.use_keras_gym_train_monitor = use_keras_gym_train_monitor

    self.models_folder = os.path.join(self.base_folder, self.models_folder_name)
    self.gifs_folder = os.path.join(self.base_folder, self.gifs_folder_name)

    # create the output folders if they don't exist yet
    if save_episodes_folder and not os.path.exists(save_episodes_folder):
        os.makedirs(save_episodes_folder)
    if models_folder_name and not os.path.exists(self.models_folder):
        os.makedirs(self.models_folder)
    if save_gifs and self.gifs_folder and not os.path.exists(self.gifs_folder):
        os.makedirs(self.gifs_folder)

    # environment with image preprocessing and frame stacking
    self.env = gym.make('Riverraid-v0')
    self.env = km.wrappers.ImagePreprocessor(
        self.env, height=RL_PREPROCESS_HEIGHT, width=RL_PREPROCESS_WIDTH,
        grayscale=RL_PREPROCESS_GRAYSCALE)
    self.env = km.wrappers.FrameStacker(self.env, num_frames=RL_PREPROCESS_NUM_FRAMES)
    if use_keras_gym_train_monitor:
        self.env = km.wrappers.TrainMonitor(self.env)

    # show logs from TrainMonitor
    km.enable_logging()

    # function approximators (policy and value function share the same network body)
    self.func = km.predefined.AtariFunctionApproximator(self.env)
    self.pi = km.SoftmaxPolicy(self.func, update_strategy=RL_PI_UPDATE_STRATEGY)  # PPO
    self.v = km.V(self.func, gamma=RLTrainer.GAMMA,
                  bootstrap_with_target_model=RLTrainer.BOOTSTRAP_WITH_TARGET_MODEL,
                  bootstrap_n=RLTrainer.BOOTSTRAP_N)
    self.actor_critic = km.ActorCritic(self.pi, self.v)

    # we'll use this to temporarily store our experience
    self.buffer = km.caching.ExperienceReplayBuffer.from_value_function(
        value_function=self.v, capacity=RLTrainer.BUFFER_CAPACITY,
        batch_size=RLTrainer.BUFFER_BATCH_SIZE)

    # optionally resume from previously saved weights
    if load_pi_predict_model:
        self.load_pi_predict_model_weights()
    if load_v_predict_model:
        self.load_v_predict_model_weights()
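# ---------------------------------------------------------------------------
# Usage sketch (hypothetical): every argument value below is a placeholder
# chosen for illustration; only the parameter names come from __init__ above.
# ---------------------------------------------------------------------------
trainer = RLTrainer(
    base_folder='output',
    models_folder_name='models',
    load_pi_predict_model=False,
    load_v_predict_model=False,
    save_episodes=False,
    save_episodes_folder='episodes',
    save_gifs=False,
    gifs_folder_name='gifs',
    save_pi_predict_models=True,
    save_v_predict_models=True,
    run_indefinitely=False,
    max_nb_episodes=1000,
    use_keras_gym_train_monitor=True,
)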
def test_atari_ppo():
    # env with preprocessing
    env = gym.make('PongDeterministic-v4')
    env = km.wrappers.ImagePreprocessor(env, height=105, width=80, grayscale=True)
    env = km.wrappers.FrameStacker(env, num_frames=3)
    env = km.wrappers.TrainMonitor(env)

    # show logs from TrainMonitor
    km.enable_logging()

    func = Func(env, lr=0.00025)
    pi = km.SoftmaxPolicy(function_approximator=func, update_strategy='ppo')
    v = km.V(function_approximator=func, gamma=0.99, bootstrap_n=10,
             bootstrap_with_target_model=True)
    actor_critic = km.ActorCritic(pi, v)

    # we'll use this to temporarily store our experience
    buffer = km.caching.ExperienceReplayBuffer.from_value_function(
        value_function=v, capacity=256, batch_size=64)

    # run episodes
    while env.T < 500000:
        s = env.reset()

        for t in range(env.spec.max_episode_steps):
            a = pi(s, use_target_model=True)  # target_model == pi_old
            s_next, r, done, info = env.step(a)

            buffer.add(s, a, r, done, env.ep)

            if len(buffer) >= buffer.capacity:
                # use 4 epochs per round
                num_batches = int(4 * buffer.capacity / buffer.batch_size)
                for _ in range(num_batches):
                    actor_critic.batch_update(*buffer.sample())
                buffer.clear()

                # soft update (tau=1 would be a hard update)
                actor_critic.sync_target_model(tau=0.1)

            if done:
                break

            s = s_next

        if env.G > 0:
            break

    assert env.T < 500000, "test_atari_ppo didn't converge"
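# ---------------------------------------------------------------------------
# The test above assumes a `Func` function approximator defined elsewhere.
# Below is a minimal sketch of what such a class could look like, following
# the same km.FunctionApproximator `body` pattern as the MLP example further
# down; the layer sizes and the normalisation Lambda are illustrative
# assumptions, not the project's actual definition.
# ---------------------------------------------------------------------------
from tensorflow import keras
from tensorflow.keras import backend as K

import keras_gym as km


class Func(km.FunctionApproximator):
    """Small Atari-style conv net (illustrative layer sizes)."""

    def body(self, S):
        # scale the stacked uint8 frames to [0, 1]
        X = keras.layers.Lambda(lambda x: K.cast(x, 'float32') / 255.)(S)
        X = keras.layers.Conv2D(filters=16, kernel_size=8, strides=4, activation='relu')(X)
        X = keras.layers.Conv2D(filters=32, kernel_size=4, strides=2, activation='relu')(X)
        X = keras.layers.Flatten()(X)
        X = keras.layers.Dense(units=256, activation='relu')(X)
        return X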
###############################################################################

class MLP(km.FunctionApproximator):
    def body(self, X):
        X = keras.layers.Lambda(
            lambda x: K.concatenate([x, K.square(x)], axis=1))(X)
        X = keras.layers.Dense(units=6, activation='tanh')(X)
        X = keras.layers.Dense(units=6, activation='tanh')(X)
        return X


mlp = MLP(env, lr=1e-3)
pi = km.GaussianPolicy(mlp, update_strategy='ppo')
v = km.V(mlp, gamma=0.9, bootstrap_n=5)
ac = km.ActorCritic(pi, v)

buffer = km.caching.ExperienceReplayBuffer.from_value_function(
    value_function=v, capacity=512, batch_size=32)


###############################################################################
# run
###############################################################################

while env.T < 1000000:
    s = env.reset()

    for t in range(env.spec.max_episode_steps):
        a = pi(s, use_target_model=True)
        s_next, r, done, info = env.step(a)

        buffer.add(s, a, r, done, env.ep)
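        # -------------------------------------------------------------------
        # The run loop above is cut off after buffer.add(...). Presumably it
        # finishes the same way as the Atari PPO test above: batched PPO
        # updates once the buffer is full, then a soft target-model sync.
        # The sketch below mirrors that loop; the epoch count and tau value
        # are carried over from the test as assumptions, not values confirmed
        # for this script.
        # -------------------------------------------------------------------
        if len(buffer) >= buffer.capacity:
            num_batches = int(4 * buffer.capacity / buffer.batch_size)  # 4 epochs per round
            for _ in range(num_batches):
                ac.batch_update(*buffer.sample())
            buffer.clear()
            ac.sync_target_model(tau=0.1)  # soft update of the target (old) networks

        if done:
            break

        s = s_next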
    '''loads a trained model from path'''
    return load_model(path)


if __name__ == "__main__":
    # commented-out REINFORCE variant:
    # agent = REINFORCE(env)
    # agent.train(100)
    # import matplotlib.pyplot as plt
    # import math
    env = KSPPilot()

    function_approximator = MLP(env, lr=0.1)
    pi = km.SoftmaxPolicy(function_approximator, update_strategy='vanilla')
    v = km.V(function_approximator, gamma=0.9, bootstrap_n=1)

    # combine them into a single actor-critic
    actor_critic = km.ActorCritic(pi, v)

    for ep in range(100):
        s = env.reset()

        for t in range(10000):
            a = pi(s, use_target_model=True)
            s_next, r, done, info = env.step(a)

            # small incentive to keep moving
            if np.array_equal(s_next, s):
                r = -0.1

            actor_critic.update(s, a, r, done)

            if t % 2 == 0:
                pi.sync_target_model(tau=1.0)
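    # -----------------------------------------------------------------------
    # Evaluation sketch (an assumption, not part of the original script):
    # after training, roll the current policy out for one more episode.
    # pi(s) samples an action from the live (non-target) policy network, the
    # same call used in the training loop but without use_target_model=True.
    # -----------------------------------------------------------------------
    s = env.reset()
    for t in range(10000):
        a = pi(s)
        s, r, done, info = env.step(a)
        if done:
            break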