def __init__(self, base_folder: str, models_folder_name: str, load_pi_predict_model: bool,
             load_v_predict_model: bool, save_episodes: bool, save_episodes_folder: str,
             save_gifs: bool, gifs_folder_name: str, save_pi_predict_models: bool,
             save_v_predict_models: bool, run_indefinitely: bool, max_nb_episodes: int,
             use_keras_gym_train_monitor: bool):
    """Store the run configuration, create output folders and build the learner.

    It makes little sense to have both save_episodes and save_gifs set to
    True, since episodes can be watched (in better quality, even though the
    files are smaller) using WatchReplay.py.

    Args:
        base_folder: Parent directory that the models and gifs folders are
            joined onto.
        models_folder_name: Sub-folder of ``base_folder`` for models; the
            folder is created when this name is non-empty.
        load_pi_predict_model: When true, ``load_pi_predict_model_weights()``
            is called once everything else is constructed.
        load_v_predict_model: When true, ``load_v_predict_model_weights()``
            is called once everything else is constructed.
        save_episodes: Stored as ``self.save_episodes``.
        save_episodes_folder: Created when non-empty. Note that it is used
            as given (not joined onto ``base_folder``) and is created
            regardless of ``save_episodes``.
        save_gifs: Stored as ``self.should_save_gifs``; the gifs folder is
            only created when this is true.
        gifs_folder_name: Sub-folder of ``base_folder`` for gifs.
        save_pi_predict_models: Stored as
            ``self.should_save_pi_predict_models``.
        save_v_predict_models: Stored as
            ``self.should_save_v_predict_models``.
        run_indefinitely: Stored as ``self.run_indefinitely``.
        max_nb_episodes: Stored as ``self.max_nb_episodes``.
        use_keras_gym_train_monitor: When true, the environment is wrapped
            in ``km.wrappers.TrainMonitor``.

    Raises:
        FileExistsError: If one of the folders to create already exists as
            something other than a directory.
    """
    self.base_folder = base_folder
    self.models_folder_name = models_folder_name
    self.save_episodes = save_episodes
    self.save_episodes_folder = save_episodes_folder
    self.should_save_gifs = save_gifs
    self.gifs_folder_name = gifs_folder_name
    self.should_save_pi_predict_models = save_pi_predict_models
    self.should_save_v_predict_models = save_v_predict_models
    self.run_indefinitely = run_indefinitely
    self.max_nb_episodes = max_nb_episodes
    self.use_keras_gym_train_monitor = use_keras_gym_train_monitor

    self.models_folder = os.path.join(self.base_folder, self.models_folder_name)
    self.gifs_folder = os.path.join(self.base_folder, self.gifs_folder_name)

    # exist_ok=True replaces the previous "check exists, then create"
    # sequence, which could fail if the folder appeared between the two calls.
    if save_episodes_folder:
        os.makedirs(save_episodes_folder, exist_ok=True)
    if models_folder_name:
        os.makedirs(self.models_folder, exist_ok=True)
    if save_gifs and self.gifs_folder:
        os.makedirs(self.gifs_folder, exist_ok=True)

    # env with preprocessing: resize/grayscale each frame, then stack frames
    self.env = gym.make('Riverraid-v0')
    self.env = km.wrappers.ImagePreprocessor(self.env, height=RL_PREPROCESS_HEIGHT,
                                             width=RL_PREPROCESS_WIDTH,
                                             grayscale=RL_PREPROCESS_GRAYSCALE)
    self.env = km.wrappers.FrameStacker(self.env, num_frames=RL_PREPROCESS_NUM_FRAMES)
    if use_keras_gym_train_monitor:
        self.env = km.wrappers.TrainMonitor(self.env)
        # show logs from TrainMonitor
        # NOTE(review): assumed to belong under the monitor flag, since it
        # only serves to surface TrainMonitor output - confirm.
        km.enable_logging()

    # function approximators
    self.func = km.predefined.AtariFunctionApproximator(self.env)
    self.pi = km.SoftmaxPolicy(self.func, update_strategy=RL_PI_UPDATE_STRATEGY)  # PPO
    self.v = km.V(self.func, gamma=RLTrainer.GAMMA,
                  bootstrap_with_target_model=RLTrainer.BOOTSTRAP_WITH_TARGET_MODEL,
                  bootstrap_n=RLTrainer.BOOTSTRAP_N)
    self.actor_critic = km.ActorCritic(self.pi, self.v)

    # we'll use this to temporarily store our experience
    self.buffer = km.caching.ExperienceReplayBuffer.from_value_function(
        value_function=self.v, capacity=RLTrainer.BUFFER_CAPACITY,
        batch_size=RLTrainer.BUFFER_BATCH_SIZE)

    # Weights are loaded last because the models must exist first.
    if load_pi_predict_model:
        self.load_pi_predict_model_weights()
    if load_v_predict_model:
        self.load_v_predict_model_weights()
def test_atari_ppo():
    """Train a PPO actor-critic on preprocessed Pong and check it converges.

    Episodes are played until one finishes with ``env.G > 0`` or the step
    budget on ``env.T`` is used up; the test fails in the latter case.
    ``env.T``, ``env.G`` and ``env.ep`` come from the TrainMonitor wrapper
    (presumably total steps, episode return and episode index - see the
    keras-gym docs).
    """
    step_budget = 500000
    epochs_per_round = 4

    # env with preprocessing
    env = km.wrappers.TrainMonitor(
        km.wrappers.FrameStacker(
            km.wrappers.ImagePreprocessor(
                gym.make('PongDeterministic-v4'),
                height=105, width=80, grayscale=True),
            num_frames=3))

    # show logs from TrainMonitor
    km.enable_logging()

    # policy and value function share one function approximator
    approximator = Func(env, lr=0.00025)
    pi = km.SoftmaxPolicy(function_approximator=approximator, update_strategy='ppo')
    v = km.V(
        function_approximator=approximator,
        gamma=0.99,
        bootstrap_n=10,
        bootstrap_with_target_model=True)
    actor_critic = km.ActorCritic(pi, v)

    # we'll use this to temporarily store our experience
    buffer = km.caching.ExperienceReplayBuffer.from_value_function(
        value_function=v, capacity=256, batch_size=64)

    # run episodes
    while env.T < step_budget:
        state = env.reset()

        for _step in range(env.spec.max_episode_steps):
            # target_model == pi_old
            action = pi(state, use_target_model=True)
            next_state, reward, done, _info = env.step(action)

            buffer.add(state, action, reward, done, env.ep)

            if len(buffer) >= buffer.capacity:
                # a full buffer triggers one training round of several epochs
                num_batches = int(epochs_per_round * buffer.capacity / buffer.batch_size)
                for _batch in range(num_batches):
                    actor_critic.batch_update(*buffer.sample())
                buffer.clear()

                # soft update (tau=1 would be a hard update)
                actor_critic.sync_target_model(tau=0.1)

            if done:
                break

            state = next_state

        # stop as soon as an episode ends with a positive env.G
        if env.G > 0:
            break

    assert env.T < step_budget, "test_atari_ppo didn't converge"
import gym
import keras_gym as km
from tensorflow.keras.layers import Conv2D, Lambda, Dense, Flatten
from tensorflow.keras import backend as K


# env with preprocessing
env = gym.make('PongDeterministic-v4')
env = km.wrappers.ImagePreprocessor(env, height=105, width=80, grayscale=True)
env = km.wrappers.FrameStacker(env, num_frames=3)
env = km.wrappers.TrainMonitor(env, tensorboard_dir='data/sac/tensorboard')

# show logs from TrainMonitor
km.enable_logging()


class Func(km.FunctionApproximator):
    """Function approximator whose shared body is a small convolutional net."""

    def body(self, S):
        """Map the state input ``S`` to a 256-unit feature layer.

        The input is rescaled and multiplied by keras-gym's diff-transform
        matrix, then passed through two convolutions, a flatten and a dense
        layer. The result of the last layer is returned.
        """

        def diff_transform(frames):
            # Scale to [0, 1] assuming 8-bit pixel values, then mix the
            # stacked frames. num_frames=3 mirrors the FrameStacker setting
            # above - presumably the two must agree.
            scaled = K.cast(frames, 'float32') / 255
            return K.dot(scaled, km.utils.diff_transform_matrix(num_frames=3))

        layer_stack = (
            Lambda(diff_transform),
            Conv2D(filters=16, kernel_size=8, strides=4, activation='relu'),
            Conv2D(filters=32, kernel_size=4, strides=2, activation='relu'),
            Flatten(),
            Dense(units=256, activation='relu'),
        )

        # Feed the input through the layers in the order listed.
        X = S
        for layer in layer_stack:
            X = layer(X)
        return X


# function approximators