def __init__(self, base_folder: str, models_folder_name: str, load_pi_predict_model: bool,
             load_v_predict_model: bool, save_episodes: bool, save_episodes_folder: str,
             save_gifs: bool, gifs_folder_name: str, save_pi_predict_models: bool,
             save_v_predict_models: bool, run_indefinitely: bool, max_nb_episodes: int,
             use_keras_gym_train_monitor: bool):
    """It makes little sense to set both save_episodes and save_gifs to True:
    saved episodes can be watched with WatchReplay.py in better quality, and
    the files are smaller than GIFs.
    """
    self.base_folder = base_folder
    self.models_folder_name = models_folder_name
    self.save_episodes = save_episodes
    self.save_episodes_folder = save_episodes_folder
    self.should_save_gifs = save_gifs
    self.gifs_folder_name = gifs_folder_name
    self.should_save_pi_predict_models = save_pi_predict_models
    self.should_save_v_predict_models = save_v_predict_models
    self.run_indefinitely = run_indefinitely
    self.max_nb_episodes = max_nb_episodes
    self.use_keras_gym_train_monitor = use_keras_gym_train_monitor

    self.models_folder = os.path.join(self.base_folder, self.models_folder_name)
    self.gifs_folder = os.path.join(self.base_folder, self.gifs_folder_name)

    # create output folders on demand
    if save_episodes_folder and not os.path.exists(save_episodes_folder):
        os.makedirs(save_episodes_folder)
    if models_folder_name and not os.path.exists(self.models_folder):
        os.makedirs(self.models_folder)
    if save_gifs and self.gifs_folder and not os.path.exists(self.gifs_folder):
        os.makedirs(self.gifs_folder)

    # environment with the standard Atari preprocessing pipeline
    self.env = gym.make('Riverraid-v0')
    self.env = km.wrappers.ImagePreprocessor(
        self.env, height=RL_PREPROCESS_HEIGHT, width=RL_PREPROCESS_WIDTH,
        grayscale=RL_PREPROCESS_GRAYSCALE)
    self.env = km.wrappers.FrameStacker(self.env, num_frames=RL_PREPROCESS_NUM_FRAMES)
    if use_keras_gym_train_monitor:
        self.env = km.wrappers.TrainMonitor(self.env)

    # show logs from TrainMonitor
    km.enable_logging()

    # function approximators
    self.func = km.predefined.AtariFunctionApproximator(self.env)
    self.pi = km.SoftmaxPolicy(self.func, update_strategy=RL_PI_UPDATE_STRATEGY)  # PPO
    self.v = km.V(self.func, gamma=RLTrainer.GAMMA,
                  bootstrap_with_target_model=RLTrainer.BOOTSTRAP_WITH_TARGET_MODEL,
                  bootstrap_n=RLTrainer.BOOTSTRAP_N)
    self.actor_critic = km.ActorCritic(self.pi, self.v)

    # we'll use this to temporarily store our experience
    self.buffer = km.caching.ExperienceReplayBuffer.from_value_function(
        value_function=self.v, capacity=RLTrainer.BUFFER_CAPACITY,
        batch_size=RLTrainer.BUFFER_BATCH_SIZE)

    if load_pi_predict_model:
        self.load_pi_predict_model_weights()
    if load_v_predict_model:
        self.load_v_predict_model_weights()
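# Usage sketch for the constructor above (assuming the enclosing class is
# RLTrainer, as the class constants it references suggest). The folder names
# and episode cap below are illustrative placeholders, not project defaults;
# per the docstring, save_episodes and save_gifs should normally not both be True.
trainer = RLTrainer(
    base_folder='output',
    models_folder_name='models',
    load_pi_predict_model=False,
    load_v_predict_model=False,
    save_episodes=True,
    save_episodes_folder='output/episodes',
    save_gifs=False,
    gifs_folder_name='gifs',
    save_pi_predict_models=True,
    save_v_predict_models=True,
    run_indefinitely=False,
    max_nb_episodes=1000,
    use_keras_gym_train_monitor=True,
)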
def test_atari_ppo():
    # env with preprocessing
    env = gym.make('PongDeterministic-v4')
    env = km.wrappers.ImagePreprocessor(env, height=105, width=80, grayscale=True)
    env = km.wrappers.FrameStacker(env, num_frames=3)
    env = km.wrappers.TrainMonitor(env)

    # show logs from TrainMonitor
    km.enable_logging()

    func = Func(env, lr=0.00025)
    pi = km.SoftmaxPolicy(function_approximator=func, update_strategy='ppo')
    v = km.V(function_approximator=func, gamma=0.99, bootstrap_n=10,
             bootstrap_with_target_model=True)
    actor_critic = km.ActorCritic(pi, v)

    # we'll use this to temporarily store our experience
    buffer = km.caching.ExperienceReplayBuffer.from_value_function(
        value_function=v, capacity=256, batch_size=64)

    # run episodes
    while env.T < 500000:
        s = env.reset()

        for t in range(env.spec.max_episode_steps):
            a = pi(s, use_target_model=True)  # target_model == pi_old
            s_next, r, done, info = env.step(a)

            buffer.add(s, a, r, done, env.ep)

            if len(buffer) >= buffer.capacity:
                # use 4 epochs per round
                num_batches = int(4 * buffer.capacity / buffer.batch_size)
                for _ in range(num_batches):
                    actor_critic.batch_update(*buffer.sample())
                buffer.clear()

                # soft update (tau=1 would be a hard update)
                actor_critic.sync_target_model(tau=0.1)

            if done:
                break

            s = s_next

        if env.G > 0:
            break

    assert env.T < 500000, "test_atari_ppo didn't converge"
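# Sanity check on the update schedule above, using the constants from the test:
# with capacity=256 and batch_size=64, each round draws int(4 * 256 / 64) == 16
# mini-batches, i.e. roughly four passes over the buffered experience before
# buffer.clear() and the soft target sync.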
def __init__(self, predict_model_weights_path: str):
    env = gym.make("Riverraid-v0")  # Dummy so that we can make pi below.
    env = keras_gym.wrappers.ImagePreprocessor(
        env, height=RL_PREPROCESS_HEIGHT, width=RL_PREPROCESS_WIDTH,
        grayscale=RL_PREPROCESS_GRAYSCALE)
    # The actual preprocessing will be done using the preprocess parameter for
    # super().__init__. This way we can take 'normal' screens as input.
    env = keras_gym.wrappers.FrameStacker(
        env, num_frames=RL_PREPROCESS_NUM_FRAMES)
    env = keras_gym.wrappers.TrainMonitor(env)

    func = keras_gym.predefined.AtariFunctionApproximator(env)
    self.pi = keras_gym.SoftmaxPolicy(
        func, update_strategy=RL_PI_UPDATE_STRATEGY)
    self.pi.predict_model.load_weights(predict_model_weights_path)

    super().__init__(lambda screen: self.pi(screen), False, True,
                     RL_PREPROCESS_NUM_FRAMES)
        self.model.save('REINFORCE_model.h5')

    def load_model(self, path):
        '''loads a trained model from path'''
        return load_model(path)


if __name__ == "__main__":
    # leftover scratch code:
    # agent = REINFORCE(env)
    # agent.train(100)
    # import matplotlib.pyplot as plt
    # import math

    env = KSPPilot()
    function_approximator = MLP(env, lr=0.1)
    pi = km.SoftmaxPolicy(function_approximator, update_strategy='vanilla')
    v = km.V(function_approximator, gamma=0.9, bootstrap_n=1)

    # combine them into a single actor-critic
    actor_critic = km.ActorCritic(pi, v)

    for ep in range(100):
        s = env.reset()

        for t in range(10000):
            a = pi(s, use_target_model=True)
            s_next, r, done, info = env.step(a)

            # small incentive to keep moving
            if np.array_equal(s_next, s):
                r = -0.1

            actor_critic.update(s, a, r, done)
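# Note on the loop above: with update_strategy='vanilla' and bootstrap_n=1 this
# is a simple one-step actor-critic. Each actor_critic.update(s, a, r, done)
# call presumably lets v bootstrap a one-step TD target and feeds the resulting
# signal into the vanilla policy-gradient update of pi. The -0.1 reward is
# shaping: transitions where the observation does not change are penalised,
# nudging the policy away from idling.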
env = km.wrappers.TrainMonitor(env)

# show logs from TrainMonitor
km.enable_logging()


class LinearFunc(km.FunctionApproximator):
    """ linear function approximator (body only does one-hot encoding) """
    def body(self, S):
        one_hot_encoding = keras.layers.Lambda(lambda x: K.one_hot(x, 16))
        return one_hot_encoding(S)


# define function approximators
func = LinearFunc(env, lr=0.01)
pi = km.SoftmaxPolicy(func, update_strategy='vanilla')
cache = km.caching.MonteCarloCache(env, gamma=0.99)

# static parameters
num_episodes = 250
num_steps = 30

# train
for ep in range(num_episodes):
    s = env.reset()
    cache.reset()

    for t in range(num_steps):
        a = pi(s)
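# Note on LinearFunc above: K.one_hot(x, 16) assumes the (wrapped) environment
# emits integer state indices in [0, 16), e.g. a 4x4 gridworld such as
# FrozenLake. The "linear" model is then just keras-gym's default dense head on
# top of the one-hot encoding, effectively one weight per (state, action) pair.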
        def diff_transform(S):
            S = K.cast(S, 'float32') / 255
            M = km.utils.diff_transform_matrix(num_frames=3)
            return K.dot(S, M)

        X = Lambda(diff_transform)(S)
        X = Conv2D(filters=16, kernel_size=8, strides=4, activation='relu')(X)
        X = Conv2D(filters=32, kernel_size=4, strides=2, activation='relu')(X)
        X = Flatten()(X)
        X = Dense(units=256, activation='relu')(X)
        return X


# function approximators
func = Func(env, lr=0.00025)
pi = km.SoftmaxPolicy(func, update_strategy='ppo')
v = km.V(func, gamma=0.99, bootstrap_n=10, bootstrap_with_target_model=True)
actor_critic = km.ActorCritic(pi, v)

# we'll use this to temporarily store our experience
buffer = km.caching.ExperienceReplayBuffer.from_value_function(
    value_function=v, capacity=256, batch_size=64)

# run episodes
while env.T < 3000000:
    s = env.reset()

    for t in range(env.spec.max_episode_steps):
        a = pi(s, use_target_model=True)  # target_model == pi_old
        s_next, r, done, info = env.step(a)
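# Why the diff transform above (presumably inside Func's body method): the
# frame stacker hands the model a (height, width, num_frames) tensor of
# consecutive grayscale frames, and contracting the last axis with
# km.utils.diff_transform_matrix turns the stack into scaled finite
# differences, so the conv layers see motion rather than three nearly
# identical images. A rough numpy illustration of the idea only (plain
# first-order differences, not the exact matrix keras-gym builds):
import numpy as np

frames = np.random.randint(0, 256, size=(105, 80, 3)).astype('float32') / 255
motion = np.diff(frames, axis=-1)  # shape (105, 80, 2): frame-to-frame changes
print(motion.shape)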
import keras_gym as km
import numpy as np

env = km.envs.ConnectFourEnv()
env = km.wrappers.TrainMonitor(env)

# show logs from TrainMonitor
km.enable_logging()

# function approximators
func = km.predefined.ConnectFourFunctionApproximator(env, lr=0.001)
pi = km.SoftmaxPolicy(func, update_strategy='cross_entropy')
v = km.V(func, gamma=0.99, bootstrap_n=10, bootstrap_with_target_model=True)
ac = km.ActorCritic(pi, v)
cache = km.caching.MonteCarloCache(env, gamma=1)

# state_id = '20400000000000000099'
# state_id = '2020000d2c2a86ce6400'
# state_id = '10600000000000005609'  # attack
# state_id = '20600000000000004d7e'  # defend
# state_id = '106000000001a021e87f'
# n = km.planning.MCTSNode(ac, state_id=state_id, random_seed=7)
n = km.planning.MCTSNode(ac, random_seed=17, c_puct=3.5)
n.env.render()

for ep in range(1000):
    n.reset()

    for t in range(env.max_time_steps):
        n.search(n=14)
        n.show(2)
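# Notes on the MCTS setup above: c_puct is the PUCT exploration constant
# (larger values weight the policy prior and visit counts more heavily against
# the value estimates during tree search), and the commented-out state_id lines
# re-root the search tree at specific ConnectFour positions, apparently for
# debugging individual "attack" and "defend" moves.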
        return X


# environment [https://github.com/axb2035/gym-chase]
env = gym.make('Chase-v0')
env = ChasePreprocessor(env)
env = km.wrappers.FrameStacker(env, num_frames=3)
env = km.wrappers.TrainMonitor(env)

# show logs from TrainMonitor
km.enable_logging()

# function approximators
cnn = CNN(env, lr=0.00025)
pi = km.SoftmaxPolicy(cnn, update_strategy='ppo')
v = km.V(cnn, gamma=0.99, bootstrap_n=10, bootstrap_with_target_model=True)
actor_critic = km.ActorCritic(pi, v)

buffer = km.caching.ExperienceReplayBuffer.from_value_function(
    value_function=v, capacity=256, batch_size=16)

for ep in range(10000000):
    s = env.reset()

    for t in range(1000):
        a = actor_critic.policy(s, use_target_model=True)
        s_next, r, done, info = env.step(a)

        buffer.add(s, a, r, done, ep)

        if len(buffer) >= buffer.capacity: