def train_eval(log_dir="DDPG", prev_log="", google_colab=False, seed=123, gpu_id=0, env_name="HalfCheetah-v2", num_frames=10000, tau=1e-2, memory_size=5000, hot_start=100, batch_size=200, interval_MAR=10, gamma=0.99, L2_reg=0.5, random_process="ou", mu=0.3, sigma=0.2, num_eval_episodes=1, eval_interval=1000): tf.compat.v1.set_random_seed(seed) np.random.seed(seed=seed) # prep for training log_dir = set_up_for_training(env_name=env_name, seed=seed, gpu_id=gpu_id, log_dir=log_dir, prev_log=prev_log, google_colab=google_colab) env = gym.make(env_name) env = Monitor(env=env, directory=log_dir["video_path"], force=True) replay_buffer = ReplayBuffer(memory_size, traj_dir=log_dir["traj_path"]) reward_buffer = deque(maxlen=interval_MAR) summary_writer = tf.compat.v2.summary.create_file_writer( log_dir["summary_path"]) if random_process == "ou": random_process = OrnsteinUhlenbeckProcess( size=env.action_space.shape[0], theta=0.15, mu=mu, sigma=sigma) elif random_process == "gaussian": random_process = GaussianNoise(mu=mu, sigma=sigma) else: random_process = False assert False, "choose the random process from either gaussian or ou" agent = DDPG(actor=Actor, critic=Critic, num_action=env.action_space.shape[0], random_process=random_process, gamma=gamma, L2_reg=L2_reg, actor_model_dir=log_dir["model_path"] + "/actor", critic_model_dir=log_dir["model_path"] + "/critic") train(agent, env, replay_buffer, reward_buffer, summary_writer, num_eval_episodes, num_frames, tau, eval_interval, hot_start, batch_size, interval_MAR, log_dir, google_colab)
def train_eval(log_dir_name, random_seed, env_name="CartPole", eps_start=1.0,
               eps_end=0.02, decay_steps=3000,
               optimizer=tf.keras.optimizers.RMSprop, learning_rate=0.00025,
               decay=0.95, momentum=0.0, epsilon=0.00001, centered=True,
               loss_fn=tf.compat.v1.losses.huber_loss, grad_clip_flg=None,
               num_frames=10000, train_freq=1, memory_size=5000,
               hot_start=100, sync_freq=1000, batch_size=32, interval_MAR=10,
               gamma=0.99, num_eval_episodes=1, eval_interval=1000):
    # init global time-step
    global_timestep = tf.compat.v1.train.create_global_step()

    # instantiate the annealing func for epsilon
    anneal_ep = tf.compat.v1.train.polynomial_decay(
        eps_start, global_timestep, decay_steps, eps_end)

    # prep for training
    log_dir = set_up_for_training(log_dir_name=log_dir_name,
                                  env_name=env_name, seed=random_seed)
    env = prep_env(env_name=env_name, video_path=log_dir["video_path"])
    replay_buffer = ReplayBuffer(memory_size, traj_dir=log_dir["traj_path"])
    reward_buffer = deque(maxlen=interval_MAR)
    summary_writer = tf.compat.v2.summary.create_file_writer(
        log_dir["summary_path"])

    agent = Double_DQN(
        model=prep_model(env_name),
        policy=EpsilonGreedyPolicy_eager(dim_action=env.action_space.n,
                                         epsilon_fn=anneal_ep),
        optimizer=optimizer(learning_rate, decay, momentum, epsilon, centered),
        loss_fn=loss_fn,
        grad_clip_fn=gradient_clip_fn(flag=grad_clip_flg),
        num_action=env.action_space.n,
        model_dir=log_dir["model_path"],
        gamma=gamma,
        obs_prc_fn=prep_obs_processor(env_name))
    train(global_timestep, agent, env, replay_buffer, reward_buffer,
          summary_writer, num_eval_episodes, num_frames, eval_interval,
          hot_start, train_freq, batch_size, sync_freq, interval_MAR)
def train_eval(log_dir="PytorchDQN", prev_log="", seed=123, gpu_id=0, env_name="Pong", eps_start=1.0, eps_end=0.01, learning_rate=1e-4, decay_rate=0.1, num_frames=1000000, train_freq=4, memory_size=10000, hot_start=10000, sync_freq=1000, batch_size=32, interval_MAR=100, gamma=0.99, num_eval_episodes=1, eval_interval=250000, cuda=True): # init global time-step global_timestep = 0 # instantiate annealing funcs for ep anneal_ep = linear_schedule(int(num_frames * decay_rate), eps_end, eps_start) # prep for training log_dir = set_up_for_training(env_name=env_name, seed=seed, gpu_id=gpu_id, log_dir=log_dir, prev_log=prev_log) env = prep_env(env_name=env_name, video_path=log_dir["video_path"]) replay_buffer = ReplayBuffer(memory_size, traj_dir=log_dir["traj_path"]) reward_buffer = deque(maxlen=interval_MAR) summary_writer = SummaryWriter(log_dir=log_dir["summary_path"]) agent = dqn_agent(num_action=env.action_space.n, policy=EpsilonGreedyPolicy_torch( num_action=env.action_space.n, epsilon_fn=anneal_ep), summary_writer=summary_writer, learning_rate=learning_rate, gamma=gamma, model_path=log_dir["model_path"], cuda=cuda) train(global_timestep, agent, env, replay_buffer, reward_buffer, summary_writer, num_eval_episodes, num_frames, eval_interval, hot_start, train_freq, batch_size, sync_freq, interval_MAR)
mu = str(params.mu).split(".") mu = str(mu[0] + mu[1]) params.log_dir = "../../logs/logs/DDPG_batchnorm-{}-seed{}/{}-mu{}".format( params.train_flg, params.seed, str(params.env_name.split("-")[0]), mu) params.actor_model_dir = "../../logs/models/DDPG_batchnorm-{}-seed{}/{}/actor-mu{}/".format( params.train_flg, params.seed, str(params.env_name.split("-")[0]), mu) params.critic_model_dir = "../../logs/models/DDPG_batchnorm-{}-seed{}/{}/critic-mu{}/".format( params.train_flg, params.seed, str(params.env_name.split("-")[0]), mu) params.video_dir = "../../logs/video/DDPG_batchnorm-{}-seed{}/{}-mu{}/".format( params.train_flg, params.seed, str(params.env_name.split("-")[0]), mu) params.plot_path = "../../logs/plots/DDPG_batchnorm-{}-seed{}/{}-mu{}/".format( params.train_flg, params.seed, str(params.env_name.split("-")[0]), mu) env = gym.make(params.env_name) env = Monitor(env, params.video_dir) # set seed env.seed(params.seed) tf.random.set_random_seed(params.seed) replay_buffer = ReplayBuffer(params.memory_size) reward_buffer = deque(maxlen=params.reward_buffer_ep) summary_writer = tf.contrib.summary.create_file_writer(params.log_dir) random_process = OrnsteinUhlenbeckProcess(size=env.action_space.shape[0], theta=0.15, mu=params.mu, sigma=params.sigma) # random_process = GaussianNoise(mu=params.mu, sigma=params.sigma) agent = DDPG(Actor, Critic, env.action_space.shape[0], random_process, params) train_DDPG_original(agent, env, replay_buffer, reward_buffer, summary_writer)
if params.debug_flg:
    params.log_dir = "../logs/logs/" + now.strftime("%Y%m%d-%H%M%S") + "-DDPG/"
    params.model_dir = "../logs/models/" + now.strftime("%Y%m%d-%H%M%S") + "-DDPG/"
else:
    params.log_dir = "../logs/logs/{}".format(params.env_name)
    params.model_dir = "../logs/models/{}".format(params.env_name)

env = gym.make(params.env_name)

# set seed
env.seed(params.seed)
tf.compat.v1.random.set_random_seed(params.seed)

agent = DDPG(Actor, Critic, env.action_space.shape[0], params)
replay_buffer = ReplayBuffer(params.memory_size)
reward_buffer = deque(maxlen=params.reward_buffer_ep)
summary_writer = tf.contrib.summary.create_file_writer(params.log_dir)

init_state = env.reset()
# run one forward pass so the eager model builds its weight matrices
# (weights are created lazily on the first call)
agent.predict(init_state)

gp_model, update = create_bayes_net()
optimiser = tf.compat.v1.train.AdamOptimizer()
num_sample = 100  # number of samples to draw

get_ready(agent.params)

global_timestep = tf.compat.v1.train.get_or_create_global_step()
time_buffer = deque(maxlen=agent.params.reward_buffer_ep)
log = logger(agent.params)
from tf_rl.common.memory import ReplayBuffer
from tf_rl.common.wrappers import wrap_deepmind, make_atari

size = 100000

env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
memory = ReplayBuffer(size=size, traj_dir="./traj/")

# collect one transition and reuse it to fill the buffer
state = env.reset()
action = env.action_space.sample()
next_state, reward, done, info = env.step(action)
env.close()

for _ in range(size):
    memory.add(state, action, reward, next_state, done)

print(len(memory))

# persist the buffer to disk, then recover it into a fresh instance
memory.save()
del memory
memory = ReplayBuffer(size=size, recover_data=True, traj_dir="./traj/")
print(len(memory))
from tf_rl.common.memory import ReplayBuffer
from tf_rl.common.wrappers import wrap_deepmind, make_atari

size = 1000

env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
# n_step=5 with flg_seq=True: the buffer is set up for 5-step sequence transitions
memory = ReplayBuffer(size, n_step=5, flg_seq=True)
print("Memory contains {0} timesteps".format(len(memory)))

# collect one transition and reuse it to fill the buffer
state = env.reset()
action = env.action_space.sample()
next_state, reward, done, info = env.step(action)
env.close()

for _ in range(size):
    memory.add(state, action, reward, next_state, done)

print(len(memory))
memory.save()
print("Memory contains {0} timesteps".format(len(memory)))

# sample once to inspect shapes, then stress-test repeated sampling
states, actions, rewards, next_states, dones = memory.sample(batch_size=10)
print(states.shape, state.shape)

for _ in range(size):
    memory.sample(batch_size=10)
import tensorflow as tf

from tf_rl.common.memory_tf import ReplayBuffer as ReplayBuffer_tf
from tf_rl.common.memory import ReplayBuffer
from tf_rl.common.wrappers import wrap_deepmind, make_atari

env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
state = env.reset()

memory_tf = ReplayBuffer_tf(capacity=1000,
                            n_step=0,
                            act_shape=(),
                            obs_shape=state.shape,
                            obs_dtype=tf.int8,
                            checkpoint_dir="./tmp")
memory = ReplayBuffer(size=1000)

done = False
for t in range(100):
    action = env.action_space.sample()
    next_state, reward, done, info = env.step(action)
    memory.add(state, action, reward, next_state, done)
    memory_tf.add(state, action, reward, next_state, done)
    state = next_state
env.close()

print("=== test ===")

"""
Note: I have conducted a performance test in which we repeatedly sample from
the Replay Buffer over 1,000 iterations and measured the execution time to
compare Eager against Eager with tf.function.

Result: without tf.function: 9.03s
"""
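# A minimal sketch of the timing comparison described in the note above.
# Assumptions: memory_tf.sample accepts batch_size like the plain
# ReplayBuffer does, and the helper name time_sampling is hypothetical,
# not part of tf_rl.
import time

def time_sampling(sample_fn, n_iters=1000):
    start = time.time()
    for _ in range(n_iters):
        sample_fn()
    return time.time() - start

eager_sec = time_sampling(lambda: memory_tf.sample(batch_size=32))
compiled_sample = tf.function(lambda: memory_tf.sample(batch_size=32))
graph_sec = time_sampling(compiled_sample)
print("eager: {:.2f}s / tf.function: {:.2f}s".format(eager_sec, graph_sec))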
        with tf.GradientTape() as tape:
            preds = self.network(states)
            loss = tf.losses.mean_squared_error(expert_action, preds)

        # get gradients
        grads = tape.gradient(loss, self.network.trainable_variables)

        # apply processed gradients to the network
        self.optimizer.apply_gradients(
            zip(grads, self.network.trainable_variables))
        return tf.math.reduce_mean(loss)


if __name__ == '__main__':
    env = gym.make("CartPole-v0")
    buffer = ReplayBuffer(size=1000)
    agent = Agent()
    expert = dqn_agent(model=cartpole_net,
                       policy=EpsilonGreedyPolicy_eager(
                           num_action=env.action_space.n,
                           epsilon_fn=lambda: tf.constant(0.02)),
                       optimizer=tf.compat.v1.train.AdamOptimizer(),
                       loss_fn=tf.compat.v1.losses.huber_loss,
                       grad_clip_fn=lambda x: x,
                       num_action=env.action_space.n,
                       model_dir="./expert",
                       gamma=0.99,
                       obs_prc_fn=lambda x: x)

    reward_total = list()

    @tf.function
from tf_rl.common.memory import ReplayBuffer
from tf_rl.common.wrappers import wrap_deepmind, make_atari

env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
memory = ReplayBuffer(1000, n_step=5, flg_seq=True)
print("Memory contains {0} timesteps".format(len(memory)))

for i in range(1):
    state = env.reset()
    for t in range(1000):
        # env.render()
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        memory.add(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            print("Memory contains {0} timesteps".format(len(memory)))
            break
env.close()

print("Memory contains {0} timesteps".format(len(memory)))
state, action, reward, next_state, done = memory.sample(batch_size=10)
print(state.shape, action.shape)
import gym

from tf_rl.common.memory import ReplayBuffer
from tf_rl.common.utils import Tracker

env = gym.make("CartPole-v0")
memory = ReplayBuffer(1000)
tracker = Tracker(save_freq=100)

for i in range(100):
    state = env.reset()
    for t in range(100):
        # env.render()
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)

        # memory format is: state, action, reward, next_state, done
        memory.add(state, action, reward, next_state, done)

        # tracker format is: state, q_value, action, reward, done, loss, gradient
        tracker.store('state', state)
        tracker.store('q_value', 0.2)
        tracker.store('action', action)
        tracker.store('reward', reward)
        tracker.store('done', done)
        tracker.store('loss', 0.3)

        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            break
        state = next_state
from tf_rl.common.wrappers import wrap_deepmind, make_atari
from tf_rl.common.memory import ReplayBuffer

# for env_name, goal_score in ENV_LIST_NIPS.items():
env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
memory_size = 1000
replay_buffer = ReplayBuffer(memory_size)

state = env.reset()
for t in range(memory_size):
    env.render()
    action = env.action_space.sample()
    next_state, reward, done, info = env.step(action)
    replay_buffer.add(state, action, reward, next_state, done)
    state = next_state
    # progress print; with memory_size=1000 this only fires at t=0
    if t % 10000 == 0:
        print(t)
env.close()

# replay_buffer.save(dir="./buffer.json")