Example #1
File: main.py  Project: sherlockjian/TF2_RL
def train_eval(log_dir_name,
               random_seed,
               env_name="CartPole",
               eps_start=1.0,
               eps_end=0.02,
               decay_steps=3000,
               optimizer=tf.keras.optimizers.RMSprop,
               learning_rate=0.00025,
               decay=0.95,
               momentum=0.0,
               epsilon=0.00001,
               centered=True,
               loss_fn=tf.compat.v1.losses.huber_loss,
               grad_clip_flg=None,
               num_frames=10000,
               train_freq=1,
               memory_size=5000,
               hot_start=100,
               sync_freq=1000,
               batch_size=32,
               interval_MAR=10,
               gamma=0.99,
               num_eval_episodes=1,
               eval_interval=1000):
    # init global time-step
    global_timestep = tf.compat.v1.train.create_global_step()

    # instantiate annealing func for epsilon
    anneal_ep = tf.compat.v1.train.polynomial_decay(eps_start, global_timestep,
                                                    decay_steps, eps_end)

    # prep for training
    log_dir = set_up_for_training(log_dir_name=log_dir_name,
                                  env_name=env_name,
                                  seed=random_seed)
    env = prep_env(env_name=env_name, video_path=log_dir["video_path"])
    replay_buffer = ReplayBuffer(memory_size, traj_dir=log_dir["traj_path"])
    reward_buffer = deque(maxlen=interval_MAR)
    summary_writer = tf.compat.v2.summary.create_file_writer(
        log_dir["summary_path"])

    agent = Double_DQN(
        model=prep_model(env_name),
        policy=EpsilonGreedyPolicy_eager(dim_action=env.action_space.n,
                                         epsilon_fn=anneal_ep),
        optimizer=optimizer(learning_rate, decay, momentum, epsilon, centered),
        loss_fn=loss_fn,
        grad_clip_fn=gradient_clip_fn(flag=grad_clip_flg),
        num_action=env.action_space.n,
        model_dir=log_dir["model_path"],
        gamma=gamma,
        obs_prc_fn=prep_obs_processor(env_name))

    train(global_timestep, agent, env, replay_buffer, reward_buffer,
          summary_writer, num_eval_episodes, num_frames, eval_interval,
          hot_start, train_freq, batch_size, sync_freq, interval_MAR)
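
For context on what the Double_DQN agent in this example is expected to compute at each update, here is a minimal sketch of the double-DQN target: the online network selects the greedy next action and the target network evaluates it. The function and argument names (double_dqn_targets, online_q_next, target_q_next) are illustrative assumptions, not the repo's actual API.

import numpy as np

def double_dqn_targets(online_q_next, target_q_next, rewards, dones, gamma=0.99):
    """Illustrative double-DQN target computation (not the repo's API).

    online_q_next:  online-network Q-values for next states, shape (B, num_actions)
    target_q_next:  target-network Q-values for next states, shape (B, num_actions)
    rewards, dones: float arrays of shape (B,), dones is 0/1
    """
    # The online network picks the greedy next action ...
    next_actions = np.argmax(online_q_next, axis=1)
    # ... and the target network evaluates that action; decoupling selection
    # from evaluation is what reduces Q-value over-estimation.
    next_values = target_q_next[np.arange(len(next_actions)), next_actions]
    # One-step bootstrapped target; no bootstrap on terminal transitions.
    return rewards + gamma * (1.0 - dones) * next_values

The loss_fn passed above (huber_loss by default) would then typically be applied between these targets and the online network's Q(s, a) for the sampled actions.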
Example #2

params.alpha = 0.6
params.beta_start = 0.4
params.beta_end = 1.0
params.prioritized_replay_noise = 1e-6

# init global time-step
global_timestep = tf.train.get_or_create_global_step()

# instantiate annealing funcs for ep and lr
anneal_ep = tf.train.polynomial_decay(params.ep_start, global_timestep, params.decay_steps, params.ep_end)
anneal_lr = tf.train.polynomial_decay(params.lr_start, global_timestep, params.decay_steps, params.lr_end)
beta = tf.train.polynomial_decay(params.beta_start, global_timestep, params.decay_steps, params.beta_end)

# prep for training
policy = EpsilonGreedyPolicy_eager(Epsilon_fn=anneal_ep)
optimizer = tf.train.RMSPropOptimizer(anneal_lr, 0.99, 0.0, 1e-6)
replay_buffer = PrioritizedReplayBuffer(params.memory_size, alpha=params.alpha)
reward_buffer = deque(maxlen=params.reward_buffer_ep)
loss_fn = create_loss_func(params.loss_fn)
grad_clip_fn = gradient_clip_fn(flag=params.grad_clip_flg)

# create a directory for log/model
params = create_log_model_directory(params, get_alg_name())
summary_writer = tf.contrib.summary.create_file_writer(params.log_dir)

# choose env and instantiate the agent correspondingly
agent, env = invoke_agent_env(params, get_alg_name())
agent = eval(agent)(Model, optimizer, loss_fn, grad_clip_fn, env.action_space.n, params)

train_DQN_PER(agent, env, policy, replay_buffer, reward_buffer, beta, summary_writer)
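
The second example anneals beta from params.beta_start to params.beta_end while sampling from a PrioritizedReplayBuffer with exponent params.alpha. The sketch below shows how those two parameters typically interact in prioritized experience replay: alpha shapes the sampling distribution over priorities, and beta controls the strength of the importance-sampling correction applied to the loss. Function and variable names here are illustrative, not taken from the repo.

import numpy as np

def per_probabilities(td_errors, alpha=0.6, eps=1e-6):
    # Priority of each transition: |TD error| plus a small constant so that
    # zero-error transitions still have a non-zero chance of being replayed.
    priorities = (np.abs(td_errors) + eps) ** alpha
    return priorities / priorities.sum()

def importance_weights(probs, sampled_idx, buffer_size, beta):
    # w_i = (N * P(i))^(-beta), normalized by the maximum weight for stability.
    weights = (buffer_size * probs[sampled_idx]) ** (-beta)
    return weights / weights.max()

With beta annealed toward 1.0 (params.beta_end), the importance-sampling correction becomes full strength by the end of training; params.prioritized_replay_noise presumably corresponds to the eps term above.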