def test():
    '''Evaluate a saved model on the DoublePendulum environment'''
    config_path = "config.yml"
    print_config(config_path)
    config = load_config(config_path)
    training_config = config["training_config"]
    config["model_config"]["load_model"] = True
    env_id = "DoublePendulum-v0"
    env = GentlyTerminating(gym.make(env_id))
    n_episodes = 10
    max_episode_step = 10000
    print("*********************************************")
    print("Testing the model for 10 episodes with 10000 maximum steps per episode")
    print("*********************************************")
    policy = Policy(env, config)
    losses = []
    all_rewards = []
    avg_rewards = []
    epsilons = []
    for i_episode in range(n_episodes):
        episode_reward = 0
        state = env.reset()
        state[4] /= 10
        epsilon = 0
        epsilons.append(epsilon)
        for step in range(max_episode_step):
            env.render()
            time.sleep(0.01)
            action = policy.act(state, epsilon)
            '''Map the discrete action index to a symmetric continuous action in [-6, 6]'''
            f_action = 6 * (action - (policy.n_actions - 1) / 2) / ((policy.n_actions - 1) / 2)
            next_state, reward, done, _ = env.step(f_action)
            reward = 10 * reward
            next_state[4] /= 10
            policy.replay_buffer.push(state, action[0], reward, next_state, done)
            state = next_state
            episode_reward += reward
            if done:
                break
        print(" episode: %s, episode reward: %s" % (i_episode, episode_reward))
        all_rewards.append(episode_reward)
        avg_rewards.append(np.mean(all_rewards[-3:]))
    env.close()
    plot_fig(n_episodes, all_rewards, avg_rewards, losses)
def train():
    '''Load the configuration settings'''
    config_path = "config.yml"
    print_config(config_path)
    config = load_config(config_path)
    training_config = config["training_config"]
    seed = training_config["random_seed"]
    n_episodes = training_config["n_episodes"]
    max_episode_step = training_config["max_episode_step"]
    n_update_target = training_config["n_update_target"]
    exp_number = training_config["exp_number"]
    save_model_path = training_config["save_model_path"]
    render_flag = training_config["render"]
    save_best = training_config["save_best"]

    '''Use a fixed epsilon or an exponential decay schedule?'''
    if training_config["use_fix_epsilon"]:
        epsilon_by_frame = lambda frame_idx: training_config["fix_epsilon"]
    else:
        epsilon_start = training_config["epsilon_start"]
        epsilon_final = training_config["epsilon_final"]
        epsilon_decay = training_config["epsilon_decay"]
        epsilon_by_frame = lambda frame_idx: epsilon_final + (
            epsilon_start - epsilon_final) * np.exp(-1. * frame_idx / epsilon_decay)

    torch.manual_seed(seed)
    np.random.seed(seed)

    '''Environment initialization'''
    env_id = "Qube-v0"
    env = GentlyTerminating(gym.make(env_id))

    '''Initialize the DQN algorithm object'''
    policy = Policy(env, config)
    losses = []
    all_rewards = []
    avg_rewards = []
    epsilons = []

    '''Train the Q-network for n_episodes episodes'''
    for i_episode in range(n_episodes):
        episode_reward = 0
        state = env.reset()
        state[4:6] /= 20
        epsilon = epsilon_by_frame(i_episode)
        epsilons.append(epsilon)
        for step in range(max_episode_step):
            if render_flag:
                env.render()
            '''Choose action'''
            action = policy.act(state, epsilon)
            f_action = 5 * (action - (policy.n_actions - 1) / 2) / ((policy.n_actions - 1) / 2)
            next_state, reward, done, _ = env.step(f_action)
            reward = 100 * reward
            next_state[4:6] /= 20
            policy.replay_buffer.push(state, action[0], reward, next_state, done)
            state = next_state
            episode_reward += reward
            if done:
                break
            if len(policy.replay_buffer) > policy.batch_size:
                loss = policy.train()
                losses.append(loss.item())
        all_rewards.append(episode_reward)
        avg_rewards.append(np.mean(all_rewards[-10:]))
        if i_episode % 50 == 0:
            '''Save the results figure every 50 episodes'''
            save_fig(i_episode, all_rewards, avg_rewards, losses, epsilons, exp_number)
        if i_episode % n_update_target == 0:
            '''Update the target network'''
            policy.update_target()
        policy.save_model(save_model_path)
        if save_best and i_episode > 100:
            ratio = 1.1
            if episode_reward > ratio * np.mean(all_rewards[-10:]):
                print("Save model with episode reward %s " % (episode_reward))
                print("Model path: %s " % (save_model_path))
                break
    env.close()
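# Minimal entry-point sketch (an assumption, not taken from the original file):
# it presumes train() and test() are defined in this module and that "config.yml"
# sits in the working directory. Switch to test() once a model has been trained.
if __name__ == "__main__":
    train()
    # test()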
    '''Tail of the KL-divergence helper passed to trpo_step: closed-form KL between
    the current diagonal-Gaussian policy and a frozen copy of its parameters'''
    kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (2.0 * std1.pow(2)) - 0.5
    return kl.sum(1, keepdim=True)

'''Apply one TRPO update to the policy network'''
trpo_step(policy_net, get_loss, get_kl, args.max_kl, args.damping)

'''Running mean/std normalization of observations and rewards, with clipping'''
running_state = ZFilter((num_inputs,), clip=5)
running_reward = ZFilter((1,), demean=False, clip=10)

for i_episode in range(1, 4001):
    memory = Memory()

    num_steps = 0
    reward_batch = 0
    num_episodes = 0
    '''Collect a batch of trajectories before each policy update'''
    while num_steps < args.batch_size:
        state = env.reset()
        state = running_state(state)

        reward_sum = 0
        for t in range(10000):  # Don't infinite loop while learning
            action = select_action(state)
            action = action.data[0].numpy()
            next_state, reward, done, _ = env.step(action)
            reward_sum += reward

            next_state = running_state(next_state)

            mask = 1
            if done:
                mask = 0