import time

import gym
import numpy as np
import torch

# Project-specific helpers (Policy, GentlyTerminating, load_config, print_config,
# plot_fig, save_fig) are assumed to be importable from the surrounding package;
# their import lines are not shown in this snippet.


def test():
    config_path = "config.yml"
    print_config(config_path)
    config = load_config(config_path)
    training_config = config["training_config"]
    config["model_config"]["load_model"] = True

    env_id = "DoublePendulum-v0"
    env = GentlyTerminating(gym.make(env_id))

    n_episodes = 10
    max_episode_step = 10000
    print("*********************************************")
    print(
        "Testing the model for 10 episodes with 10000 maximum steps per episode"
    )
    print("*********************************************")

    policy = Policy(env, config)

    losses = []
    all_rewards = []
    avg_rewards = []
    epsilons = []
    for i_episode in range(n_episodes):
        episode_reward = 0
        state = env.reset()
        state[4] /= 10
        epsilon = 0
        epsilons.append(epsilon)
        for step in range(max_episode_step):
            env.render()
            time.sleep(0.01)
            action = policy.act(state, epsilon)

            f_action = 6 * (action - (policy.n_actions - 1) / 2) / (
                (policy.n_actions - 1) / 2)
            next_state, reward, done, _ = env.step(f_action)
            reward = 10 * reward
            next_state[4] /= 10

            policy.replay_buffer.push(state, action[0], reward, next_state,
                                      done)

            state = next_state
            episode_reward += reward

            if done:
                break
        print(" episode: %s, episode reward: %s" % (i_episode, episode_reward))
        all_rewards.append(episode_reward)
        avg_rewards.append(np.mean(all_rewards[-3:]))

    env.close()
    plot_fig(n_episodes, all_rewards, avg_rewards, losses)
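
# --- Sketch (not part of the original script) ------------------------------------
# Both test() above and train() below turn the discrete action index chosen by the
# Q-network into a continuous command before stepping the environment.  A
# standalone version of that mapping, assuming n_actions discrete bins and a
# maximum command magnitude (6 for DoublePendulum-v0 above, 5 for Qube-v0 below):
def discrete_to_continuous(action_idx, n_actions, max_magnitude):
    """Map an index in {0, ..., n_actions - 1} linearly onto [-max_magnitude, +max_magnitude]."""
    mid = (n_actions - 1) / 2
    return max_magnitude * (action_idx - mid) / mid
# Example: with n_actions = 9 and max_magnitude = 6, indices 0, 4 and 8 map to
# -6.0, 0.0 and +6.0 respectively.
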
def train():
    '''Load the configuration settings'''
    config_path = "config.yml"
    print_config(config_path)
    config = load_config(config_path)
    training_config = config["training_config"]
    seed = training_config["random_seed"]
    n_episodes = training_config["n_episodes"]
    max_episode_step = training_config["max_episode_step"]
    n_update_target = training_config["n_update_target"]
    exp_number = training_config["exp_number"]
    save_model_path = training_config["save_model_path"]
    render_flag = training_config["render"]
    save_best = training_config["save_best"]
    '''Use a fixed epsilon or an exponentially decaying one?'''
    if training_config["use_fix_epsilon"]:
        epsilon_by_frame = lambda frame_idx: training_config["fix_epsilon"]
    else:
        epsilon_start = training_config["epsilon_start"]
        epsilon_final = training_config["epsilon_final"]
        epsilon_decay = training_config["epsilon_decay"]
        epsilon_by_frame = lambda frame_idx: epsilon_final + (
            epsilon_start - epsilon_final) * np.exp(-1. * frame_idx /
                                                    epsilon_decay)
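    # Illustrative numbers only (the real values come from config.yml): with
    # epsilon_start = 1.0, epsilon_final = 0.02 and epsilon_decay = 500 this
    # schedule gives epsilon_by_frame(0) = 1.0, epsilon_by_frame(500) ~ 0.38 and
    # epsilon_by_frame(2000) ~ 0.04, decaying exponentially towards epsilon_final.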
    torch.manual_seed(seed)
    np.random.seed(seed)
    '''Environment initialization'''
    env_id = "Qube-v0"
    env = GentlyTerminating(gym.make(env_id))
    '''Initialize the DQN algorithm object'''
    policy = Policy(env, config)
    losses = []
    all_rewards = []
    avg_rewards = []
    epsilons = []
    '''Training the q-network with n episodes'''
    for i_episode in range(n_episodes):
        episode_reward = 0
        state = env.reset()
        state[4:6] /= 20  # scale down the angular-velocity entries of the observation
        epsilon = epsilon_by_frame(i_episode)
        epsilons.append(epsilon)
        for step in range(max_episode_step):
            if render_flag:
                env.render()
            '''Choose action'''
            action = policy.act(state, epsilon)
            f_action = 5 * (action - (policy.n_actions - 1) / 2) / (
                (policy.n_actions - 1) / 2)
            next_state, reward, done, _ = env.step(f_action)
            reward = 100 * reward
            next_state[4:6] /= 20
            policy.replay_buffer.push(state, action[0], reward, next_state,
                                      done)
            state = next_state
            episode_reward += reward

            if done:
                break

            if len(policy.replay_buffer) > policy.batch_size:
                loss = policy.train()
                losses.append(loss.item())

        all_rewards.append(episode_reward)
        avg_rewards.append(np.mean(all_rewards[-10:]))

        if i_episode % 50 == 0:
            '''Save the results figure every 50 episodes'''
            save_fig(i_episode, all_rewards, avg_rewards, losses, epsilons,
                     exp_number)

        if i_episode % n_update_target == 0:
            '''Update the target network'''
            policy.update_target()

        policy.save_model(save_model_path)
        if save_best and i_episode > 100:
            ratio = 1.1
            if episode_reward > ratio * np.mean(all_rewards[-10:]):
                print("Save model with episode reward %s " % (episode_reward))
                print("Model path: %s " % (save_model_path))
                break

    env.close()
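
# --- Sketch (not from the original repository) ------------------------------------
# Policy.train() and Policy.update_target() are not shown above.  For reference, a
# generic DQN update with a target network looks roughly like the code below; the
# actual Policy implementation may differ in loss, optimizer and other details.
import torch.nn.functional as F


def dqn_update(q_net, target_net, optimizer, batch, gamma=0.99):
    """One TD-learning step on a sampled replay batch (state, action, reward, next_state, done)."""
    state, action, reward, next_state, done = batch
    # Q(s, a) for the actions actually taken
    q_values = q_net(state).gather(1, action.long().unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        # bootstrap target from the (frozen) target network
        next_q = target_net(next_state).max(1)[0]
        target = reward + gamma * next_q * (1 - done.float())
    loss = F.mse_loss(q_values, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss


def hard_update_target(q_net, target_net):
    """Copy the online network's weights into the target network."""
    target_net.load_state_dict(q_net.state_dict())
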
Example #3
        # KL divergence between two diagonal Gaussians N(mean0, std0^2) and
        # N(mean1, std1^2), computed per dimension and summed over the action
        # dimensions.
        kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (2.0 * std1.pow(2)) - 0.5
        return kl.sum(1, keepdim=True)

    trpo_step(policy_net, get_loss, get_kl, args.max_kl, args.damping)
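    # trpo_step (defined in the surrounding TRPO implementation) uses the get_loss
    # and get_kl closures to take one trust-region update of policy_net: it
    # estimates a natural-gradient step direction and backtracks along it until the
    # surrogate loss improves while the mean KL stays below args.max_kl.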

# Online normalisation of observations (running mean/std, clipped to [-5, 5]) and
# running scaling of rewards (no mean subtraction, clipped to [-10, 10]).
running_state = ZFilter((num_inputs,), clip=5)
running_reward = ZFilter((1,), demean=False, clip=10)

for i_episode in range(1, 4001):
    memory = Memory()

    num_steps = 0
    reward_batch = 0
    num_episodes = 0
    while num_steps < args.batch_size:
        state = env.reset()
        state = running_state(state)

        reward_sum = 0
        for t in range(10000): # Don't infinite loop while learning
            action = select_action(state)
            action = action.data[0].numpy()
            next_state, reward, done, _ = env.step(action)
            reward_sum += reward

            next_state = running_state(next_state)

            # mask is 0 at terminal states so that returns are not bootstrapped
            # across episode boundaries
            mask = 1
            if done:
                mask = 0