Example #1
File: main.py Project: arame/Pacman_SAC_D
import numpy as np  # np.mean is used below; Hyper, Constants, make_env, Agent and plot_learning_curve are the project's own modules/helpers


def main():
    Hyper.init()
    env = make_env(
        Constants.env_id)  # See wrapper code for environment in atari_image.py
    Hyper.n_actions = env.action_space.n
    shape = env.observation_space.shape
    agent = Agent(input_dims=shape, env=env, n_actions=env.action_space.n)
    filename = f"{Constants.env_id}_games{Hyper.n_games}_alpha{Hyper.alpha}.png"
    figure_file = f'plots/{filename}'

    best_ave_score = env.reward_range[0]
    best_score = 0
    score_history = []
    load_checkpoint = False
    if load_checkpoint:
        agent.load_models()
        env.render(mode='human')
    total_steps = 0
    game_id = 0
    for i in range(Hyper.n_games):
        game_id += 1
        if game_id % 20 == 0:
            Hyper.alpha = Hyper.alpha * 1.2
            Hyper.beta = Hyper.beta * 1.2
        observation = env.reset()
        done = False
        steps = 0
        score = 0
        while not done:
            # Sample action from the policy
            action = agent.choose_action(observation)

            # Sample transition from the environment
            new_observation, reward, done, info = env.step(action)
            steps += 1
            total_steps += 1

            # Store transition in the replay buffer
            agent.remember(observation, action, reward, new_observation, done)
            if not load_checkpoint:
                agent.learn()
            score += reward
            observation = new_observation
        score_history.append(score)
        avg_score = np.mean(score_history[-100:])
        if score > best_score:
            best_score = score

        if avg_score > best_ave_score:
            best_ave_score = avg_score
            if not load_checkpoint:
                agent.save_models()

        episode = i + 1
        print(
            f"episode {episode}: score {score}, best_score {best_score}, best ave score {best_ave_score}, trailing 100 games avg {avg_score}, steps {steps}, total steps {total_steps}"
        )

    print(f"total number of steps taken: {total_steps}")
    if not load_checkpoint:
        x = [i + 1 for i in range(Hyper.n_games)]
        plot_learning_curve(x, score_history, figure_file)
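
The Agent class is not part of this listing; agent.remember and agent.learn imply an off-policy setup backed by a replay buffer. A minimal sketch of such a buffer, assuming discrete actions and image observations as above (class and method names are illustrative, not the project's actual code):

import numpy as np

class ReplayBuffer:
    # Circular buffer of transitions matching the agent.remember(...) call above.
    def __init__(self, max_size, input_shape):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((max_size, *input_shape), dtype=np.float32)
        self.new_state_memory = np.zeros((max_size, *input_shape), dtype=np.float32)
        self.action_memory = np.zeros(max_size, dtype=np.int64)
        self.reward_memory = np.zeros(max_size, dtype=np.float32)
        self.terminal_memory = np.zeros(max_size, dtype=np.bool_)

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size  # overwrite the oldest entry once full
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size)
        return (self.state_memory[batch], self.action_memory[batch],
                self.reward_memory[batch], self.new_state_memory[batch],
                self.terminal_memory[batch])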

Example #2

        # Fragment from inside an episode loop (roughly `for i in range(n_games):`,
        # after `observation = env.reset()`); the earlier setup is not shown.
        observation = one_hot_single_value(observation, n_states)
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, _ = env.step(action)
            observation_ = one_hot_single_value(observation_, n_states)
            score += reward
            agent.remember(observation, action, reward, observation_, done)
            if not load_checkpoint:
                pass
            observation = observation_
        agent.learn()
        score_history.append(score)
        avg_score = np.mean(score_history[-100:])

        if avg_score > best_score:
            best_score = avg_score
            if not load_checkpoint:
                agent.save_models()

        print('episode ', i, 'score %.1f' % score,
              'avg_score %.1f' % avg_score)

    if not load_checkpoint:
        x = [i + 1 for i in range(n_games)]
        plot_learning_curve(x, score_history, figure_file)

    with open('scores/score_history__.p', 'wb') as fp:
        pickle.dump(score_history, fp)
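
Example #2 calls two helpers that do not appear in the listing, one_hot_single_value and plot_learning_curve (the latter is also used by Example #1). Minimal sketches of what they might look like; the signatures follow the calls above, the bodies are assumptions:

import numpy as np
import matplotlib.pyplot as plt

def one_hot_single_value(state, n_states):
    # Encode a discrete state index as a one-hot vector of length n_states.
    vec = np.zeros(n_states, dtype=np.float32)
    vec[state] = 1.0
    return vec

def plot_learning_curve(x, scores, figure_file):
    # Plot the trailing 100-episode average score against episode number.
    running_avg = [np.mean(scores[max(0, i - 99):i + 1]) for i in range(len(scores))]
    plt.plot(x, running_avg)
    plt.title('Running average of previous 100 scores')
    plt.savefig(figure_file)
    plt.close()
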
Example #3
    # Inside an outer training loop (not shown): run 100 episodes and track their mean reward
    rewards = [model.train_on_env(env) for _ in range(100)]
    mean_rewards.append(np.mean(rewards))
    print("mean reward:%.3f" % (np.mean(rewards)))
    plt.figure(figsize=[9, 6])
    plt.title("Mean reward per 100 games")
    plt.plot(mean_rewards)
    plt.grid()
    # plt.show()
    plt.savefig('plots/SAC_learning_curve.png')
    plt.close()

    if np.mean(rewards) >= 1000:
        print("TRAINED!")
        break

model.save_models()
#model.load("experts/saved_expert/pg.model")

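# Roll out the trained policy num_expert times, collecting the visited states,
# action probabilities and actions as flat arrays of expert data.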
num_expert = 100
states = np.array([])
probs = np.array([])
actions = np.array([])
for i in range(num_expert):
    state, prob, action, _ = model.generate_session(env)
    states = np.concatenate((states, state.reshape(-1)))
    probs = np.concatenate((probs, prob))
    actions = np.concatenate((actions, action))
states = states.reshape(-1, 5)
np.save('expert_samples/sac_inverted_pendulum_states', states)
np.save('expert_samples/sac_inverted_pendulum_actions', actions)
np.save('expert_samples/sac_inverted_pendulum_probs', probs)
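
A hypothetical consumer of the saved expert data could read the arrays back as follows; only the file paths come from the code above (np.save appends the .npy extension), the rest is illustrative:

import numpy as np

expert_states = np.load('expert_samples/sac_inverted_pendulum_states.npy')    # shape (N, 5)
expert_actions = np.load('expert_samples/sac_inverted_pendulum_actions.npy')
expert_probs = np.load('expert_samples/sac_inverted_pendulum_probs.npy')
print(expert_states.shape, expert_actions.shape, expert_probs.shape)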