Exemplo n.º 1
0
    best_score = -np.inf
    scores = []
    eps_history = []

    state_dims = env.observation_space.shape[0]
    num_actions = env.action_space.n
    lr = 0.001
    gamma = 0.99
    agent = PolicyGradientAgent(lr=lr,
                                gamma=gamma,
                                state_dims=state_dims,
                                num_actions=num_actions,
                                env_name='lunar_lander',
                                checkpoint_dir='temp/')
    if test_mode:
        agent.load_model()

    # env = gym.wrappers.Monitor(env, 'temp/lunar_lander',
    #                             video_callable=lambda episode_id: True, force=True)

    for count in range(num_games):
        state = env.reset()
        done = False
        score = 0

        while not done:
            env.render()
            action = agent.get_action(state)
            new_state, reward, done, _ = env.step(action)
            agent.reward_history.append(reward)