Example #1
def test_atari_preprocessing_scale(env_fn):
    # arbitrarily chosen number of steps to take while checking that all observations stay in the required range
    max_test_steps = 10

    for grayscale in [True, False]:
        for scaled in [True, False]:
            env = AtariPreprocessing(env_fn(),
                                     screen_size=84,
                                     grayscale_obs=grayscale,
                                     scale_obs=scaled,
                                     frame_skip=1,
                                     noop_max=0)
            obs = env.reset().flatten()
            done, step_i = False, 0
            max_obs = 1 if scaled else 255
            assert (0 <= obs).all() and (obs <= max_obs).all(), \
                'Obs. must be in range [0,{}]'.format(max_obs)
            # step at most max_test_steps times, stopping early if the episode ends
            while not done and step_i <= max_test_steps:
                obs, _, done, _ = env.step(env.action_space.sample())
                obs = obs.flatten()
                assert (0 <= obs).all() and (obs <= max_obs).all(), \
                    'Obs. must be in range [0,{}]'.format(max_obs)
                step_i += 1

            env.close()
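The tests in this section take an env_fn pytest fixture that builds a raw ALE environment and wrap it with gym.wrappers.AtariPreprocessing. Below is a minimal sketch of how those pieces fit together; the environment id is an assumption, not something taken from the tests.

import gym
from gym.wrappers import AtariPreprocessing

# Hypothetical stand-in for the env_fn fixture; the ROM id is an assumption.
# A *NoFrameskip* variant is used so the wrapper's own frame_skip setting is
# not applied on top of a built-in frame skip.
def env_fn():
    return gym.make("PongNoFrameskip-v4")

env = AtariPreprocessing(env_fn(), screen_size=84, grayscale_obs=True,
                         scale_obs=True, frame_skip=1, noop_max=0)
# scale_obs=True gives float observations in [0, 1]; scale_obs=False keeps
# uint8 observations in [0, 255], matching the max_obs logic in the test.
print(env.observation_space)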
Example #2
def test_atari_preprocessing_grayscale(env_fn):
    import cv2
    import numpy as np
    env1 = env_fn()
    env2 = AtariPreprocessing(env_fn(),
                              screen_size=84,
                              grayscale_obs=True,
                              frame_skip=1,
                              noop_max=0)
    env3 = AtariPreprocessing(env_fn(),
                              screen_size=84,
                              grayscale_obs=False,
                              frame_skip=1,
                              noop_max=0)
    env1.seed(0)
    env2.seed(0)
    env3.seed(0)
    obs1 = env1.reset()
    obs2 = env2.reset()
    obs3 = env3.reset()
    assert obs1.shape == (210, 160, 3)
    assert obs2.shape == (84, 84)
    assert obs3.shape == (84, 84, 3)
    assert np.allclose(
        obs3, cv2.resize(obs1, (84, 84), interpolation=cv2.INTER_AREA))
    obs3_gray = cv2.cvtColor(obs3, cv2.COLOR_RGB2GRAY)
    # the edges of the numbers do not render quite the same in the grayscale, so we ignore them
    assert np.allclose(obs2[10:38], obs3_gray[10:38])
    # the paddle also does not render quite the same
    assert np.allclose(obs2[44:], obs3_gray[44:])

    env1.close()
    env2.close()
    env3.close()
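The sliced comparisons above work because cv2.COLOR_RGB2GRAY applies the standard BT.601 luma weights, which evidently come very close to ALE's own grayscale rendering. For reference, a hand-rolled sketch of that conversion; the rounding is an approximation of OpenCV's fixed-point arithmetic.

import numpy as np

# Hand-rolled counterpart of cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY):
# OpenCV uses the BT.601 luma weights 0.299, 0.587, 0.114.
def rgb_to_gray(rgb_frame):
    weights = np.array([0.299, 0.587, 0.114], dtype=np.float32)
    return np.round(rgb_frame @ weights).astype(np.uint8)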
Example #3
def test_atari_preprocessing_grayscale(env_fn):
    import cv2
    import numpy as np
    env1 = env_fn()
    env2 = AtariPreprocessing(env_fn(),
                              screen_size=84,
                              grayscale_obs=True,
                              frame_skip=1,
                              noop_max=0)
    env3 = AtariPreprocessing(env_fn(),
                              screen_size=84,
                              grayscale_obs=False,
                              frame_skip=1,
                              noop_max=0)
    env1.reset()
    # take these steps to imitate what a FireReset-style wrapper does after reset
    env1.step(1)
    obs1 = env1.step(2)[0]
    obs2 = env2.reset()
    obs3 = env3.reset()
    assert obs1.shape == (210, 160, 3)
    assert obs2.shape == (84, 84)
    assert obs3.shape == (84, 84, 3)
    np.testing.assert_allclose(
        obs3, cv2.resize(obs1, (84, 84), interpolation=cv2.INTER_AREA))
    obs3_gray = cv2.cvtColor(obs3, cv2.COLOR_RGB2GRAY)
    # the edges of the numbers do not render quite the same in the grayscale, so we ignore them
    np.testing.assert_allclose(obs2[10:], obs3_gray[10:])

    env1.close()
    env2.close()
    env3.close()
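The two env1.step(...) calls above imitate what a FireReset-style wrapper (as popularised by the OpenAI Baselines Atari wrappers) performs after every reset. Here is a rough sketch of that idea under the same old-style step/reset API; it is modelled on the Baselines wrapper, not taken from these tests.

import gym

class FireResetEnv(gym.Wrapper):
    """Press FIRE (action 1) and one follow-up action after reset, since
    some Atari games do not start until FIRE is pressed."""

    def reset(self, **kwargs):
        self.env.reset(**kwargs)
        obs, _, done, _ = self.env.step(1)
        if done:
            self.env.reset(**kwargs)
        obs, _, done, _ = self.env.step(2)
        if done:
            obs = self.env.reset(**kwargs)
        return obs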
Example #4
    for n_epi in range(100):
        s = env.reset()
        done = False
        while not done:
            for t in range(T_horizon):
                env.render()
                s = np.array(s).reshape(shape)

                # Forward pass: the network returns a dict whose 'pi' entry
                # holds the action probabilities from the policy head
                od = model(torch.from_numpy(s).float())
                prob = od['pi']
                #print(prob)
                m = Categorical(prob)
                a = m.sample().item()
                s_prime, r, done, info = env.step(a)
                # Transition: (state, action, scaled reward, next state,
                # probability of the chosen action, done flag)
                trn = (s.reshape(shape0), a, r / 100.0, np.array(s_prime),
                       prob[0][a].item(), done)
                model.put_data(trn)
                s = s_prime
                score += r
                if done:
                    break

            # Update the policy on the rollout of up to T_horizon transitions
            model.train_net()

        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score : {:.1f}".format(
                n_epi, score / print_interval))
            score = 0.0

    env.close()
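Example #4 is an excerpt: model, env, score, shape, shape0, T_horizon and print_interval are defined elsewhere in the original script. The one library-specific piece it relies on is torch.distributions.Categorical; a self-contained illustration of that sampling step, with made-up probabilities:

import torch
from torch.distributions import Categorical

prob = torch.tensor([[0.7, 0.3]])   # stand-in for the policy output od['pi']
m = Categorical(prob)               # categorical distribution over 2 actions
a = m.sample()                      # sampled action index, shape (1,)
print(a.item(), prob[0][a.item()].item())  # chosen action and its probability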
Example #5
def main(args):
    env = gym.make(args.env)
    # Rescale images to 42x42 and turn into greyscale
    env = AtariPreprocessing(env,
                             screen_size=42,
                             grayscale_obs=True,
                             noop_max=1,
                             terminal_on_life_loss=True)

    # A quick trick to give agent some sense of history/motion:
    # Give N successive frames instead of just one to the agent.
    # This deque will store N last frames to do this.
    state_stacker = deque(maxlen=FRAME_STACK_SIZE)
    # Rewards of the last 100 episodes, for running-average reporting
    new_deque = deque(maxlen=100)

    # Build models according to image shape and number of actions
    # that are available.
    # If we are evaluating, load existing model instead
    state_shape = RESOLUTION + (FRAME_STACK_SIZE, )
    model = None
    target_model = None
    if not args.evaluate:
        # Construct new models
        model, target_model = build_models(state_shape, env.action_space.n)
    else:
        # Load existing model
        model = keras.models.load_model(args.model_path)

    # Initialize replay memory (if training)
    replay_memory = None
    if not args.evaluate:
        replay_memory = ReplayMemory(REPLAY_SIZE, state_shape)

    # Open log file if we want to output results
    log_file = None
    if args.log is not None:
        log_file = open(args.log, "w")

    # Main training loop
    step_ctr = 0
    # Running totals for the Q-value statistics reported after each episode
    q_values_counter = 0
    q_values_summation = 0
    while step_ctr < args.steps:
        terminal = False
        episode_reward = 0
        # Keep track of losses and of per-step mean Q-values in this episode
        losses = []
        episode_q_values = []

        # Reset frame stacker to empty frames
        state_stacker.clear()
        for i in range(FRAME_STACK_SIZE):
            state_stacker.append(np.zeros(RESOLUTION + (1, )))

        s1 = env.reset()
        # Preprocess state
        s1 = preprocess_state(s1, state_stacker)
        while not terminal:
            action, q_values = get_action(s1, model, env.action_space.n)
            # Store this step's Q-values for the episode and running averages
            episode_q_values.append(np.mean(q_values))
            q_values_counter += len(q_values)
            q_values_summation += np.sum(q_values)
            s2, reward, terminal, info = env.step(action)
            #print(reward)
            s2 = preprocess_state(s2, state_stacker)
            step_ctr += 1
            # Count episodic reward
            episode_reward += reward

            if args.show:
                env.render()

            # Skip training/replay memory stuff if we are evaluating
            if not args.evaluate:
                # Store the experience to replay memory
                replay_memory.add_experience(s1, action, reward, s2, terminal)

                # Check if we should do updates or saving model
                if (step_ctr % UPDATE_RATE) == 0:
                    if replay_memory.num_total > SAMPLES_TILL_TRAIN:
                        losses.append(
                            update_model(model, target_model, replay_memory))
                if (step_ctr % TARGET_UPDATE_RATE) == 0:
                    update_target_model(model, target_model)
                if (step_ctr % SAVE_MODEL_EVERY_STEPS) == 0:
                    model.save(args.model_path)

            # s2 becomes s1 for the next iteration
            s1 = s2

            # If we want to limit fps, sleep little bit
            if args.limit_fps:
                sleep(1 / 35.0)

        # Record this episode's reward for the 100-episode running average
        new_deque.append(episode_reward)

        # To avoid div-by-zero
        if len(losses) == 0:
            losses.append(0.0)

        # Report average training loss, the average reward over the last 100
        # episodes, and the average Q-value of this episode
        print('Average Q-value of this episode: ', np.average(episode_q_values))
        print('Average training loss: ', np.average(losses))
        print('Average reward over last 100 episodes: ', np.average(new_deque))
        running_average_q_values = q_values_summation / q_values_counter
        print('Running average of Q-values: ', running_average_q_values)
        # Legend:
        #  - Episode reward: Reward from the previous episode
        #  - Steps: Total number of agent steps taken in this training run
        s = "Episode reward: {:.1f}\tSteps: {}\t".format(
            episode_reward,
            step_ctr,
        )
        # Print our log message
        print(s)
        # If we have a log file, print it there as well
        if log_file is not None:
            log_file.write(s + "\n")

    env.close()
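The frame-stacking trick described near the top of Example #5 depends on a preprocess_state helper that is not shown. Below is a hypothetical version consistent with the zero-initialised (H, W, 1) frames above; the stack size of 4 and the division by 255 are assumptions.

import numpy as np
from collections import deque

FRAME_STACK_SIZE = 4        # assumption; stands in for the constant used in main()
RESOLUTION = (42, 42)       # follows from screen_size=42 in AtariPreprocessing

# Hypothetical preprocess_state: append the newest frame to the deque and
# return the stacked frames as an (H, W, FRAME_STACK_SIZE) network input.
def preprocess_state(frame, state_stacker):
    frame = np.asarray(frame, dtype=np.float32).reshape(RESOLUTION + (1,)) / 255.0
    state_stacker.append(frame)
    return np.concatenate(list(state_stacker), axis=-1)

# Usage, mirroring the episode setup in main():
stacker = deque(maxlen=FRAME_STACK_SIZE)
for _ in range(FRAME_STACK_SIZE):
    stacker.append(np.zeros(RESOLUTION + (1,)))
print(preprocess_state(np.zeros(RESOLUTION, dtype=np.uint8), stacker).shape)  # (42, 42, 4)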