Example No. 1
    # Optional: wrap the env with gym's Monitor to record videos and statistics
    # env = Monitor(env, 'data/monitor', force=True, video_callable=lambda i: i % 1 != 0)
    # Stack the last `num_stack` frames so the agent sees some motion history
    env = FrameStack(env, num_stack, False)

    episode_count = 1
    reward = 0
    done = False
    score = 0.0
    print_interval = 3

    for n_epi in range(100):
        s = env.reset()
        done = False
        while not done:
            # Collect at most T_horizon transitions per rollout
            for t in range(T_horizon):
                env.render()
                s = np.array(s).reshape(shape)

                # Forward pass: the model returns a dict with the policy head 'pi'
                od = model(torch.from_numpy(s).float())
                prob = od['pi']
                # Sample an action from the categorical policy distribution
                m = Categorical(prob)
                a = m.sample().item()
                s_prime, r, done, info = env.step(a)
                # Buffer the transition (s, a, scaled r, s', pi(a|s), done) for the update
                trn = (s.reshape(shape0), a, r / 100.0, np.array(s_prime),
                       prob[0][a].item(), done)
                model.put_data(trn)
                s = s_prime
                score += r
                if done:
                    break
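
The action-selection step above builds a Categorical distribution from the policy head's probabilities and samples an action index from it. A standalone sketch of just that sampling step, using made-up probabilities rather than anything produced by the model in the excerpt:

import torch
from torch.distributions import Categorical

# Toy policy output: probabilities over 4 actions for a batch of one state
prob = torch.tensor([[0.1, 0.2, 0.3, 0.4]])
m = Categorical(prob)
a = m.sample().item()                    # sampled action index
log_prob = m.log_prob(torch.tensor(a))   # log pi(a|s); the excerpt stores prob[0][a] instead
print(a, prob[0][a].item(), log_prob.item())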
Example No. 2
def main(args):
    env = gym.make(args.env)
    # Rescale images to 42x42 and convert them to grayscale
    env = AtariPreprocessing(env,
                             screen_size=42,
                             grayscale_obs=True,
                             noop_max=1,
                             terminal_on_life_loss=True)

    # A quick trick to give the agent some sense of history/motion:
    # give it N successive frames instead of just one.
    # This deque stores the N most recent frames for that purpose.
    state_stacker = deque(maxlen=FRAME_STACK_SIZE)
    # Keep the rewards of the last 100 episodes for a moving average
    reward_deque = deque(maxlen=100)

    # Build models according to image shape and number of actions
    # that are available.
    # If we are evaluating, load existing model instead
    state_shape = RESOLUTION + (FRAME_STACK_SIZE, )
    model = None
    target_model = None
    if not args.evaluate:
        # Construct new models
        model, target_model = build_models(state_shape, env.action_space.n)
    else:
        # Load existing model
        model = keras.models.load_model(args.model_path)

    # Initialize replay memory (if training)
    replay_memory = None
    if not args.evaluate:
        replay_memory = ReplayMemory(REPLAY_SIZE, state_shape)

    # Open log file if we want to output results
    log_file = None
    if args.log is not None:
        log_file = open(args.log, "w")

    # Main training loop
    step_ctr = 0
    q_values_counter = 0
    q_values_summation = 0
    while step_ctr < args.steps:
        terminal = False
        episode_reward = 0
        # Keep track of losses and Q-values seen during this episode
        losses = []
        episode_q_values = []

        # Reset frame stacker to empty frames
        state_stacker.clear()
        for i in range(FRAME_STACK_SIZE):
            state_stacker.append(np.zeros(RESOLUTION + (1, )))

        s1 = env.reset()
        # Preprocess state
        s1 = preprocess_state(s1, state_stacker)
        while not terminal:
            action, q_values = get_action(s1, model, env.action_space.n)
            # Store this step's Q-values so the episode average can be reported later
            episode_q_values.append(np.mean(q_values))
            s2, reward, terminal, info = env.step(action)
            s2 = preprocess_state(s2, state_stacker)
            step_ctr += 1
            # Count episodic reward
            episode_reward += reward

            if args.show:
                env.render()

            # Skip training/replay memory stuff if we are evaluating
            if not args.evaluate:
                # Store the experience to replay memory
                replay_memory.add_experience(s1, action, reward, s2, terminal)

                # Check if we should do updates or saving model
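                # (Assumed helper semantics, not shown in this file: update_model
                # performs one gradient step on a sampled replay batch toward the
                # DQN target r + gamma * max_a' Q_target(s', a'), and
                # update_target_model copies the online weights into the target net.)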
                if (step_ctr % UPDATE_RATE) == 0:
                    if replay_memory.num_total > SAMPLES_TILL_TRAIN:
                        losses.append(
                            update_model(model, target_model, replay_memory))
                if (step_ctr % TARGET_UPDATE_RATE) == 0:
                    update_target_model(model, target_model)
                if (step_ctr % SAVE_MODEL_EVERY_STEPS) == 0:
                    model.save(args.model_path)

            # s2 becomes s1 for the next iteration
            s1 = s2

            # If we want to limit fps, sleep little bit
            if args.limit_fps:
                sleep(1 / 35.0)

        # Track this episode's reward (the deque keeps only the last 100 episodes)
        reward_deque.append(episode_reward)

        # To avoid div-by-zero
        if len(losses) == 0:
            losses.append(0.0)

        # Episode statistics:
        #  1) Average training loss of this episode
        #  2) Average reward over the last 100 episodes
        #  3) Average Q-value of this episode, plus its running average over training
        episode_q_average = np.average(episode_q_values)
        q_values_counter += len(episode_q_values)
        q_values_summation += np.sum(episode_q_values)
        running_average_q_values = q_values_summation / q_values_counter
        print('Average training loss: ', np.average(losses))
        print('Average reward over last 100 episodes: ', np.average(reward_deque))
        print('Average Q-value of this episode: ', episode_q_average)
        print('Running average Q-value: ', running_average_q_values)
        # Legend:
        #  - Episode reward: Reward from the previous episode
        #  - Steps: Total number of agent steps taken during this training run
        s = "Episode reward: {:.1f}\tSteps: {}\t".format(
            episode_reward,
            step_ctr,
        )
        # Print our log message
        print(s)
        # If we have a log file, print it there as well
        if log_file is not None:
            log_file.write(s + "\n")

    env.close()
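
Example No. 2 calls a preprocess_state helper that is not shown: it is expected to normalize each grayscale frame, push it into the state_stacker deque, and return a stacked observation whose shape matches state_shape = RESOLUTION + (FRAME_STACK_SIZE, ). A possible sketch, assuming 42x42 frames stacked along the last (channel) axis; the helper in the original project may differ:

from collections import deque

import numpy as np

RESOLUTION = (42, 42)       # assumed, matching screen_size=42 above
FRAME_STACK_SIZE = 4        # assumed stack depth


def preprocess_state(frame, state_stacker):
    # Scale pixel values to [0, 1] and give the frame an explicit channel axis
    frame = np.asarray(frame, dtype=np.float32) / 255.0
    frame = frame.reshape(RESOLUTION + (1, ))
    # Push the newest frame; the deque drops the oldest one automatically
    state_stacker.append(frame)
    # Concatenate the stored frames along the channel axis -> (42, 42, FRAME_STACK_SIZE)
    return np.concatenate(list(state_stacker), axis=-1)


# Usage: prefill with empty frames exactly like the main loop does
stacker = deque(maxlen=FRAME_STACK_SIZE)
for _ in range(FRAME_STACK_SIZE):
    stacker.append(np.zeros(RESOLUTION + (1, )))
obs = preprocess_state(np.zeros(RESOLUTION, dtype=np.uint8), stacker)
print(obs.shape)   # (42, 42, 4)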