Example #1
def main():

    env = gym.make('PongDeterministic-v4')
    network_input_shape = (4, 210, 160)  # Dimension ordering: 'th' (channels first)
    actions = env.action_space.n

    agent = DQN_CNN(actions,
                    network_input_shape,
                    learning_rate=LEARNING_RATE,
                    discount_factor=DISCOUNT_FACTOR)
    frame_counter = 0

    for ep in range(1, EPISODES + 1):
        # Start episode
        score = 0
        # Observe reward and initialize first state
        obs = preprocess_observation(env.reset())

        # Initialize the first state with the same 4 images
        current_state = np.array([obs, obs, obs, obs])
        frame_counter += 1
        for t in range(MAX_STEPS):
            # Select an action using the DQA
            action = agent.act(np.asarray([current_state]))
            # Observe reward and next state
            obs, reward, done, info = env.step(action)
            obs = preprocess_observation(obs)
            next_state = get_next_state(current_state, obs)
            frame_counter += 1
            # Store transition in replay memory
            clipped_reward = np.clip(reward, -1, 1)  # Clip the reward
            agent.add_experience(np.asarray([current_state]),
                                 action, clipped_reward,
                                 np.asarray([next_state]), done)
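
The example above calls preprocess_observation and get_next_state without defining them. A minimal sketch of what such helpers could look like, assuming grayscale conversion via Pillow and a channels-first stack of the 4 most recent frames (the choice of library and the decision to keep the full 210x160 resolution are assumptions made only to match the (4, 210, 160) input shape declared above):

import numpy as np
from PIL import Image


def preprocess_observation(obs):
    # Assumed preprocessing: convert the raw RGB Atari frame to grayscale,
    # keeping the full 210x160 resolution to match the (4, 210, 160)
    # network input shape used above.
    return np.asarray(Image.fromarray(obs).convert('L'), dtype=np.uint8)


def get_next_state(current_state, obs):
    # Drop the oldest of the 4 stacked frames and append the newest
    # preprocessed observation (channels-first ordering).
    return np.append(current_state[1:], [obs], axis=0)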
Example #2
            # Stop the episode if it takes too long
            if frame_counter > args.max_frames_number:
                DQA.quit()

            # Select an action using the DQA
            action = DQA.get_action(np.asarray([current_state]))

            # Observe reward and next state
            obs, reward, done, info = env.step(action)
            # Render the game
            if args.video:
                imshow('state', obs)
                waitKey(1)
                #vid.write(state)
            obs = utils.preprocess_observation(obs)
            next_state = utils.get_next_state(current_state, obs)

            frame_counter += 1

            # Store transition in replay memory
            clipped_reward = np.clip(reward, -1, 1)  # Clip the reward
            DQA.add_experience(np.asarray([current_state]), action,
                               clipped_reward, np.asarray([next_state]), done)

            # Train the agent
            if t % args.update_freq == 0 and len(DQA.experiences) >= args.replay_start_size:
                DQA.train()
                # Every C DQN updates, update DQN_target
                if (DQA.training_count % args.target_network_update_freq == 0
                        and DQA.training_count >= args.target_network_update_freq):
                    DQA.reset_target_network()
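
The reset_target_network call above corresponds to the periodic target-network synchronization in DQN. A minimal sketch of such a method, assuming DQN and DQN_target are Keras models held by the agent (the attribute names and the Keras API are assumptions, not taken from the source):

def reset_target_network(self):
    # Copy the online Q-network's weights into the frozen target network.
    # The target then stays fixed for the next C training updates, which
    # stabilizes the bootstrapped Q-value targets.
    self.DQN_target.set_weights(self.DQN.get_weights())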
Example #3
def evaluate(DQA, args, logger, env):
    global max_mean_score

    evaluation_csv = 'evaluation.csv'
    logger.to_csv(evaluation_csv, 'length,score')
    
    scores = list()
    frame_counter = 0
    episode = 0

    while frame_counter < args.validation_frames:
        remaining_random_actions = args.initial_random_actions
        obs = utils.preprocess_observation(env.reset())

        frame_counter += 1
        # Initialize the first state with the same 4 images
        current_state = np.array([obs, obs, obs, obs])
        t = 0
        score = 0

        # Start episode
        while True:
            # Render the game if video output is not suppressed
            if args.video:
                env.render()

            action = DQA.get_action(np.asarray([current_state]),
                                    testing=True,
                                    force_random=remaining_random_actions > 0)
            obs, reward, done, info = env.step(action)
            obs = utils.preprocess_observation(obs)
            current_state = utils.get_next_state(current_state, obs)

            if remaining_random_actions > 0:
                remaining_random_actions -= 1

            score += reward
            t += 1
            frame_counter += 1

            # End episode
            if done or t > args.max_episode_length:
                episode += 1
                print('Episode %d end\n---------------\nFrame counter: %d\n' % (episode, frame_counter))
                print('Length: %d, Score: %.1f\n\n' % (t, score))
                # Save episode data in the evaluation csv
                logger.to_csv(evaluation_csv, [t, score])
                break
                
        scores.append([t, score])

    scores = np.asarray(scores)
    max_indices = np.argwhere(scores[:, 1] == np.max(scores[:, 1])).ravel()
    max_idx = np.random.choice(max_indices)

    # Save best model
    if max_mean_score < np.mean(scores[:, 1]):  # compare mean over the score column only
        max_mean_score = np.mean(scores[:, 1])
        DQA.DQN.save(append='_best')

    return scores[max_idx, :].ravel()
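
get_action is called here with testing=True and a force_random flag for the warm-up random actions. A plausible epsilon-greedy sketch under those assumptions (the fixed evaluation epsilon of 0.05, the self.epsilon and self.actions attributes, and the Keras predict call are all assumptions, not the source's implementation):

import numpy as np


def get_action(self, state, testing=False, force_random=False):
    # Epsilon-greedy selection: a small fixed epsilon during evaluation,
    # the (annealed) training epsilon otherwise.
    epsilon = 0.05 if testing else self.epsilon
    if force_random or np.random.random() < epsilon:
        return np.random.randint(self.actions)
    # Greedy action: highest predicted Q-value for the current 4-frame state.
    q_values = self.DQN.predict(state)
    return int(np.argmax(q_values))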
Example #4
def evaluate(DQA, args, logger):
    global max_mean_score

    evaluation_csv = 'evaluation.csv'
    logger.to_csv(evaluation_csv, 'length,score')
    env = gym.make(args.environment)
    scores = list()
    frame_counter = 0
    episode = 0

    while frame_counter < args.validation_frames:
        remaining_random_actions = args.initial_random_actions
        obs = utils.preprocess_observation(env.reset())

        frame_counter += 1
        # Initialize the first state with the same 4 images
        current_state = np.array([obs, obs, obs, obs])
        t = 0
        score = 0

        # Start episode
        while True:
            # Render the game if video output is not suppressed
            if args.video:
                env.render()

            action = DQA.get_action(np.asarray([current_state]),
                                    testing=True,
                                    force_random=remaining_random_actions > 0)
            obs, reward, done, info = env.step(action)
            obs = utils.preprocess_observation(obs)
            current_state = utils.get_next_state(current_state, obs)

            if remaining_random_actions > 0:
                remaining_random_actions -= 1

            score += reward
            t += 1
            frame_counter += 1

            # End episode
            if done or t > args.max_episode_length:
                episode += 1
                print('Episode %d end\n---------------\nFrame counter: %d\n' % 
                      (episode, frame_counter))
                print('Length: %d, Score: %f\n\n' % (t, score))
                # Save episode data in the evaluation csv
                logger.to_csv(evaluation_csv, [t, score])
                break
                
        scores.append([t, score])

    scores = np.asarray(scores)
    max_indices = np.argwhere(scores[:, 1] == np.max(scores[:, 1])).ravel()
    max_idx = np.random.choice(max_indices)

    # Save best model
    if max_mean_score < np.mean(scores[:, 1]):  # compare mean over the score column only
        max_mean_score = np.mean(scores[:, 1])
        DQA.DQN.save(append='_best')

    return scores[max_idx, :].ravel()
Example #5
        while t < args.max_episode_length:
            # Stop the episode if it takes too long
            if frame_counter > args.max_frames_number:
                DQA.quit()

            # Render the game
            if args.video:
                env.render()

            # Select an action using the DQA
            action = DQA.get_action(np.asarray([current_state]))

            # Observe reward and next state
            obs, reward, done, info = env.step(action)
            obs = utils.preprocess_observation(obs)
            next_state = utils.get_next_state(current_state, obs)

            frame_counter += 1

            # Store transition in replay memory
            clipped_reward = np.clip(reward, -1, 1)  # Clip the reward
            DQA.add_experience(np.asarray([current_state]),
                               action,
                               clipped_reward,
                               np.asarray([next_state]),
                               done)

            # Train the agent
            if t % args.update_freq == 0 and len(DQA.experiences) >= args.replay_start_size:
                DQA.train()
                # Every C DQN updates, update DQN_target
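
DQA.train() above is where the actual Q-learning update would happen. A minimal sketch of a replay-memory minibatch update in the usual DQN style, assuming the stored experiences are dicts with state/action/reward/next_state/done fields, a batch size of 32, and Keras predict/train_on_batch calls (all of these are assumptions, not the source's implementation):

import random

import numpy as np


def train(self):
    # Sample a random minibatch of stored transitions from replay memory.
    batch = random.sample(self.experiences, 32)

    states = np.concatenate([e['state'] for e in batch])
    next_states = np.concatenate([e['next_state'] for e in batch])

    # Bootstrap the targets from the frozen target network (DQN_target).
    q_values = self.DQN.predict(states)
    next_q = self.DQN_target.predict(next_states)
    for i, e in enumerate(batch):
        target = e['reward']
        if not e['done']:
            target += self.discount_factor * np.max(next_q[i])
        q_values[i, e['action']] = target

    # One gradient step on the online network towards the updated targets.
    self.DQN.train_on_batch(states, q_values)
    self.training_count += 1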