Example #1
 def memorize(self, state, action, reward, next_state, done):
     mem = self.memory.get(reward)
     if mem is None:
         mem = deque(maxlen=10 * self.batch_size * self.model_instances)
     mem.append(
         (self.preprocess_state(state), encode_action(action), reward,
          self.preprocess_state(next_state), done))
     self.memory[reward] = mem
     self.visits[state, encode_action(action)] += 1
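The memory above is partitioned by reward value, one deque per distinct reward seen. A minimal sketch of how a training batch could be drawn from such a structure; the sample_batch helper below is an illustration and an assumption, not part of the original class:

import random

def sample_batch(self):
    # Hypothetical sampler: draw transitions from every per-reward deque
    # so that rare reward values are still represented in each batch.
    per_bucket = max(1, self.batch_size // max(1, len(self.memory)))
    batch = []
    for transitions in self.memory.values():
        k = min(per_bucket, len(transitions))
        batch.extend(random.sample(list(transitions), k))
    random.shuffle(batch)
    return batch[:self.batch_size]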
Example #2
def play_random(board, save_normalized_matrix=True):

    steps = []
    r = np.random.RandomState()  # one random source for the whole game
    render_board(board)

    while True:

        # Exit if the window close button is pressed
        for event in pygame.event.get():
            if event.type == QUIT:
                pygame.quit()
                sys.exit()

        # Play randomly
        action = r.choice(list(range(GameEnv.NB_ACTIONS)))  # select a random action
        matrix = board.normalized_matrix if save_normalized_matrix else board.matrix  # state before the move
        moved = board.move(action)

        if moved:
            print()
            print(board.matrix)
            print("SCORE:", board.score, "\tSTEP:", board.n_steps_valid, "\tHIGHEST VALUE:", board.highest_value)
            steps.append(Step(matrix=matrix, action=action, action_encoded=encode_action(action)))
            render_board(board)

            if board.is_gameover():
                print("GAME OVER!")
                return Game(steps=steps, score=board.score, random_seed=board.random_seed, is_gameover=True)            

        clock.tick(5)
        pygame.display.flip()
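A short driver for collecting several random games with play_random; collect_random_games and the GameEnv() board constructor are assumptions made for illustration (the snippet itself only references GameEnv.NB_ACTIONS):

def collect_random_games(n_games, save_normalized_matrix=True):
    # Hypothetical helper: play n_games random games and return the records.
    games = []
    for _ in range(n_games):
        board = GameEnv()  # assumed constructor for a fresh board
        games.append(play_random(board, save_normalized_matrix))
    return games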
Example #3
 def train(self, state, action, reward, next_state):
     future_reward = reward + self.gamma * np.max(self.Qmean[next_state, :])
     encoded_action = encode_action(action)
     # update the mean, the sum of squared rewards and the variance, in this exact order
     self.update_mean(state, encoded_action, future_reward)
     self.update_sum_squared_rewards(state, encoded_action, future_reward)
     self.update_variance(state, encoded_action)
     self.visits[state, encoded_action] += 1
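The three update_* calls suggest running estimates of the mean and variance of the bootstrapped return per (state, action) pair, maintained from a sum of squared rewards. A minimal sketch of what they might look like, assuming Qsumsq and Qvar arrays alongside the Qmean used above (both names are assumptions):

def update_mean(self, state, action, reward):
    # incremental mean; visits has not been incremented yet, hence the +1
    n = self.visits[state, action] + 1
    self.Qmean[state, action] += (reward - self.Qmean[state, action]) / n

def update_sum_squared_rewards(self, state, action, reward):
    self.Qsumsq[state, action] += reward ** 2

def update_variance(self, state, action):
    # Var[X] = E[X^2] - E[X]^2 from the running sums above
    n = self.visits[state, action] + 1
    self.Qvar[state, action] = (self.Qsumsq[state, action] / n
                                - self.Qmean[state, action] ** 2)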
    def generate_output_states(self, input_state):
        next_states = []
        # Generate the predicted next state for each action using the autoencoder
        for i in range(self.action_dim):
            ohe_action = encode_action(self.action_dim, i)
            ohe_action = np.expand_dims(ohe_action, axis=0)

            predicted_next = self.predict(input_state, ohe_action)
            predicted_next = (predicted_next[0, :, :, :] * 255.).astype(
                np.uint8)
            next_states.append(preprocess_frame_bw_next_state(predicted_next))

        return np.stack(next_states, axis=2)
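generate_output_states calls encode_action(n_actions, index) and feeds the result to the predictor after an expand_dims, which is consistent with a one-hot encoding. A minimal sketch under that assumption (note that the other examples use a one-argument encode_action(action), so the signature differs between projects):

import numpy as np

def encode_action(n_actions, action):
    # one-hot encode an integer action index (assumed behaviour)
    one_hot = np.zeros(n_actions, dtype=np.float32)
    one_hot[action] = 1.0
    return one_hot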
def generate_agent_episodes(args):

    full_path = ROLLOUT_DIR + '/rollout_' + args.env_name

    if not os.path.exists(full_path):
        os.umask(0o000)
        os.makedirs(full_path)

    env_name = args.env_name
    total_episodes = args.total_episodes
    time_steps = args.time_steps

    envs_to_generate = [env_name]

    for current_env_name in envs_to_generate:
        print("Generating data for env {}".format(current_env_name))

        env = gym.make(current_env_name)  # Create the environment
        env.seed(0)

        # First load the DQN agent and the predictive auto encoder with their weights
        agent = Agent(gamma=0.99,
                      epsilon=0.0,
                      alpha=0.0001,
                      input_dims=(104, 80, 4),
                      n_actions=env.action_space.n,
                      mem_size=25000,
                      eps_min=0.0,
                      batch_size=32,
                      replace=1000,
                      eps_dec=1e-5,
                      env_name=current_env_name)
        agent.load_models()

        predictor = load_predictive_model(current_env_name, env.action_space.n)

        s = 0

        while s < total_episodes:

            rollout_file = os.path.join(full_path, 'rollout-%d.npz' % s)

            observation = env.reset()
            frame_queue = deque(maxlen=4)
            dqn_queue = deque(maxlen=4)

            t = 0

            next_state_sequence = []
            correct_state_sequence = []
            total_reward = 0
            while t < time_steps:
                # preprocess frames for predictive model and dqn
                converted_obs = preprocess_frame(observation)
                converted_obs_dqn = preprocess_frame_dqn(observation)

                if t == 0:
                    for i in range(4):
                        frame_queue.append(converted_obs)
                        dqn_queue.append(converted_obs_dqn)
                else:
                    frame_queue.pop()
                    dqn_queue.pop()
                    frame_queue.appendleft(converted_obs)
                    dqn_queue.appendleft(converted_obs_dqn)

                observation_states = np.concatenate(frame_queue, axis=2)
                dqn_states = np.concatenate(dqn_queue, axis=2)
                next_states = predictor.generate_output_states(
                    np.expand_dims(observation_states, axis=0))
                next_state_sequence.append(next_states)
                action = agent.choose_action(dqn_states)
                correct_state_sequence.append(
                    encode_action(env.action_space.n, action))

                observation, reward, done, info = env.step(
                    action)  # step the environment with the agent's action
                total_reward += reward
                t = t + 1

            print(
                "Episode {} finished after {} timesteps with reward {}".format(
                    s, t, total_reward))

            np.savez_compressed(rollout_file,
                                next=next_state_sequence,
                                correct=correct_state_sequence)

            s = s + 1

        env.close()
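generate_agent_episodes only reads env_name, total_episodes and time_steps from args; a minimal command-line entry point could look like the following (flag names mirror those attributes, the defaults are assumptions):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Generate agent rollouts')
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--total_episodes', type=int, default=100)
    parser.add_argument('--time_steps', type=int, default=1000)
    generate_agent_episodes(parser.parse_args())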
def main(args):

    env_name = args.env_name
    total_episodes = args.total_episodes
    time_steps = args.time_steps
    informed = args.informed
    # action_refresh_rate = args.action_refresh_rate

    if informed:
        full_path = ROLLOUT_DIR + '/informed_rollout_' + args.env_name
    else:
        full_path = ROLLOUT_DIR + '/random_rollout_' + args.env_name

    if not os.path.exists(full_path):
        os.umask(0o000)
        os.makedirs(full_path)

    envs_to_generate = [env_name]

    for current_env_name in envs_to_generate:
        print("Generating data for env {}".format(current_env_name))

        env = gym.make(current_env_name)  # Create the environment
        env.seed(0)

        s = 0

        if informed:
            agent = load_dqn(env)

        while s < total_episodes:

            rollout_file = os.path.join(full_path, 'rollout-%d.npz' % s)

            observation = env.reset()
            frame_queue = deque(maxlen=4)
            dqn_queue = deque(maxlen=4)

            t = 0

            obs_sequence = []
            action_sequence = []
            next_sequence = []

            while t < time_steps:

                # convert image to greyscale, downsize
                converted_obs = preprocess_frame(observation)

                if t == 0:
                    for i in range(4):
                        frame_queue.append(converted_obs)
                else:
                    frame_queue.pop()
                    frame_queue.appendleft(converted_obs)

                stacked_state = np.concatenate(frame_queue, axis=2)
                obs_sequence.append(stacked_state)

                if informed:
                    dqn_obs = preprocess_frame_dqn(observation)
                    if t == 0:
                        for i in range(4):
                            dqn_queue.append(dqn_obs)
                    else:
                        dqn_queue.pop()
                        dqn_queue.appendleft(dqn_obs)
                    stacked = np.concatenate(dqn_queue, axis=2)
                    action = agent.choose_action(stacked)
                else:
                    action = env.action_space.sample()

                action_sequence.append(
                    encode_action(env.action_space.n, action))

                observation, _, _, _ = env.step(action)  # step the environment with the chosen action
                t = t + 1

                next_sequence.append(preprocess_frame(observation))

            print("Episode {} finished after {} timesteps".format(s, t))

            np.savez_compressed(rollout_file,
                                obs=obs_sequence,
                                actions=action_sequence,
                                next_frame=next_sequence)

            s = s + 1

        env.close()
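The rollouts written by main can be read back with numpy; the keys obs, actions and next_frame come from the savez_compressed call above, while the file path here is just an example:

import numpy as np

data = np.load('rollout-0.npz')
obs = data['obs']                   # stacked 4-frame observations
actions = data['actions']           # encoded actions, one per step
next_frames = data['next_frame']    # preprocessed next frames
print(obs.shape, actions.shape, next_frames.shape)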
Example #7
def play(board, save_normalized_matrix=True):
    """
    Parameters
    ----------
    board : object
        Game board exposing ``move``, ``matrix``, ``score`` and the other state used below.
    save_normalized_matrix : bool
        Whether to save the normalized (log2-transformed) matrix or the original one.

    Returns
    -------
    collections.namedtuple
        Game with recorded steps.
    """

    steps = []
    render_board(board)

    while True:
        for event in pygame.event.get():
            if event.type == QUIT:
                pygame.quit()
                sys.exit()
            if event.type == pygame.KEYDOWN:
                if event.key in POSSIBLE_ACTIONS:
                    matrix = board.normalized_matrix if save_normalized_matrix else board.matrix
                    action = POSSIBLE_ACTIONS[event.key]
                    moved = board.move(action)  # True if the move changed the board

                    if moved:
                        print()
                        print(board.matrix)
                        print("SCORE:", board.score, "\tSTEP:", board.n_steps_valid, "\tHIGHEST VALUE:", board.highest_value)
                        steps.append(Step(matrix=matrix, action=action, action_encoded=encode_action(action)))
                        render_board(board)

                        if board.is_gameover():
                            print("GAME OVER!")
                            return Game(steps=steps, score=board.score, random_seed=board.random_seed, is_gameover=True)
                    else:
                        print("\nCannot move to this direction!")
                elif event.key == pygame.K_q:
                    screen.fill(BLACK)
                    return Game(steps=steps, random_seed=board.random_seed, is_gameover=False)
                elif event.key == pygame.K_p:
                    screen.fill(BLACK)
                    return "quit"

        clock.tick(60)
        pygame.display.flip()
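play looks actions up in POSSIBLE_ACTIONS by pygame key code, while play_random draws integer actions from range(GameEnv.NB_ACTIONS), so the mapping is presumably key code to action index. A minimal sketch of such a mapping; the specific key-to-index assignment is an assumption:

import pygame

POSSIBLE_ACTIONS = {
    pygame.K_UP: 0,
    pygame.K_DOWN: 1,
    pygame.K_LEFT: 2,
    pygame.K_RIGHT: 3,
}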
Example #8
                feed_dict={X: get_state(state, env.observation_space.n)})
            # decode actions
            actions = [decode_action(a) for a in actions]
            # epsilon-greedy action
            if np.random.rand(1) < epsilon:
                actions[0] = env.action_space.sample()
            # get the new state
            next_state, reward, done, _ = env.step(actions[0])
            # obtain the next Q values by feeding the state through the network
            predNextQ = sess.run(
                Qout,
                feed_dict={X: get_state(next_state, env.observation_space.n)})
            targetQ = allQ
            targetQ[
                0,
                encode_action(actions[0])] = reward + gamma * np.max(predNextQ)
            # train the network using the target and predicted Q values
            sess.run(updateModel,
                     feed_dict={
                         X: get_state(state, env.observation_space.n),
                         nextQ: targetQ
                     })
            rewards.append(reward)
            state = next_state

            if done and epsilon > 0.01 and np.mean(
                    rewards) > 0 and episode >= 1000:
                # reduce the chance of random action as we train the model
                epsilon = 1. / (episode / 50 + 10)

        performance.append(np.sum(rewards))
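In this loop decode_action turns a network output index into an environment action and encode_action maps an action back to a column of the Q output, so the two behave as inverses over a fixed action set. A minimal sketch under that assumption (the ACTIONS list is illustrative):

ACTIONS = list(range(4))  # assumed fixed set of environment actions

def decode_action(index):
    # network output index -> environment action
    return ACTIONS[index]

def encode_action(action):
    # environment action -> index into the Q-value row
    return ACTIONS.index(action)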