Example #1
    def __init__(self,
                 pool,
                 observation_shape,
                 n_actions,
                 n_parallel_games=1,
                 replay_seq_len=20,
                 replay_batch_size=20,
                 pool_size=None,
                 n_steps=3,
                 gamma=0.99):
        """
          :type n_parallel_games: int
                n_actions: int
          """
        # Parameters for training

        self.n_parallel_games = n_parallel_games
        self.replay_seq_len = replay_seq_len
        self.replay_batch_size = replay_batch_size
        self.pool_size = pool_size
        self.n_steps = n_steps
        self.gamma = gamma
        self.loss = None

        # image observation
        self.observation_layer = InputLayer(observation_shape)
        self.n_actions = n_actions
        self.resolver, self.agent = self.build_model()

        weights = lasagne.layers.get_all_params(self.resolver, trainable=True)
        self.applier_fun = self.agent.get_react_function()

        # Prepare replay pool
        env = SessionPoolEnvironment(observations=self.observation_layer,
                                     actions=self.resolver,
                                     agent_memories=self.agent.state_variables)

        preceding_memory_states = list(pool.prev_memory_states)

        # get interaction sessions
        observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = \
            pool.interact(self.step, n_steps=self.replay_seq_len)
        env.load_sessions(observation_tensor, action_tensor, reward_tensor,
                          is_alive_tensor, preceding_memory_states)

        if pool_size is None:
            batch_env = env
        else:
            batch_env = env.sample_session_batch(self.replay_batch_size)

        self.loss = self.build_loss(batch_env)
        self.eval_fun = self.build_eval_fun(batch_env)

        updates = lasagne.updates.adadelta(self.loss,
                                           weights,
                                           learning_rate=0.01)
        train_fun = theano.function([], [self.loss], updates=updates)
        super(BasicRLAgent, self).__init__(env, pool, train_fun, pool_size,
                                           replay_seq_len)
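
The tail of this example (collect trainable weights, build a loss, get adadelta updates, compile a train function) is the standard Theano/Lasagne training pattern. Below is a minimal, self-contained sketch of that same pattern on a toy network; every name in it is illustrative and not part of AgentNet.

import numpy as np
import theano
import theano.tensor as T
import lasagne
from lasagne.layers import InputLayer, DenseLayer, get_output, get_all_params

# toy data placeholders
x = T.matrix('x')
y = T.ivector('y')

# tiny network: 4 features -> 3-way softmax
l_in = InputLayer((None, 4), input_var=x)
l_out = DenseLayer(l_in, num_units=3, nonlinearity=lasagne.nonlinearities.softmax)

# loss -> trainable params -> adadelta updates -> compiled train step
loss = lasagne.objectives.categorical_crossentropy(get_output(l_out), y).mean()
weights = get_all_params(l_out, trainable=True)
updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01)
train_fun = theano.function([x, y], loss, updates=updates)

print(train_fun(np.random.rand(8, 4).astype('float32'), np.zeros(8, dtype='int32')))
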
Example #2
    def __init__(self, pool, observation_shape, n_actions, n_parallel_games=1,
                 replay_seq_len=20, replay_batch_size=20, pool_size=None, n_steps=3, gamma=0.99):
        """
          :type n_parallel_games: int
                n_actions: int
          """
        # Parameters for training

        self.n_parallel_games = n_parallel_games
        self.replay_seq_len = replay_seq_len
        self.replay_batch_size = replay_batch_size
        self.pool_size = pool_size
        self.n_steps = n_steps
        self.gamma = gamma
        self.loss = None

        # image observation
        self.observation_layer = InputLayer(observation_shape)
        self.n_actions = n_actions
        self.resolver, self.agent = self.build_model()

        weights = lasagne.layers.get_all_params(self.resolver, trainable=True)
        self.applier_fun = self.agent.get_react_function()

        # Prepare replay pool
        env = SessionPoolEnvironment(observations=self.observation_layer,
                                     actions=self.resolver,
                                     agent_memories=self.agent.state_variables)

        preceding_memory_states = list(pool.prev_memory_states)

        # get interaction sessions
        observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = \
            pool.interact(self.step, n_steps=self.replay_seq_len)
        env.load_sessions(observation_tensor, action_tensor, reward_tensor, is_alive_tensor,
                          preceding_memory_states)

        if pool_size is None:
            batch_env = env
        else:
            batch_env = env.sample_session_batch(self.replay_batch_size)

        self.loss = self.build_loss(batch_env)
        self.eval_fun = self.build_eval_fun(batch_env)

        updates = lasagne.updates.adadelta(self.loss, weights, learning_rate=0.01)
        train_fun = theano.function([], [self.loss], updates=updates)
        super(BasicRLAgent, self).__init__(env, pool, train_fun, pool_size, replay_seq_len)
Example #3
    def __init__(self, pool, observation_shape, n_actions, n_parallel_games=1,
                 replay_seq_len=20, replay_batch_size=20, pool_size=None, n_steps=3, gamma=0.99,
                 split_into=1,): #gru0_size=128):
        self.n_parallel_games = n_parallel_games
        self.replay_seq_len = replay_seq_len
        self.replay_batch_size = replay_batch_size
        self.pool_size = pool_size
        self.n_steps = n_steps
        self.n_actions = n_actions
        self.gamma = gamma
        self.split_into = split_into
        self.controller = Controller(observation_shape, n_actions)
        self.metacontroller = MetaController(self.controller)#, gru0_size)

        # Prepare replay pool
        self.controller_env = SessionPoolEnvironment(observations=self.controller.agent.observation_layers,
                                                     actions=self.controller.resolver,
                                                     agent_memories=self.controller.agent.agent_states)
        self.metacontroller_env = SessionPoolEnvironment(observations=self.metacontroller.agent.observation_layers,
                                                         actions=self.metacontroller.resolver,
                                                         agent_memories=self.metacontroller.agent.agent_states)

        # get interaction sessions
        observation_log, action_tensor, extrinsic_reward_log, memory_log, is_alive_tensor, _ = \
            pool.interact(self.step, n_steps=self.replay_seq_len)
        preceding_memory_states = list(pool.prev_memory_states)
        self.reload_pool(observation_log, action_tensor, extrinsic_reward_log, is_alive_tensor,
                         memory_log, preceding_memory_states)

        if pool_size is None:
            controller_batch_env = self.controller_env
            metacontroller_batch_env = self.metacontroller_env
        else:
            controller_batch_env = self.controller_env.sample_session_batch(self.replay_batch_size)
            metacontroller_batch_env = self.metacontroller_env.sample_session_batch(self.replay_batch_size)

        self.loss = self.build_loss(controller_batch_env, self.controller.agent, 50) + \
                    self.build_loss(metacontroller_batch_env, self.metacontroller.agent, 10)
        self.eval_fun = self.build_eval_fun(metacontroller_batch_env)

        weights = self.controller.weights + self.metacontroller.weights
        updates = lasagne.updates.adadelta(self.loss, weights, learning_rate=0.01)
        mean_session_reward = metacontroller_batch_env.rewards.sum(axis=1).mean()
        train_fun = theano.function([], [self.loss, mean_session_reward], updates=updates)
        super(HierarchicalAgent, self).__init__([self.controller_env, self.metacontroller_env],
                                                pool, train_fun, pool_size, replay_seq_len)
Example #4
    def __init__(self, agent, game_title, n_games, max_size=None, **kwargs):
        """
        A pool that stores several
           - game states (gym environment)
           - prev_observations - last agent observations
           - prev memory states - last agent hidden states

        :param game_title: name of the game. See here http://yavar.naddaf.name/ale/list_of_current_games.html
        :param n_games: number of parallel games
        :param kwargs: options passed to Atari when creating a game. See Atari __init__
        """
        #create atari games
        self.game_kwargs = kwargs
        self.game_title = game_title
        self.games = [
            Atari(self.game_title, **self.game_kwargs) for _ in range(n_games)
        ]

        #initial observations
        self.prev_observations = [atari.reset() for atari in self.games]

        # agent memory variables (if you use recurrent networks)
        self.prev_memory_states = [
            np.zeros((n_games, ) + tuple(mem.output_shape[1:]),
                     dtype=get_layer_dtype(mem)) for mem in agent.agent_states
        ]

        #save agent
        self.agent = agent
        self.agent_step = agent.get_react_function()

        # Create experience replay environment
        self.experience_replay = SessionPoolEnvironment(
            observations=agent.observation_layers,
            actions=agent.action_layers,
            agent_memories=agent.agent_states)
        self.max_size = max_size
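
A hedged usage sketch for the constructor above: it assumes this __init__ belongs to the AtariGamePool class shown in Example #10 and that an AgentNet agent has already been built (see Examples #5/#8 for one way to build one); the game title and sizes are illustrative.

# assumes: `agent` is an already-built AgentNet agent
pool = AtariGamePool(agent, game_title="SpaceInvaders-v0", n_games=8, max_size=1000)

# play 100 ticks in all games at once and refill the built-in experience replay (see Example #10)
pool.update(n_steps=100, append=False)

# play a full episode with video recording disabled and get the mean reward
mean_reward = pool.evaluate(n_games=1, record_video=False)
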
Example #5
def test_space_invaders(
    game_title='SpaceInvaders-v0',
    n_parallel_games=3,
    replay_seq_len=2,
):
    """
    :param game_title: name of atari game in Gym
    :param n_parallel_games: how many games we run in parallel
    :param replay_seq_len: how long is one replay session from a batch
    """

    atari = gym.make(game_title)
    atari.reset()

    # Game Parameters
    n_actions = atari.action_space.n
    observation_shape = (None, ) + atari.observation_space.shape
    action_names = atari.get_action_meanings()
    del atari
    # ##### Agent observations

    # image observation at current tick goes here
    observation_layer = InputLayer(observation_shape, name="images input")

    # reshape to [batch, color, x, y] to allow for convolutional layers to work correctly
    observation_reshape = DimshuffleLayer(observation_layer, (0, 3, 1, 2))

    # Agent memory states
    window_size = 3

    # prev state input
    prev_window = InputLayer(
        (None, window_size) + tuple(observation_reshape.output_shape[1:]),
        name="previous window state")

    # our window
    window = WindowAugmentation(observation_reshape,
                                prev_window,
                                name="new window state")

    memory_dict = {window: prev_window}

    # ##### Neural network body
    # you may use any other lasagne layers, including convolutions, batch_norms, maxout, etc

    # pixel-wise maximum over the temporal window (to avoid flickering)
    window_max = ExpressionLayer(window,
                                 lambda a: a.max(axis=1),
                                 output_shape=(None, ) +
                                 window.output_shape[2:])

    # a simple lasagne network (try replacing with any other lasagne network and see what works best)
    nn = DenseLayer(window_max, num_units=50, name='dense0')

    # Agent policy and action picking
    q_eval = DenseLayer(nn,
                        num_units=n_actions,
                        nonlinearity=lasagne.nonlinearities.linear,
                        name="QEvaluator")

    #fakes for a2c
    policy_eval = DenseLayer(nn,
                             num_units=n_actions,
                             nonlinearity=lasagne.nonlinearities.softmax,
                             name="a2c action probas")
    state_value_eval = DenseLayer(nn,
                                  num_units=1,
                                  nonlinearity=None,
                                  name="a2c state values")
    # resolver
    resolver = ProbabilisticResolver(policy_eval, name="resolver")

    # agent
    agent = Agent(observation_layer, memory_dict,
                  (q_eval, policy_eval, state_value_eval), resolver)

    # Since it's a single lasagne network, one can get its weights, output, etc
    weights = lasagne.layers.get_all_params(resolver, trainable=True)

    # Agent step function
    print('compiling react')
    applier_fun = agent.get_react_function()

    # a nice pythonic interface
    def step(observation, prev_memories='zeros', batch_size=n_parallel_games):
        """ returns actions and new states given observation and prev state
        Prev state in default setup should be [prev window,]"""
        # default to zeros
        if prev_memories == 'zeros':
            prev_memories = [
                np.zeros((batch_size, ) + tuple(mem.output_shape[1:]),
                         dtype='float32') for mem in agent.agent_states
            ]
        res = applier_fun(np.array(observation), *prev_memories)
        action = res[0]
        memories = res[1:]
        return action, memories

    # # Create and manage a pool of atari sessions to play with

    pool = GamePool(game_title, n_parallel_games)

    observation_log, action_log, reward_log, _, _, _ = pool.interact(step, 50)

    print(np.array(action_names)[np.array(action_log)[:3, :5]])

    # # experience replay pool
    # Create an environment with all default parameters
    env = SessionPoolEnvironment(observations=observation_layer,
                                 actions=resolver,
                                 agent_memories=agent.agent_states)

    def update_pool(env, pool, n_steps=100):
        """ a function that creates new sessions and ads them into the pool
        throwing the old ones away entirely for simplicity"""

        preceding_memory_states = list(pool.prev_memory_states)

        # get interaction sessions
        observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = pool.interact(
            step, n_steps=n_steps)

        # load them into experience replay environment
        env.load_sessions(observation_tensor, action_tensor, reward_tensor,
                          is_alive_tensor, preceding_memory_states)

    # load first sessions
    update_pool(env, pool, replay_seq_len)

    # A more sophisticated way of training is to store a large pool of sessions and train on random batches of them.
    # ### Training via experience replay

    # get agent's Q-values, policy, etc obtained via experience replay
    _env_states, _observations, _memories, _imagined_actions, estimators = agent.get_sessions(
        env,
        session_length=replay_seq_len,
        batch_size=env.batch_size,
        optimize_experience_replay=True,
    )
    (q_values_sequence, policy_sequence, value_sequence) = estimators

    # Evaluating loss function

    scaled_reward_seq = env.rewards
    # For SpaceInvaders, however, leaving rewards unscaled works well enough

    elwise_mse_loss = 0.

    #1-step algos
    for algo in qlearning, sarsa:
        elwise_mse_loss += algo.get_elementwise_objective(
            q_values_sequence,
            env.actions[0],
            scaled_reward_seq,
            env.is_alive,
            gamma_or_gammas=0.99,
        )
    #qlearning_n_step
    for n in (1, 3, replay_seq_len - 1, replay_seq_len, replay_seq_len + 1,
              None):
        elwise_mse_loss += qlearning_n_step.get_elementwise_objective(
            q_values_sequence,
            env.actions[0],
            scaled_reward_seq,
            env.is_alive,
            gamma_or_gammas=0.99,
            n_steps=n)

    #a2c n_step

    elwise_mse_loss += a2c_n_step.get_elementwise_objective(
        policy_sequence,
        value_sequence[:, :, 0],
        env.actions[0],
        scaled_reward_seq,
        env.is_alive,
        gamma_or_gammas=0.99,
        n_steps=3)

    # compute mean over "alive" fragments
    mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

    # regularize network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10**-4

    loss = mse_loss + reg_l2

    # Compute weight updates
    updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01)

    # mean session reward
    mean_session_reward = env.rewards.sum(axis=1).mean()

    # # Compile train and evaluation functions

    print('compiling')
    train_fun = theano.function([], [loss, mean_session_reward],
                                updates=updates)
    evaluation_fun = theano.function(
        [], [loss, mse_loss, reg_l2, mean_session_reward])
    print("I've compiled!")

    # # Training loop

    for epoch_counter in range(10):
        update_pool(env, pool, replay_seq_len)
        loss, avg_reward = train_fun()
        full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun()

        print("epoch %i,loss %.5f, rewards: %.5f " %
              (epoch_counter, full_loss, avg_reward_current))
        print("rec %.3f reg %.3f" % (q_loss, l2_penalty))
Example #6
    def __init__(
        self,
        pool,
        observation_shape,
        n_actions,
        n_parallel_games=1,
        replay_seq_len=20,
        replay_batch_size=20,
        pool_size=None,
        n_steps=3,
        gamma=0.99,
        split_into=1,
    ):  #gru0_size=128):
        self.n_parallel_games = n_parallel_games
        self.replay_seq_len = replay_seq_len
        self.replay_batch_size = replay_batch_size
        self.pool_size = pool_size
        self.n_steps = n_steps
        self.n_actions = n_actions
        self.gamma = gamma
        self.split_into = split_into
        self.controller = Controller(observation_shape, n_actions)
        self.metacontroller = MetaController(self.controller)  #, gru0_size)

        # Prepare replay pool
        self.controller_env = SessionPoolEnvironment(
            observations=self.controller.agent.observation_layers,
            actions=self.controller.resolver,
            agent_memories=self.controller.agent.agent_states)
        self.metacontroller_env = SessionPoolEnvironment(
            observations=self.metacontroller.agent.observation_layers,
            actions=self.metacontroller.resolver,
            agent_memories=self.metacontroller.agent.agent_states)

        # get interaction sessions
        observation_log, action_tensor, extrinsic_reward_log, memory_log, is_alive_tensor, _ = \
            pool.interact(self.step, n_steps=self.replay_seq_len)
        preceding_memory_states = list(pool.prev_memory_states)
        self.reload_pool(observation_log, action_tensor, extrinsic_reward_log,
                         is_alive_tensor, memory_log, preceding_memory_states)

        if pool_size is None:
            controller_batch_env = self.controller_env
            metacontroller_batch_env = self.metacontroller_env
        else:
            controller_batch_env = self.controller_env.sample_session_batch(
                self.replay_batch_size)
            metacontroller_batch_env = self.metacontroller_env.sample_session_batch(
                self.replay_batch_size)

        self.loss = self.build_loss(controller_batch_env, self.controller.agent, 50) + \
                    self.build_loss(metacontroller_batch_env, self.metacontroller.agent, 10)
        self.eval_fun = self.build_eval_fun(metacontroller_batch_env)

        weights = self.controller.weights + self.metacontroller.weights
        updates = lasagne.updates.adadelta(self.loss,
                                           weights,
                                           learning_rate=0.01)
        mean_session_reward = metacontroller_batch_env.rewards.sum(
            axis=1).mean()
        train_fun = theano.function([], [self.loss, mean_session_reward],
                                    updates=updates)
        super(HierarchicalAgent,
              self).__init__([self.controller_env, self.metacontroller_env],
                             pool, train_fun, pool_size, replay_seq_len)
Example #7
class HierarchicalAgent(MdpAgent):
    def __init__(
        self,
        pool,
        observation_shape,
        n_actions,
        n_parallel_games=1,
        replay_seq_len=20,
        replay_batch_size=20,
        pool_size=None,
        n_steps=3,
        gamma=0.99,
        split_into=1,
    ):  #gru0_size=128):
        self.n_parallel_games = n_parallel_games
        self.replay_seq_len = replay_seq_len
        self.replay_batch_size = replay_batch_size
        self.pool_size = pool_size
        self.n_steps = n_steps
        self.n_actions = n_actions
        self.gamma = gamma
        self.split_into = split_into
        self.controller = Controller(observation_shape, n_actions)
        self.metacontroller = MetaController(self.controller)  #, gru0_size)

        # Prepare replay pool
        self.controller_env = SessionPoolEnvironment(
            observations=self.controller.agent.observation_layers,
            actions=self.controller.resolver,
            agent_memories=self.controller.agent.agent_states)
        self.metacontroller_env = SessionPoolEnvironment(
            observations=self.metacontroller.agent.observation_layers,
            actions=self.metacontroller.resolver,
            agent_memories=self.metacontroller.agent.agent_states)

        # get interaction sessions
        observation_log, action_tensor, extrinsic_reward_log, memory_log, is_alive_tensor, _ = \
            pool.interact(self.step, n_steps=self.replay_seq_len)
        preceding_memory_states = list(pool.prev_memory_states)
        self.reload_pool(observation_log, action_tensor, extrinsic_reward_log,
                         is_alive_tensor, memory_log, preceding_memory_states)

        if pool_size is None:
            controller_batch_env = self.controller_env
            metacontroller_batch_env = self.metacontroller_env
        else:
            controller_batch_env = self.controller_env.sample_session_batch(
                self.replay_batch_size)
            metacontroller_batch_env = self.metacontroller_env.sample_session_batch(
                self.replay_batch_size)

        self.loss = self.build_loss(controller_batch_env, self.controller.agent, 50) + \
                    self.build_loss(metacontroller_batch_env, self.metacontroller.agent, 10)
        self.eval_fun = self.build_eval_fun(metacontroller_batch_env)

        weights = self.controller.weights + self.metacontroller.weights
        updates = lasagne.updates.adadelta(self.loss,
                                           weights,
                                           learning_rate=0.01)
        mean_session_reward = metacontroller_batch_env.rewards.sum(
            axis=1).mean()
        train_fun = theano.function([], [self.loss, mean_session_reward],
                                    updates=updates)
        super(HierarchicalAgent,
              self).__init__([self.controller_env, self.metacontroller_env],
                             pool, train_fun, pool_size, replay_seq_len)
        # raise NotImplementedError

    def reload_pool(self, observation_tensor, action_tensor,
                    extrinsic_reward_tensor, is_alive_tensor, memory_tensor,
                    preceding_memory_states):
        batch_size = observation_tensor.shape[0]
        # print observation_tensor.shape
        meta_obs_log, goal_log, meta_V, itrs = memory_tensor[-4:]
        itr = itrs[0]

        pivot = len(self.controller.agent.state_variables)
        controller_preceding_states = preceding_memory_states[:pivot]
        metacontroller_preceding_states = preceding_memory_states[pivot:-4]

        ###CONTROLLER###
        # load them into experience replay environment for controller

        # controller_preceding_states =!!!!!!!!!!!!!!!!!!!!!!!!!!!
        ctrl_shape = (batch_size * self.split_into,
                      self.replay_seq_len // self.split_into)  # integer division: reshape needs ints

        intrinsic_rewards = np.concatenate(
            [np.zeros([meta_V.shape[0], 1]),
             np.diff(meta_V, axis=1)], axis=1)
        # print [observation_tensor.reshape(ctrl_shape + self.controller.observation_shape[1:]),
        #                                  goal_log.reshape(ctrl_shape)][0].shape
        self.controller_env.load_sessions(
            [
                observation_tensor.reshape(
                    ctrl_shape + self.controller.observation_shape[1:]),
                goal_log.reshape(ctrl_shape)
            ],
            action_tensor.reshape(ctrl_shape),
            intrinsic_rewards.reshape(ctrl_shape),
            is_alive_tensor.reshape(ctrl_shape),
            # controller_preceding_states
        )

        ###METACONTROLLER###
        # separate case for metacontroller
        extrinsic_reward_sums = np.diff(
            np.concatenate([
                np.zeros_like(extrinsic_reward_tensor[:, 0, None]),
                extrinsic_reward_tensor.cumsum(axis=-1)[:, itr == 0]
            ],
                           axis=1))

        self.metacontroller_env.load_sessions(
            meta_obs_log[:, itr == 0][:, :10], goal_log[:, itr == 0][:, :10],
            extrinsic_reward_sums[:, :10], is_alive_tensor[:,
                                                           itr == 0][:, :10],
            metacontroller_preceding_states)

    def update_pool(self, observation_tensor, action_tensor,
                    extrinsic_reward_tensor, is_alive_tensor, memory_tensor,
                    preceding_memory_states):
        batch_size = observation_tensor.shape[0]
        meta_obs_log, goal_log, meta_V, itrs = memory_tensor[-4:]
        itr = itrs[0]

        pivot = len(self.controller.agent.state_variables)
        controller_preceding_states = preceding_memory_states[:pivot]
        metacontroller_preceding_states = preceding_memory_states[pivot:-4]

        ###CONTROLLER###
        # load them into experience replay environment for controller

        # controller_preceding_states =!!!!!!!!!!!!!!!!!!!!!!!!!!!
        ctrl_shape = (batch_size * self.split_into,
                      self.replay_seq_len // self.split_into)  # integer division: reshape needs ints

        intrinsic_rewards = np.concatenate(
            [np.zeros([meta_V.shape[0], 1]),
             np.diff(meta_V, axis=1)], axis=1)
        self.controller_env.append_sessions(
            [
                observation_tensor.reshape(
                    ctrl_shape + self.controller.observation_shape[1:]),
                goal_log.reshape(ctrl_shape)
            ],
            action_tensor.reshape(ctrl_shape),
            intrinsic_rewards.reshape(ctrl_shape),
            is_alive_tensor.reshape(ctrl_shape),
            controller_preceding_states,
            max_pool_size=self.pool_size,
        )

        ###METACONTROLLER###
        # separate case for metacontroller
        extrinsic_reward_sums = np.diff(
            np.concatenate([
                np.zeros_like(extrinsic_reward_tensor[:, 0, None]),
                extrinsic_reward_tensor.cumsum(axis=-1)[:, itr == 0]
            ],
                           axis=1))

        self.metacontroller_env.append_sessions(
            meta_obs_log[:, itr == 0][:, :10],
            goal_log[:, itr == 0][:, :10],
            extrinsic_reward_sums[:, :10],
            is_alive_tensor[:, itr == 0][:, :10],
            metacontroller_preceding_states,
            max_pool_size=self.pool_size)

    def step(self, env_observation, prev_memories='zeros'):
        """ returns actions and new states given observation and prev state
        Prev state in default setup should be [prev window,]"""

        batch_size = self.n_parallel_games

        if prev_memories == 'zeros':
            controller_mem = metacontroller_mem = 'zeros'
            meta_inp = np.zeros(
                (batch_size, ) +
                tuple(self.metacontroller.observation_shape[1:]),
                dtype='float32')
            itr = -1
            # goal will be defined by "if itr ==0" clause
        else:
            pivot = len(self.controller.agent.state_variables)
            controller_mem = prev_memories[:pivot]
            metacontroller_mem = prev_memories[pivot:-4]
            meta_inp, goal, meta_V, itrs = prev_memories[-4:]
            itr = itrs[0]

        itr = (itr + 1) % self.metacontroller.period

        if itr == 0:
            goal, metacontroller_mem, meta_V = self.metacontroller.step(
                meta_inp, metacontroller_mem, batch_size)

        #print env_observation.shape
        action, controller_mem, meta_inp = self.controller.step(
            env_observation, goal, controller_mem, batch_size)

        new_memories = controller_mem + metacontroller_mem + [
            meta_inp, goal, meta_V, [itr] * self.n_parallel_games
        ]

        return action, new_memories

    def build_loss(self, env, agent, replay_seq_len):
        # get agent's Qvalues obtained via experience replay
        _, _, _, _, qvalues_seq = agent.get_sessions(
            env,
            # initial_hidden = env.preceding_agent_memories,
            session_length=replay_seq_len,
            batch_size=env.batch_size,
            optimize_experience_replay=True,
        )

        scaled_reward_seq = env.rewards

        elwise_mse_loss = qlearning_n_step.get_elementwise_objective(
            qvalues_seq,
            env.actions[0],
            scaled_reward_seq,
            env.is_alive,
            gamma_or_gammas=self.gamma,
            n_steps=self.n_steps)

        # compute mean over "alive" fragments
        mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

        # regularize network weights

        reg_l2 = regularize_network_params(list(agent.state_variables.keys()),
                                           l2) * 10**-5

        return mse_loss + reg_l2

    def build_eval_fun(self, env):

        mean_session_reward = env.rewards.sum(
            axis=1).mean() / self.replay_seq_len

        eval_fun = theano.function([], [mean_session_reward])
        return eval_fun
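
The controller/metacontroller hand-off in step() above reduces to a modular tick counter: the metacontroller emits a new goal once every `period` ticks, while the controller acts on every tick. A toy illustration of that schedule (plain Python; period=4 is an arbitrary choice):

period = 4   # stands in for self.metacontroller.period
itr = -1     # same initial value as in step() when prev_memories == 'zeros'
for tick in range(8):
    itr = (itr + 1) % period
    if itr == 0:
        print("tick %i: metacontroller picks a new goal" % tick)
    print("tick %i: controller acts toward the current goal" % tick)
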
Example #8
def test_memory(
    game_title='SpaceInvaders-v0',
    n_parallel_games=3,
    replay_seq_len=2,
):
    """
    :param game_title: name of atari game in Gym
    :param n_parallel_games: how many games we run in parallel
    :param replay_seq_len: how long is one replay session from a batch
    """

    atari = gym.make(game_title)
    atari.reset()

    # Game Parameters
    n_actions = atari.action_space.n
    observation_shape = (None, ) + atari.observation_space.shape
    action_names = atari.get_action_meanings()
    del atari
    # ##### Agent observations

    # image observation at current tick goes here
    observation_layer = InputLayer(observation_shape, name="images input")

    # reshape to [batch, color, x, y] to allow for convolutional layers to work correctly
    observation_reshape = DimshuffleLayer(observation_layer, (0, 3, 1, 2))

    # Agent memory states

    memory_dict = OrderedDict([])

    ###Window
    window_size = 3

    # prev state input
    prev_window = InputLayer(
        (None, window_size) + tuple(observation_reshape.output_shape[1:]),
        name="previous window state")

    # our window
    window = WindowAugmentation(observation_reshape,
                                prev_window,
                                name="new window state")

    # pixel-wise maximum over the temporal window (to avoid flickering)
    window_max = ExpressionLayer(window,
                                 lambda a: a.max(axis=1),
                                 output_shape=(None, ) +
                                 window.output_shape[2:])

    memory_dict[window] = prev_window

    ###Stack
    #prev stack
    stack_w, stack_h = 4, 5
    stack_inputs = DenseLayer(observation_reshape, stack_w, name="prev_stack")
    stack_controls = DenseLayer(observation_reshape,
                                3,
                                nonlinearity=lasagne.nonlinearities.softmax,
                                name="prev_stack")
    prev_stack = InputLayer((None, stack_h, stack_w),
                            name="previous stack state")
    stack = StackAugmentation(stack_inputs, prev_stack, stack_controls)
    memory_dict[stack] = prev_stack

    stack_top = lasagne.layers.SliceLayer(stack, 0, 1)

    ###RNN preset

    prev_rnn = InputLayer((None, 16), name="previous RNN state")
    new_rnn = RNNCell(prev_rnn, observation_reshape)
    memory_dict[new_rnn] = prev_rnn

    ###GRU preset
    prev_gru = InputLayer((None, 16), name="previous GRUcell state")
    new_gru = GRUCell(prev_gru, observation_reshape)
    memory_dict[new_gru] = prev_gru

    ###GRUmemorylayer
    prev_gru1 = InputLayer((None, 15), name="previous GRUcell state")
    new_gru1 = GRUMemoryLayer(15, observation_reshape, prev_gru1)
    memory_dict[new_gru1] = prev_gru1

    #LSTM with peepholes
    prev_lstm0_cell = InputLayer(
        (None, 13), name="previous LSTMCell hidden state [with peepholes]")

    prev_lstm0_out = InputLayer(
        (None, 13), name="previous LSTMCell output state [with peepholes]")

    new_lstm0_cell, new_lstm0_out = LSTMCell(
        prev_lstm0_cell,
        prev_lstm0_out,
        input_or_inputs=observation_reshape,
        peepholes=True,
        name="newLSTM1 [with peepholes]")

    memory_dict[new_lstm0_cell] = prev_lstm0_cell
    memory_dict[new_lstm0_out] = prev_lstm0_out

    #LSTM without peepholes
    prev_lstm1_cell = InputLayer(
        (None, 14), name="previous LSTMCell hidden state [no peepholes]")

    prev_lstm1_out = InputLayer(
        (None, 14), name="previous LSTMCell output state [no peepholes]")

    new_lstm1_cell, new_lstm1_out = LSTMCell(
        prev_lstm1_cell,
        prev_lstm1_out,
        input_or_inputs=observation_reshape,
        peepholes=False,
        name="newLSTM1 [no peepholes]")

    memory_dict[new_lstm1_cell] = prev_lstm1_cell
    memory_dict[new_lstm1_out] = prev_lstm1_out

    ##concat everything

    for i in [flatten(window_max), stack_top, new_rnn, new_gru, new_gru1]:
        print(i.output_shape)
    all_memory = concat([
        flatten(window_max),
        stack_top,
        new_rnn,
        new_gru,
        new_gru1,
        new_lstm0_out,
        new_lstm1_out,
    ])

    # ##### Neural network body
    # you may use any other lasagne layers, including convolutions, batch_norms, maxout, etc

    # a simple lasagne network (try replacing with any other lasagne network and see what works best)
    nn = DenseLayer(all_memory, num_units=50, name='dense0')

    # Agent policy and action picking
    q_eval = DenseLayer(nn,
                        num_units=n_actions,
                        nonlinearity=lasagne.nonlinearities.linear,
                        name="QEvaluator")

    # resolver
    resolver = EpsilonGreedyResolver(q_eval, epsilon=0.1, name="resolver")

    # agent
    agent = Agent(observation_layer, memory_dict, q_eval, resolver)

    # Since it's a single lasagne network, one can get its weights, output, etc
    weights = lasagne.layers.get_all_params(resolver, trainable=True)

    # Agent step function
    print('compiling react')
    applier_fun = agent.get_react_function()

    # a nice pythonic interface
    def step(observation, prev_memories='zeros', batch_size=n_parallel_games):
        """ returns actions and new states given observation and prev state
        Prev state in default setup should be [prev window,]"""
        # default to zeros
        if prev_memories == 'zeros':
            prev_memories = [
                np.zeros((batch_size, ) + tuple(mem.output_shape[1:]),
                         dtype='float32') for mem in agent.agent_states
            ]
        res = applier_fun(np.array(observation), *prev_memories)
        action = res[0]
        memories = res[1:]
        return action, memories

    # # Create and manage a pool of atari sessions to play with

    pool = GamePool(game_title, n_parallel_games)

    observation_log, action_log, reward_log, _, _, _ = pool.interact(step, 50)

    print(np.array(action_names)[np.array(action_log)[:3, :5]])

    # # experience replay pool
    # Create an environment with all default parameters
    env = SessionPoolEnvironment(observations=observation_layer,
                                 actions=resolver,
                                 agent_memories=agent.agent_states)

    def update_pool(env, pool, n_steps=100):
        """ a function that creates new sessions and ads them into the pool
        throwing the old ones away entirely for simplicity"""

        preceding_memory_states = list(pool.prev_memory_states)

        # get interaction sessions
        observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = pool.interact(
            step, n_steps=n_steps)

        # load them into experience replay environment
        env.load_sessions(observation_tensor, action_tensor, reward_tensor,
                          is_alive_tensor, preceding_memory_states)

    # load first sessions
    update_pool(env, pool, replay_seq_len)

    # A more sophisticated way of training is to store a large pool of sessions and train on random batches of them.
    # ### Training via experience replay

    # get agent's Q-values obtained via experience replay
    _env_states, _observations, _memories, _imagined_actions, q_values_sequence = agent.get_sessions(
        env,
        session_length=replay_seq_len,
        batch_size=env.batch_size,
        optimize_experience_replay=True,
    )

    # Evaluating loss function

    scaled_reward_seq = env.rewards
    # For SpaceInvaders, however, leaving rewards unscaled works well enough

    elwise_mse_loss = qlearning.get_elementwise_objective(
        q_values_sequence,
        env.actions[0],
        scaled_reward_seq,
        env.is_alive,
        gamma_or_gammas=0.99,
    )

    # compute mean over "alive" fragments
    mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

    # regularize network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10**-4

    loss = mse_loss + reg_l2

    # Compute weight updates
    updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01)

    # mean session reward
    mean_session_reward = env.rewards.sum(axis=1).mean()

    # # Compile train and evaluation functions

    print('compiling')
    train_fun = theano.function([], [loss, mean_session_reward],
                                updates=updates)
    evaluation_fun = theano.function(
        [], [loss, mse_loss, reg_l2, mean_session_reward])
    print("I've compiled!")

    # # Training loop

    for epoch_counter in range(10):
        update_pool(env, pool, replay_seq_len)
        loss, avg_reward = train_fun()
        full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun()

        print("epoch %i,loss %.5f, rewards: %.5f " %
              (epoch_counter, full_loss, avg_reward_current))
        print("rec %.3f reg %.3f" % (q_loss, l2_penalty))
Example #9
class HierarchicalAgent(MdpAgent):
    def __init__(self, pool, observation_shape, n_actions, n_parallel_games=1,
                 replay_seq_len=20, replay_batch_size=20, pool_size=None, n_steps=3, gamma=0.99,
                 split_into=1,): #gru0_size=128):
        self.n_parallel_games = n_parallel_games
        self.replay_seq_len = replay_seq_len
        self.replay_batch_size = replay_batch_size
        self.pool_size = pool_size
        self.n_steps = n_steps
        self.n_actions = n_actions
        self.gamma = gamma
        self.split_into = split_into
        self.controller = Controller(observation_shape, n_actions)
        self.metacontroller = MetaController(self.controller)#, gru0_size)

        # Prepare replay pool
        self.controller_env = SessionPoolEnvironment(observations=self.controller.agent.observation_layers,
                                                     actions=self.controller.resolver,
                                                     agent_memories=self.controller.agent.agent_states)
        self.metacontroller_env = SessionPoolEnvironment(observations=self.metacontroller.agent.observation_layers,
                                                         actions=self.metacontroller.resolver,
                                                         agent_memories=self.metacontroller.agent.agent_states)

        # get interaction sessions
        observation_log, action_tensor, extrinsic_reward_log, memory_log, is_alive_tensor, _ = \
            pool.interact(self.step, n_steps=self.replay_seq_len)
        preceding_memory_states = list(pool.prev_memory_states)
        self.reload_pool(observation_log, action_tensor, extrinsic_reward_log, is_alive_tensor,
                         memory_log, preceding_memory_states)

        if pool_size is None:
            controller_batch_env = self.controller_env
            metacontroller_batch_env = self.metacontroller_env
        else:
            controller_batch_env = self.controller_env.sample_session_batch(self.replay_batch_size)
            metacontroller_batch_env = self.metacontroller_env.sample_session_batch(self.replay_batch_size)

        self.loss = self.build_loss(controller_batch_env, self.controller.agent, 50) + \
                    self.build_loss(metacontroller_batch_env, self.metacontroller.agent, 10)
        self.eval_fun = self.build_eval_fun(metacontroller_batch_env)

        weights = self.controller.weights + self.metacontroller.weights
        updates = lasagne.updates.adadelta(self.loss, weights, learning_rate=0.01)
        mean_session_reward = metacontroller_batch_env.rewards.sum(axis=1).mean()
        train_fun = theano.function([], [self.loss, mean_session_reward], updates=updates)
        super(HierarchicalAgent, self).__init__([self.controller_env, self.metacontroller_env],
                                                pool, train_fun, pool_size, replay_seq_len)
        # raise NotImplementedError

    def reload_pool(self, observation_tensor, action_tensor, extrinsic_reward_tensor, is_alive_tensor,
                    memory_tensor, preceding_memory_states):
        batch_size = observation_tensor.shape[0]
        # print observation_tensor.shape
        meta_obs_log, goal_log, meta_V, itrs = memory_tensor[-4:]
        itr = itrs[0]

        pivot = len(self.controller.agent.state_variables)
        controller_preceding_states = preceding_memory_states[:pivot]
        metacontroller_preceding_states = preceding_memory_states[pivot:-4]

        ###CONTROLLER###
        # load them into experience replay environment for controller

        # controller_preceding_states =!!!!!!!!!!!!!!!!!!!!!!!!!!!
        ctrl_shape = (batch_size * self.split_into, self.replay_seq_len // self.split_into)  # integer division: reshape needs ints

        intrinsic_rewards = np.concatenate([np.zeros([meta_V.shape[0], 1]), np.diff(meta_V, axis=1)], axis=1)
        # print [observation_tensor.reshape(ctrl_shape + self.controller.observation_shape[1:]),
        #                                  goal_log.reshape(ctrl_shape)][0].shape
        self.controller_env.load_sessions([observation_tensor.reshape(ctrl_shape + self.controller.observation_shape[1:]),
                                          goal_log.reshape(ctrl_shape)],
                                          action_tensor.reshape(ctrl_shape),
                                          intrinsic_rewards.reshape(ctrl_shape),
                                          is_alive_tensor.reshape(ctrl_shape),
                                          # controller_preceding_states
                                          )

        ###METACONTROLLER###
        # separate case for metacontroller
        extrinsic_reward_sums = np.diff(
            np.concatenate(
                [np.zeros_like(extrinsic_reward_tensor[:, 0, None]),
                 extrinsic_reward_tensor.cumsum(axis=-1)[:, itr == 0]],
                axis=1
            )
        )

        self.metacontroller_env.load_sessions(meta_obs_log[:, itr == 0][:, :10],
                                              goal_log[:, itr == 0][:, :10],
                                              extrinsic_reward_sums[:, :10],
                                              is_alive_tensor[:, itr == 0][:, :10],
                                              metacontroller_preceding_states)

    def update_pool(self, observation_tensor, action_tensor, extrinsic_reward_tensor, is_alive_tensor,
                    memory_tensor, preceding_memory_states):
        batch_size = observation_tensor.shape[0]
        meta_obs_log, goal_log, meta_V, itrs = memory_tensor[-4:]
        itr = itrs[0]

        pivot = len(self.controller.agent.state_variables)
        controller_preceding_states = preceding_memory_states[:pivot]
        metacontroller_preceding_states = preceding_memory_states[pivot:-4]

        ###CONTROLLER###
        # load them into experience replay environment for controller

        # controller_preceding_states =!!!!!!!!!!!!!!!!!!!!!!!!!!!
        ctrl_shape = (batch_size * self.split_into, self.replay_seq_len // self.split_into)  # integer division: reshape needs ints

        intrinsic_rewards = np.concatenate([np.zeros([meta_V.shape[0], 1]), np.diff(meta_V, axis=1)], axis=1)
        self.controller_env.append_sessions([observation_tensor.reshape(ctrl_shape + self.controller.observation_shape[1:]),
                                            goal_log.reshape(ctrl_shape)],
                                            action_tensor.reshape(ctrl_shape),
                                            intrinsic_rewards.reshape(ctrl_shape),
                                            is_alive_tensor.reshape(ctrl_shape),
                                            controller_preceding_states,
                                            max_pool_size=self.pool_size,
                                           )

        ###METACONTROLLER###
        # separate case for metacontroller
        extrinsic_reward_sums = np.diff(
            np.concatenate(
                [np.zeros_like(extrinsic_reward_tensor[:, 0, None]),
                 extrinsic_reward_tensor.cumsum(axis=-1)[:, itr == 0]],
                axis=1
            )
        )

        self.metacontroller_env.append_sessions(meta_obs_log[:, itr == 0][:, :10],
                                              goal_log[:, itr == 0][:, :10],
                                              extrinsic_reward_sums[:, :10],
                                              is_alive_tensor[:, itr == 0][:, :10],
                                              metacontroller_preceding_states,
                                              max_pool_size=self.pool_size)

    def step(self, env_observation, prev_memories='zeros'):
        """ returns actions and new states given observation and prev state
        Prev state in default setup should be [prev window,]"""

        batch_size = self.n_parallel_games

        if prev_memories == 'zeros':
            controller_mem = metacontroller_mem = 'zeros'
            meta_inp = np.zeros((batch_size,) + tuple(self.metacontroller.observation_shape[1:]), dtype='float32')
            itr = -1
            # goal will be defined by "if itr ==0" clause
        else:
            pivot = len(self.controller.agent.state_variables)
            controller_mem, metacontroller_mem = prev_memories[:pivot], prev_memories[pivot:-4]
            meta_inp, goal, meta_V, itrs = prev_memories[-4:]
            itr = itrs[0]

        itr = (itr + 1) % self.metacontroller.period

        if itr == 0:
            goal, metacontroller_mem, meta_V = self.metacontroller.step(meta_inp, metacontroller_mem, batch_size)

        #print env_observation.shape
        action, controller_mem, meta_inp = self.controller.step(env_observation, goal, controller_mem, batch_size)

        new_memories = controller_mem + metacontroller_mem + [meta_inp, goal, meta_V, [itr] * self.n_parallel_games]

        return action, new_memories

    def build_loss(self, env, agent, replay_seq_len):
        # get agent's Qvalues obtained via experience replay
        _, _, _, _, qvalues_seq = agent.get_sessions(
            env,
            # initial_hidden = env.preceding_agent_memories,
            session_length=replay_seq_len,
            batch_size=env.batch_size,
            optimize_experience_replay=True,
        )

        scaled_reward_seq = env.rewards

        elwise_mse_loss = qlearning_n_step.get_elementwise_objective(qvalues_seq,
                                                                     env.actions[0],
                                                                     scaled_reward_seq,
                                                                     env.is_alive,
                                                                     gamma_or_gammas=self.gamma,
                                                                     n_steps=self.n_steps)

        # compute mean over "alive" fragments
        mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

        # regularize network weights

        reg_l2 = regularize_network_params(list(agent.state_variables.keys()), l2) * 10 ** -5

        return mse_loss + reg_l2

    def build_eval_fun(self, env):

        mean_session_reward = env.rewards.sum(axis=1).mean() / self.replay_seq_len

        eval_fun = theano.function([], [mean_session_reward])
        return eval_fun
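
The intrinsic reward computed in reload_pool/update_pool above is just the per-tick change in the metacontroller's value estimate, left-padded with a zero so shapes match. A tiny numpy illustration with made-up numbers:

import numpy as np

meta_V = np.array([[0.0, 1.0, 3.0, 2.5]])   # [batch, time] metacontroller value estimates (toy numbers)
intrinsic = np.concatenate([np.zeros((meta_V.shape[0], 1)),
                            np.diff(meta_V, axis=1)], axis=1)
print(intrinsic)                            # [[ 0.   1.   2.  -0.5]]
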
Example #10
class AtariGamePool(object):
    def __init__(self, agent, game_title, n_games, max_size=None, **kwargs):
        """
        A pool that stores several
           - game states (gym environment)
           - prev_observations - last agent observations
           - prev memory states - last agent hidden states

        :param game_title: name of the game. See here http://yavar.naddaf.name/ale/list_of_current_games.html
        :param n_games: number of parallel games
        :param kwargs: options passed to Atari when creating a game. See Atari __init__
        """
        #create atari games
        self.game_kwargs = kwargs
        self.game_title = game_title
        self.games = [
            Atari(self.game_title, **self.game_kwargs) for _ in range(n_games)
        ]

        #initial observations
        self.prev_observations = [atari.reset() for atari in self.games]

        # agent memory variables (if you use recurrent networks)
        self.prev_memory_states = [
            np.zeros((n_games, ) + tuple(mem.output_shape[1:]),
                     dtype=get_layer_dtype(mem)) for mem in agent.agent_states
        ]

        #save agent
        self.agent = agent
        self.agent_step = agent.get_react_function()

        # Create experience replay environment
        self.experience_replay = SessionPoolEnvironment(
            observations=agent.observation_layers,
            actions=agent.action_layers,
            agent_memories=agent.agent_states)
        self.max_size = max_size

    def interact(self, n_steps=100, verbose=False):
        """generate interaction sessions with ataries (openAI gym atari environments)
        Sessions will have length n_steps.
        Each time one of the games finishes, it is immediately reset.


        Actions are picked by the agent step function stored as self.agent_step:
        a function(observations, *memory_states) -> (actions, new memory states).

        params:
            n_steps: length of an interaction
            verbose: if True, prints a small debug message whenever a game is reloaded after it ends.
        returns:
            observation_log,action_log,reward_log,[memory_logs],is_alive_log,info_log
            a bunch of tensors [batch, tick, size...]

            the only exception is info_log, which is a list of infos for [time][batch]
        """
        history_log = []
        for i in range(n_steps):
            res = self.agent_step(self.prev_observations,
                                  *self.prev_memory_states)
            actions, new_memory_states = res[0], res[1:]

            new_observations, cur_rewards, is_done, infos = \
                zip(*map(
                    lambda atari, action: atari.step(action),
                    self.games,
                    actions)
                    )

            new_observations = np.array(new_observations)

            for game_i in range(len(self.games)):
                if is_done[game_i]:
                    new_observations[game_i] = self.games[game_i].reset()

                    for m_i in range(len(new_memory_states)):
                        new_memory_states[m_i][game_i] = 0

                    if verbose:
                        print("atari %i reloaded" % game_i)

            # append observation -> action -> reward tuple
            history_log.append((self.prev_observations, actions, cur_rewards,
                                new_memory_states, is_done, infos))

            self.prev_observations = new_observations
            self.prev_memory_states = new_memory_states

        # cast to numpy arrays
        observation_log, action_log, reward_log, memories_log, is_done_log, info_log = zip(
            *history_log)

        # tensor dimensions
        # [batch_i, time_i, observation_size...]
        observation_log = np.array(observation_log).swapaxes(0, 1)

        # [batch, time, units] for each memory tensor
        memories_log = [np.array(mem).swapaxes(0, 1)
                        for mem in zip(*memories_log)]

        # [batch_i,time_i]
        action_log = np.array(action_log).swapaxes(0, 1)

        # [batch_i, time_i]
        reward_log = np.array(reward_log).swapaxes(0, 1)

        # [batch_i, time_i]
        is_alive_log = 1 - np.array(is_done_log, dtype='int8').swapaxes(0, 1)

        return observation_log, action_log, reward_log, memories_log, is_alive_log, info_log

    def update(self, n_steps=100, append=False, max_size=None):
        """ a function that creates new sessions and ads them into the pool
        throwing the old ones away entirely for simplicity"""

        preceding_memory_states = list(self.prev_memory_states)

        # get interaction sessions
        observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = self.interact(
            n_steps=n_steps)

        # load them into experience replay environment
        if not append:
            self.experience_replay.load_sessions(observation_tensor,
                                                 action_tensor, reward_tensor,
                                                 is_alive_tensor,
                                                 preceding_memory_states)
        else:
            self.experience_replay.append_sessions(observation_tensor,
                                                   action_tensor,
                                                   reward_tensor,
                                                   is_alive_tensor,
                                                   preceding_memory_states,
                                                   max_pool_size=max_size
                                                   or self.max_size)

    def evaluate(self,
                 n_games=1,
                 save_path="./records",
                 record_video=True,
                 verbose=True,
                 t_max=10000):
        """
        Plays an entire game start to end, records the logs (and possibly an mp4 video), returns the total reward.
        :param save_path: where to save the report
        :param record_video: if True, records mp4 video
        :return: total reward (scalar)
        """
        env = Atari(self.game_title, **self.game_kwargs)

        if record_video:
            env.monitor.start(save_path, force=True)
        else:
            env.monitor.start(save_path, lambda i: False, force=True)

        game_rewards = []
        for _ in range(n_games):
            # initial observation
            observation = env.reset()
            # initial memory
            prev_memories = [
                np.zeros((1, ) + tuple(mem.output_shape[1:]),
                         dtype=get_layer_dtype(mem))
                for mem in self.agent.agent_states
            ]

            t = 0
            total_reward = 0
            while True:

                res = self.agent_step(observation[None, ...], *prev_memories)
                action, new_memories = res[0], res[1:]

                observation, reward, done, info = env.step(action[0])
                total_reward += reward
                prev_memories = new_memories

                if done or t >= t_max:
                    if verbose:
                        print(
                            "Episode finished after {} timesteps with reward={}"
                            .format(t + 1, total_reward))
                    break
                t += 1
            game_rewards.append(total_reward)

        env.monitor.close()
        del env
        return np.mean(game_rewards)
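
To make the interact() docstring concrete, here is a sketch of a call and the returned shapes; it assumes a pool of 8 parallel SpaceInvaders-like games built as in Example #4/#10 (raw gym Atari frames are 210x160x3), which is an assumption rather than anything fixed by the code above.

obs_log, action_log, reward_log, mem_logs, is_alive, info_log = pool.interact(n_steps=100)

# obs_log:    [8, 100, 210, 160, 3]   raw frames, laid out as [batch, tick, ...]
# action_log: [8, 100]                integer actions
# reward_log: [8, 100]                per-tick rewards
# mem_logs:   list of [8, 100, units] arrays, one per agent memory state
# is_alive:   [8, 100]                zero at ticks where a game just finished (games auto-reset)
# info_log:   list of per-tick info dicts, indexed [time][batch]
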