Пример #1
0
class MetaController:
    """Goal-selecting agent stacked on top of a lower-level ``controller``.

    The network feeds the controller's ``dnn_output`` features through a GRU
    cell (the only memory state) and a linear layer that yields one Q-value
    per goal; an ``EpsilonGreedyResolver`` turns those Q-values into a goal.
    """

    def __init__(
        self,
        controller,
        gru0_size=128,
    ):
        """Build the goal-selection network and fetch its react function.

        :param controller: lower-level agent; must expose ``dnn_output``
            (a layer with ``output_shape``), ``n_goals`` and
            ``metacontroller_period``
        :param gru0_size: width of the recurrent memory state
        """

        # features produced by the controller at the current tick go here
        self.observed_state = InputLayer(controller.dnn_output.output_shape,
                                         name="cnn output")

        prev_gru0 = InputLayer((None, gru0_size), name='prev gru0')

        self.gru0 = GRUCell(prev_state=prev_gru0,
                            input_or_inputs=self.observed_state)

        # new memory layer -> input layer that holds its previous value
        memory_dict = {self.gru0: prev_gru0}

        # one linear Q-value per goal
        q_eval = DenseLayer(self.gru0,
                            num_units=controller.n_goals,
                            nonlinearity=lasagne.nonlinearities.linear,
                            name="QEvaluator")

        # goal picker
        self.resolver = EpsilonGreedyResolver(q_eval, name="resolver")

        # all together; the react function outputs the chosen goal first and
        # the raw Q-values second (see ``step``)
        self.agent = Agent(self.observed_state, memory_dict, q_eval,
                           [self.resolver, q_eval])

        self.controller = controller
        self.observation_shape = controller.dnn_output.output_shape
        self.n_goals = controller.n_goals
        self.period = controller.metacontroller_period
        self.applier_fun = self.agent.get_react_function()

        self.weights = lasagne.layers.get_all_params(self.resolver,
                                                     trainable=True)

    def step(self, observation, prev_memories, batch_size):
        """Run one tick of the agent.

        :param observation: batch of observations; converted with ``np.array``
        :param prev_memories: iterable of previous memory arrays, one per
            entry of ``self.agent.agent_states`` (here: the previous GRU
            state), or the string ``'zeros'`` to start from all-zero
            float32 memories
        :param batch_size: batch size used to shape the zero memories
        :returns: ``(action, memories, max_q)`` where ``max_q`` is the maximum
            of the Q-values over the last axis
        """
        # Only a real string can be the sentinel. Comparing an ndarray with
        # ``==`` is elementwise and its truth value is ambiguous.
        if isinstance(prev_memories, str) and prev_memories == 'zeros':
            prev_memories = [
                np.zeros((batch_size, ) + tuple(mem.output_shape[1:]),
                         dtype='float32') for mem in self.agent.agent_states
            ]
        res = self.applier_fun(np.array(observation), *prev_memories)
        action, q_eval = res[:2]
        memories = res[2:]
        return action, memories, q_eval.max(axis=-1)
Пример #2
0
    def build_model(self):
        """Assemble the Q-network around ``self.observation_layer``.

        :returns: ``(resolver, agent)`` - the epsilon-greedy action layer and
            the ``Agent`` that ties observation, memory and policy together
        """
        # move the channel axis forward -> [batch, color, x, y] so that the
        # convolutions work, then downsample with 2x2 pooling
        frames = DimshuffleLayer(self.observation_layer, (0, 3, 1, 2))
        frames = Pool2DLayer(frames, pool_size=(2, 2))

        # memory: a window holding the last `n_frames` processed frames
        n_frames = 5
        frame_shape = tuple(frames.output_shape[1:])
        window_in = InputLayer((None, n_frames) + frame_shape,
                               name="previous window state")
        window_out = WindowAugmentation(frames,
                                        window_in,
                                        name="new window state")
        memory_dict = {window_out: window_in}

        # collapse the temporal axis with a pixel-wise maximum
        # (to avoid flickering)
        body = ExpressionLayer(
            window_out,
            lambda stacked: stacked.max(axis=1),
            output_shape=(None, ) + window_out.output_shape[2:])

        # neural network body: two batch-normalised convolutions + dense
        conv_specs = (
            (16, (8, 8), (4, 4)),
            (32, (4, 4), (2, 2)),
        )
        for n_filters, kernel, stride in conv_specs:
            body = batch_norm(
                lasagne.layers.Conv2DLayer(body,
                                           num_filters=n_filters,
                                           filter_size=kernel,
                                           stride=stride))
        body = batch_norm(lasagne.layers.DenseLayer(body, num_units=256))

        # linear Q-value head, one unit per action
        q_head = DenseLayer(body,
                            num_units=self.n_actions,
                            nonlinearity=lasagne.nonlinearities.linear,
                            name="QEvaluator")

        # action picker
        resolver = EpsilonGreedyResolver(q_head, name="resolver")

        # all together
        agent = Agent(self.observation_layer, memory_dict, q_head, resolver)

        return resolver, agent
Пример #3
0
    def __init__(
        self,
        controller,
        gru0_size=128,
    ):
        """Build a goal-selection network on top of ``controller``.

        :param controller: lower-level agent exposing ``dnn_output``,
            ``n_goals`` and ``metacontroller_period``
        :param gru0_size: number of units of the recurrent memory state
        """
        # the controller's dense features at the current tick are the input
        self.observed_state = InputLayer(
            controller.dnn_output.output_shape,
            name="cnn output",
        )

        # recurrent memory: previous state in, updated state out
        gru_prev = InputLayer((None, gru0_size), name='prev gru0')
        self.gru0 = GRUCell(
            prev_state=gru_prev,
            input_or_inputs=self.observed_state,
        )
        memory = {self.gru0: gru_prev}

        # linear head with one Q-value per goal
        goal_qvalues = DenseLayer(
            self.gru0,
            num_units=controller.n_goals,
            nonlinearity=lasagne.nonlinearities.linear,
            name="QEvaluator",
        )

        # goal picker
        self.resolver = EpsilonGreedyResolver(goal_qvalues, name="resolver")

        # the agent emits both the picked goal and the raw Q-values
        agent_outputs = [self.resolver, goal_qvalues]
        self.agent = Agent(self.observed_state, memory, goal_qvalues,
                           agent_outputs)

        # bookkeeping copied from the controller
        self.controller = controller
        self.observation_shape = controller.dnn_output.output_shape
        self.n_goals = controller.n_goals
        self.period = controller.metacontroller_period

        self.applier_fun = self.agent.get_react_function()

        self.weights = lasagne.layers.get_all_params(
            self.resolver,
            trainable=True,
        )
Пример #4
0
def build_agent(action_shape, state_shape):
    """Create a small policy/value agent.

    :param action_shape: shape of the (multi-dimensional) action output; the
        policy head has ``np.prod(action_shape)`` units
    :param state_shape: shape of a single observation (without batch axis)
    :returns: ``(agent, action_layer, V_layer)``
    """
    observation_layer = InputLayer((None, ) + tuple(state_shape))

    # single hidden layer shared by both heads
    hidden = DenseLayer(
        observation_layer,
        10,
        nonlinearity=lasagne.nonlinearities.sigmoid,
        name='dense1',
    )

    # policy head: softmax over all flattened actions ...
    flat_policy = DenseLayer(
        hidden,
        num_units=np.prod(action_shape),
        nonlinearity=lasagne.nonlinearities.softmax,
        name="q-evaluator layer",
    )
    # ... reshaped back to [batch, *action_shape]
    shaped_policy = ReshapeLayer(flat_policy, ([0], ) + tuple(action_shape))

    # value head: a single linear unit
    V_layer = DenseLayer(hidden, 1, nonlinearity=None, name="state values")

    # pick actions at random, proportionally to the predicted probabilities
    action_layer = MultiProbabilisticResolver(
        shaped_policy,
        name="e-greedy action picker",
        assume_normalized=True,
    )

    # all together
    estimators = (flat_policy, V_layer)
    agent = Agent(
        observation_layers=observation_layer,
        policy_estimators=estimators,
        action_layers=action_layer,
    )

    return agent, action_layer, V_layer
Пример #5
0
# Show the output shape of the Q-values layer defined earlier in the script.
print(get_output_shape(qvalues_layer))

# Alternative (disabled): a plain dense Q-values head.
#qvalues_layer = DenseLayer(dense,n_actions,nonlinearity=None,name='qval')
# Choose actions from qvalues_layer with an epsilon-greedy resolver.
from agentnet.resolver import EpsilonGreedyResolver
action_layer = EpsilonGreedyResolver(qvalues_layer)

# Wrap the Q-values layer in a target network and keep its output layers
# as a second estimator ("old" Q-values).
from agentnet.target_network import TargetNetwork
targetnet = TargetNetwork(qvalues_layer)
qvalues_old = targetnet.output_layers
from agentnet.agent import Agent
# All together: the agent exposes both the current and the target Q-values.
agent = Agent(
    observation_layers=observation_layer,
    policy_estimators=(qvalues_layer, qvalues_old),
    # No recurrent memory is registered (disabled example below).
    #agent_states={conv_rec:in_conv_rec},
    action_layers=action_layer)

# Since it's a single lasagne network, one can get its weights, output, etc.
weights = lasagne.layers.get_all_params(qvalues_layer, trainable=True)
# Bare expression: displays the list when run as a notebook cell, no-op otherwise.
weights

from agentnet.experiments.openai_gym.pool import EnvPool

# Pool of N_AGENTS environments built by make_env and played by the agent.
pool = EnvPool(agent, make_env, N_AGENTS)

## %%time
## Disabled example: interact for 10 ticks.
#_,action_log,reward_log,_,_,_  = pool.interact(10)
#
Пример #6
0
def test_space_invaders(
    game_title='SpaceInvaders-v0',
    n_parallel_games=3,
    replay_seq_len=2,
):
    """Build a tiny Atari agent and run a few training epochs on it.

    One shared network body feeds Q-values, action probabilities and state
    values, so the qlearning, sarsa, n-step qlearning and n-step a2c
    objectives can all be summed into a single loss.

    :param game_title: name of atari game in Gym
    :param n_parallel_games: how many games we run in parallel
    :param replay_seq_len: how long is one replay session from a batch
    """

    # --- game parameters (the probe environment is dropped afterwards)
    probe_env = gym.make(game_title)
    probe_env.reset()

    n_actions = probe_env.action_space.n
    observation_shape = (None, ) + probe_env.observation_space.shape
    action_names = probe_env.get_action_meanings()
    del probe_env

    # --- observations
    # image observation at current tick goes here
    observation_layer = InputLayer(observation_shape, name="images input")

    # move channels forward -> [batch, color, x, y]
    channels_first = DimshuffleLayer(observation_layer, (0, 3, 1, 2))

    # --- memory: a window over the last `window_size` frames
    window_size = 3
    frame_shape = tuple(channels_first.output_shape[1:])

    prev_window = InputLayer((None, window_size) + frame_shape,
                             name="previous window state")
    window = WindowAugmentation(channels_first,
                                prev_window,
                                name="new window state")

    memory_dict = {window: prev_window}

    # --- network body
    # pixel-wise maximum over the temporal window (to avoid flickering)
    window_max = ExpressionLayer(
        window,
        lambda frames: frames.max(axis=1),
        output_shape=(None, ) + window.output_shape[2:])

    # a deliberately small dense body
    hidden = DenseLayer(window_max, num_units=50, name='dense0')

    # --- heads
    # Q-values for the q-learning style objectives
    q_eval = DenseLayer(hidden,
                        num_units=n_actions,
                        nonlinearity=lasagne.nonlinearities.linear,
                        name="QEvaluator")

    # action probabilities and state value for a2c
    policy_eval = DenseLayer(hidden,
                             num_units=n_actions,
                             nonlinearity=lasagne.nonlinearities.softmax,
                             name="a2c action probas")
    state_value_eval = DenseLayer(hidden,
                                  num_units=1,
                                  nonlinearity=None,
                                  name="a2c state values")

    # actions are drawn from the policy head
    resolver = ProbabilisticResolver(policy_eval, name="resolver")

    # --- agent
    estimator_layers = (q_eval, policy_eval, state_value_eval)
    agent = Agent(observation_layer, memory_dict, estimator_layers, resolver)

    # trainable parameters reachable from the resolver
    weights = lasagne.layers.get_all_params(resolver, trainable=True)

    # --- one-step function
    print('compiling react')
    applier_fun = agent.get_react_function()

    def step(observation, prev_memories='zeros', batch_size=n_parallel_games):
        """Return ``(action, new_memories)`` for one tick.

        ``prev_memories`` is the list of previous memory states
        ([prev window,] in this setup) or ``'zeros'`` to start from scratch.
        """
        if prev_memories == 'zeros':
            prev_memories = []
            for mem in agent.agent_states:
                zero_shape = (batch_size, ) + tuple(mem.output_shape[1:])
                prev_memories.append(np.zeros(zero_shape, dtype='float32'))
        outputs = applier_fun(np.array(observation), *prev_memories)
        return outputs[0], outputs[1:]

    # --- pool of parallel atari sessions
    pool = GamePool(game_title, n_parallel_games)

    observation_log, action_log, reward_log, _, _, _ = pool.interact(step, 50)

    # names of the first few actions taken in the first few games
    taken = np.array(action_log)[:3, :5]
    print(np.array(action_names)[taken])

    # --- experience replay environment with all default parameters
    env = SessionPoolEnvironment(observations=observation_layer,
                                 actions=resolver,
                                 agent_memories=agent.agent_states)

    def update_pool(env, pool, n_steps=100):
        """Play ``n_steps`` new ticks and load them into ``env``,
        throwing the old sessions away entirely for simplicity."""

        # memory states held by the pool before the new interaction
        preceding_memory_states = list(pool.prev_memory_states)

        observations, actions, rewards, _, is_alive, _ = pool.interact(
            step, n_steps=n_steps)

        env.load_sessions(observations, actions, rewards, is_alive,
                          preceding_memory_states)

    # load first sessions
    update_pool(env, pool, replay_seq_len)

    # --- training via experience replay
    # replay the stored sessions through the agent to get all estimators
    _, _, _, _, estimators = agent.get_sessions(
        env,
        session_length=replay_seq_len,
        batch_size=env.batch_size,
        optimize_experience_replay=True,
    )
    q_values_sequence, policy_sequence, value_sequence = estimators

    # --- loss
    # rewards are used unscaled
    scaled_reward_seq = env.rewards

    elwise_mse_loss = 0.

    # 1-step algorithms
    for algo in (qlearning, sarsa):
        elwise_mse_loss += algo.get_elementwise_objective(
            q_values_sequence,
            env.actions[0],
            scaled_reward_seq,
            env.is_alive,
            gamma_or_gammas=0.99,
        )

    # n-step q-learning for horizons below, at and above the replay length
    horizons = (1, 3, replay_seq_len - 1, replay_seq_len, replay_seq_len + 1,
                None)
    for n in horizons:
        elwise_mse_loss += qlearning_n_step.get_elementwise_objective(
            q_values_sequence,
            env.actions[0],
            scaled_reward_seq,
            env.is_alive,
            gamma_or_gammas=0.99,
            n_steps=n)

    # n-step advantage actor-critic
    state_values = value_sequence[:, :, 0]
    elwise_mse_loss += a2c_n_step.get_elementwise_objective(
        policy_sequence,
        state_values,
        env.actions[0],
        scaled_reward_seq,
        env.is_alive,
        gamma_or_gammas=0.99,
        n_steps=3)

    # mean over "alive" fragments
    mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

    # L2 penalty on network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10**-4

    loss = mse_loss + reg_l2

    # weight updates
    updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01)

    # mean session reward
    mean_session_reward = env.rewards.sum(axis=1).mean()

    # --- compile train and evaluation functions
    print('compiling')
    train_fun = theano.function([], [loss, mean_session_reward],
                                updates=updates)
    evaluation_fun = theano.function(
        [], [loss, mse_loss, reg_l2, mean_session_reward])
    print("I've compiled!")

    # --- training loop
    for epoch_counter in range(10):
        update_pool(env, pool, replay_seq_len)
        train_fun()
        full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun()

        print("epoch %i,loss %.5f, rewards: %.5f " %
              (epoch_counter, full_loss, avg_reward_current))
        print("rec %.3f reg %.3f" % (q_loss, l2_penalty))
Пример #7
0
def test_memory(
    game_title='SpaceInvaders-v0',
    n_parallel_games=3,
    replay_seq_len=2,
):
    """Train an agent that carries one memory state of every supported kind.

    The agent keeps a frame window, a stack, an RNN cell, a GRU cell, a GRU
    memory layer and two LSTM cells (with and without peepholes) at once and
    is trained for a few epochs with q-learning.

    :param game_title: name of atari game in Gym
    :param n_parallel_games: how many games we run in parallel
    :param replay_seq_len: how long is one replay session from a batch
    """

    # --- game parameters (the probe environment is dropped afterwards)
    probe_env = gym.make(game_title)
    probe_env.reset()

    n_actions = probe_env.action_space.n
    observation_shape = (None, ) + probe_env.observation_space.shape
    action_names = probe_env.get_action_meanings()
    del probe_env

    # --- observations
    # image observation at current tick goes here
    observation_layer = InputLayer(observation_shape, name="images input")

    # move channels forward -> [batch, color, x, y]
    channels_first = DimshuffleLayer(observation_layer, (0, 3, 1, 2))

    # --- memory states: new state layer -> previous state input,
    # kept ordered so the states have a fixed order
    memory_dict = OrderedDict()

    # window over the last `window_size` frames
    window_size = 3
    frame_shape = tuple(channels_first.output_shape[1:])

    prev_window = InputLayer((None, window_size) + frame_shape,
                             name="previous window state")
    window = WindowAugmentation(channels_first,
                                prev_window,
                                name="new window state")

    # pixel-wise maximum over the temporal window (to avoid flickering)
    window_max = ExpressionLayer(
        window,
        lambda frames: frames.max(axis=1),
        output_shape=(None, ) + window.output_shape[2:])

    memory_dict[window] = prev_window

    # stack memory: values to push and 3 softmax control signals
    stack_w, stack_h = 4, 5
    stack_inputs = DenseLayer(channels_first, stack_w, name="prev_stack")
    stack_controls = DenseLayer(channels_first,
                                3,
                                nonlinearity=lasagne.nonlinearities.softmax,
                                name="prev_stack")
    prev_stack = InputLayer((None, stack_h, stack_w),
                            name="previous stack state")
    stack = StackAugmentation(stack_inputs, prev_stack, stack_controls)
    memory_dict[stack] = prev_stack

    # index 0 along axis 1 of the stack
    stack_top = lasagne.layers.SliceLayer(stack, 0, 1)

    # vanilla RNN cell
    prev_rnn = InputLayer((None, 16), name="previous RNN state")
    new_rnn = RNNCell(prev_rnn, channels_first)
    memory_dict[new_rnn] = prev_rnn

    # GRU cell
    prev_gru = InputLayer((None, 16), name="previous GRUcell state")
    new_gru = GRUCell(prev_gru, channels_first)
    memory_dict[new_gru] = prev_gru

    # GRU memory layer
    prev_gru1 = InputLayer((None, 15), name="previous GRUcell state")
    new_gru1 = GRUMemoryLayer(15, channels_first, prev_gru1)
    memory_dict[new_gru1] = prev_gru1

    # LSTM with peepholes
    prev_lstm0_cell = InputLayer(
        (None, 13), name="previous LSTMCell hidden state [with peepholes]")
    prev_lstm0_out = InputLayer(
        (None, 13), name="previous LSTMCell output state [with peepholes]")

    new_lstm0_cell, new_lstm0_out = LSTMCell(
        prev_lstm0_cell,
        prev_lstm0_out,
        input_or_inputs=channels_first,
        peepholes=True,
        name="newLSTM1 [with peepholes]")

    memory_dict[new_lstm0_cell] = prev_lstm0_cell
    memory_dict[new_lstm0_out] = prev_lstm0_out

    # LSTM without peepholes
    prev_lstm1_cell = InputLayer(
        (None, 14), name="previous LSTMCell hidden state [no peepholes]")
    prev_lstm1_out = InputLayer(
        (None, 14), name="previous LSTMCell output state [no peepholes]")

    new_lstm1_cell, new_lstm1_out = LSTMCell(
        prev_lstm1_cell,
        prev_lstm1_out,
        input_or_inputs=channels_first,
        peepholes=False,
        name="newLSTM1 [no peepholes]")

    memory_dict[new_lstm1_cell] = prev_lstm1_cell
    memory_dict[new_lstm1_out] = prev_lstm1_out

    # --- concatenate every memory output into one feature vector
    flat_window = flatten(window_max)
    non_lstm_features = [flat_window, stack_top, new_rnn, new_gru, new_gru1]

    # debug output: shapes of the non-LSTM parts
    for layer in non_lstm_features:
        print(layer.output_shape)

    all_memory = concat(non_lstm_features + [new_lstm0_out, new_lstm1_out])

    # --- network body
    hidden = DenseLayer(all_memory, num_units=50, name='dense0')

    # linear Q-value head
    q_eval = DenseLayer(hidden,
                        num_units=n_actions,
                        nonlinearity=lasagne.nonlinearities.linear,
                        name="QEvaluator")

    # epsilon-greedy action picker
    resolver = EpsilonGreedyResolver(q_eval, epsilon=0.1, name="resolver")

    # --- agent
    agent = Agent(observation_layer, memory_dict, q_eval, resolver)

    # trainable parameters reachable from the resolver
    weights = lasagne.layers.get_all_params(resolver, trainable=True)

    # --- one-step function
    print('compiling react')
    applier_fun = agent.get_react_function()

    def step(observation, prev_memories='zeros', batch_size=n_parallel_games):
        """Return ``(action, new_memories)`` for one tick.

        ``prev_memories`` is the list of previous memory states, one per
        entry of ``agent.agent_states``, or ``'zeros'`` to start from scratch.
        """
        if prev_memories == 'zeros':
            prev_memories = []
            for mem in agent.agent_states:
                zero_shape = (batch_size, ) + tuple(mem.output_shape[1:])
                prev_memories.append(np.zeros(zero_shape, dtype='float32'))
        outputs = applier_fun(np.array(observation), *prev_memories)
        return outputs[0], outputs[1:]

    # --- pool of parallel atari sessions
    pool = GamePool(game_title, n_parallel_games)

    observation_log, action_log, reward_log, _, _, _ = pool.interact(step, 50)

    # names of the first few actions taken in the first few games
    taken = np.array(action_log)[:3, :5]
    print(np.array(action_names)[taken])

    # --- experience replay environment with all default parameters
    env = SessionPoolEnvironment(observations=observation_layer,
                                 actions=resolver,
                                 agent_memories=agent.agent_states)

    def update_pool(env, pool, n_steps=100):
        """Play ``n_steps`` new ticks and load them into ``env``,
        throwing the old sessions away entirely for simplicity."""

        # memory states held by the pool before the new interaction
        preceding_memory_states = list(pool.prev_memory_states)

        observations, actions, rewards, _, is_alive, _ = pool.interact(
            step, n_steps=n_steps)

        env.load_sessions(observations, actions, rewards, is_alive,
                          preceding_memory_states)

    # load first sessions
    update_pool(env, pool, replay_seq_len)

    # --- training via experience replay
    # replay the stored sessions through the agent to get its Q-values
    _, _, _, _, q_values_sequence = agent.get_sessions(
        env,
        session_length=replay_seq_len,
        batch_size=env.batch_size,
        optimize_experience_replay=True,
    )

    # --- loss
    # rewards are used unscaled
    scaled_reward_seq = env.rewards

    elwise_mse_loss = qlearning.get_elementwise_objective(
        q_values_sequence,
        env.actions[0],
        scaled_reward_seq,
        env.is_alive,
        gamma_or_gammas=0.99,
    )

    # mean over "alive" fragments
    mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

    # L2 penalty on network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10**-4

    loss = mse_loss + reg_l2

    # weight updates
    updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01)

    # mean session reward
    mean_session_reward = env.rewards.sum(axis=1).mean()

    # --- compile train and evaluation functions
    print('compiling')
    train_fun = theano.function([], [loss, mean_session_reward],
                                updates=updates)
    evaluation_fun = theano.function(
        [], [loss, mse_loss, reg_l2, mean_session_reward])
    print("I've compiled!")

    # --- training loop
    for epoch_counter in range(10):
        update_pool(env, pool, replay_seq_len)
        train_fun()
        full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun()

        print("epoch %i,loss %.5f, rewards: %.5f " %
              (epoch_counter, full_loss, avg_reward_current))
        print("rec %.3f reg %.3f" % (q_loss, l2_penalty))
Пример #8
0
#we need to define the new input map because concatenated_memory is a ConcatLayer and does not have default one


def custom_input_map(last_hidden, observation):
    """Route the concatenated memory and the observation to input layers.

    The first ``n_hid_1`` columns of ``last_hidden`` go to
    ``_prev_gru1_layer``, the following ``n_hid_2`` columns go to
    ``_prev_gru2_layer``, and ``observation`` goes to ``_observation_layer``.
    """
    gru1_end = n_hid_1
    gru2_end = n_hid_1 + n_hid_2

    inputs = {}
    inputs[_prev_gru1_layer] = last_hidden[:, 0:gru1_end]
    inputs[_prev_gru2_layer] = last_hidden[:, gru1_end:gru2_end]
    inputs[_observation_layer] = observation
    return inputs


# All together: the agent's memory is the concatenated GRU state, so the
# custom input map is needed to split it back into the per-layer inputs.
agent = Agent(concatenated_memory,
              q_eval,
              resolver,
              input_map=custom_input_map)

# Fetch the pre-trained snapshot once; skip the download when the file exists.

snapshot_path = "./demo_stand.qlearning_3_step.epoch60000.pcl"
snapshot_url = "https://www.dropbox.com/s/vz4hz5tpm0u2zkw/demo_stand.qlearning_3_step.epoch60000.pcl?dl=1"

from agentnet.utils import load
if not os.path.isfile(snapshot_path):
    # print() with one argument works on both Python 2 and Python 3; the
    # bare print statement was a SyntaxError on Python 3.
    print("loading snapshot...")
    # urlretrieve lives in different modules on Python 2 and Python 3
    if sys.version_info[0] == 2:
        from urllib import urlretrieve
    else:
        from urllib.request import urlretrieve
    urlretrieve(snapshot_url, snapshot_path)
    def __init__(self,
                 observation_shape,
                 n_actions,
                 n_goals=32,
                 metacontroller_period=5,
                 window_size=3,
                 embedding_size=128,
                 ):
        """Build the goal-conditioned controller network.

        :param observation_shape: shape of the image input, batch axis first
        :param n_actions: number of Q-values / actions
        :param n_goals: number of distinct goals that can be embedded
        :param metacontroller_period: stored as-is on the instance
        :param window_size: how many frames the window memory holds
        :param embedding_size: size of the goal embedding vector
        """
        # image observation at current tick goes here
        self.observation_layer = InputLayer(observation_shape,
                                            name="images input")

        # move channels forward -> [batch, color, x, y] for the convolutions,
        # then average-pool 2x2
        frames = DimshuffleLayer(self.observation_layer, (0, 3, 1, 2))
        frames = lasagne.layers.Pool2DLayer(frames, (2, 2),
                                            mode='average_inc_pad')

        # window memory: previous state in, updated state out
        frame_shape = tuple(frames.output_shape[1:])
        prev_window = InputLayer((None, window_size) + frame_shape,
                                 name="previous window state")
        window = WindowAugmentation(frames,
                                    prev_window,
                                    name="new window state")

        # pixel-wise maximum over the temporal window (to avoid flickering)
        window_max = ExpressionLayer(
            window,
            lambda stacked: stacked.max(axis=1),
            output_shape=(None,) + window.output_shape[2:])

        memory_dict = {window: prev_window}

        # three batch-normalised convolutions followed by a dense layer
        conv_specs = (
            (16, 8, (4, 4), 'cnn0'),
            (32, 4, (2, 2), 'cnn1'),
            (64, 4, (2, 2), 'cnn2'),
        )
        body = window_max
        for n_filters, kernel, stride, layer_name in conv_specs:
            body = batch_norm(Conv2DLayer(body, n_filters,
                                          filter_size=kernel,
                                          stride=stride,
                                          name=layer_name))

        # dense features, also exposed as `dnn_output`
        self.dnn_output = body = DenseLayer(body, num_units=256,
                                            name='dense1')

        # integer goal id -> embedding vector
        self.goal_layer = InputLayer((None,), T.ivector(), name='boss goal')
        self.goal_layer.output_dtype = 'int32'
        goal_emb = EmbeddingLayer(self.goal_layer, n_goals, embedding_size)

        # condition the Q-values on the goal
        body = lasagne.layers.ConcatLayer([goal_emb, body])

        # linear Q-value head
        q_eval = DenseLayer(body,
                            num_units=n_actions,
                            nonlinearity=lasagne.nonlinearities.linear,
                            name="QEvaluator")

        # epsilon-greedy action picker
        self.resolver = EpsilonGreedyResolver(q_eval, name="resolver")

        # all together: the agent emits the action and the dense features
        agent_inputs = [self.observation_layer, self.goal_layer]
        agent_outputs = [self.resolver, self.dnn_output]
        self.agent = Agent(agent_inputs,
                           memory_dict,
                           q_eval,
                           agent_outputs)

        # keep the construction parameters around
        self.observation_shape = observation_shape
        self.n_actions = n_actions
        self.n_goals = n_goals
        self.metacontroller_period = metacontroller_period
        self.window_size = window_size
        self.embedding_size = embedding_size

        self.applier_fun = self.agent.get_react_function()

        self.weights = lasagne.layers.get_all_params(self.resolver,
                                                     trainable=True)
Пример #10
0
def test_space_invaders(game_title='SpaceInvaders-v0',
                        n_parallel_games=3,
                        replay_seq_len=2,
                        ):
    """Smoke test: train a window-memory agent on an Atari game for a few iterations.

    One dense body feeds three heads (Q-values, action probabilities, state
    values). Sessions are collected through a GamePool, loaded into a
    SessionPoolEnvironment and replayed for 10 training iterations on the sum
    of the q-learning, SARSA, n-step q-learning and n-step A2C elementwise
    objectives plus an L2 penalty.

    :param game_title: name of atari game in Gym
    :param n_parallel_games: how many games we run in parallel
    :param replay_seq_len: how long is one replay session from a batch
    """

    # a throwaway game instance, only used to read the game parameters
    probe = gym.make(game_title)
    probe.reset()

    n_actions = probe.action_space.n
    observation_shape = (None,) + probe.observation_space.shape
    action_names = probe.get_action_meanings()
    del probe

    # ----- agent observations -----
    # image observation at the current tick goes here
    observation_layer = InputLayer(observation_shape, name="images input")

    # reorder axes to [batch, color, x, y] so that convolutional layers would work correctly
    observation_reshape = DimshuffleLayer(observation_layer, (0, 3, 1, 2))

    # ----- agent memory: a window over recent observations -----
    window_size = 3
    window_shape = (None, window_size) + tuple(observation_reshape.output_shape[1:])

    prev_window = InputLayer(window_shape, name="previous window state")
    window = WindowAugmentation(observation_reshape,
                                prev_window,
                                name="new window state")

    # new memory state -> the input layer that receives it on the next tick
    memory_dict = {window: prev_window}

    # ----- network body -----
    # pixel-wise maximum over the temporal window (to avoid flickering)
    window_max = ExpressionLayer(
        window,
        lambda frames: frames.max(axis=1),
        output_shape=(None,) + window.output_shape[2:],
    )

    # a single dense layer shared by every head below
    hidden = DenseLayer(window_max, num_units=50, name='dense0')

    # ----- heads -----
    q_eval = DenseLayer(hidden,
                        num_units=n_actions,
                        nonlinearity=lasagne.nonlinearities.linear,
                        name="QEvaluator")

    # extra heads that exist only to feed the a2c objective
    policy_eval = DenseLayer(hidden,
                             num_units=n_actions,
                             nonlinearity=lasagne.nonlinearities.softmax,
                             name="a2c action probas")
    state_value_eval = DenseLayer(hidden,
                                  num_units=1,
                                  nonlinearity=None,
                                  name="a2c state values")

    # actions are picked from the policy head
    resolver = ProbabilisticResolver(policy_eval, name="resolver")

    agent = Agent(observation_layer,
                  memory_dict,
                  (q_eval, policy_eval, state_value_eval),
                  resolver)

    # everything hangs off one lasagne graph, so its trainable params can be collected at once
    weights = lasagne.layers.get_all_params(resolver, trainable=True)

    # ----- one-step react function -----
    print('compiling react')
    applier_fun = agent.get_react_function()

    def step(observation, prev_memories='zeros', batch_size=n_parallel_games):
        """Run one agent tick; return (action, new memory states).

        Passing the string 'zeros' as prev_memories starts from all-zero
        float32 memory states, one per entry of agent.agent_states."""
        if prev_memories == 'zeros':
            prev_memories = []
            for state_layer in agent.agent_states:
                blank_shape = (batch_size,) + tuple(state_layer.output_shape[1:])
                prev_memories.append(np.zeros(blank_shape, dtype='float32'))

        outputs = applier_fun(np.array(observation), *prev_memories)
        return outputs[0], outputs[1:]

    # ----- pool of parallel game sessions -----
    pool = GamePool(game_title, n_parallel_games)

    _, action_log, _, _, _, _ = pool.interact(step, 50)

    # show what the agent did: first 3 rows, first 5 columns of the action log
    names = np.array(action_names)
    print(names[np.array(action_log)[:3, :5]])

    # ----- experience replay environment, all default parameters -----
    env = SessionPoolEnvironment(observations=observation_layer,
                                 actions=resolver,
                                 agent_memories=agent.agent_states)

    def update_pool(env, pool, n_steps=100):
        """Play n_steps fresh ticks and load them into env,
        discarding whatever sessions env held before."""
        # memory states as they were before the new interaction starts
        memories_before = list(pool.prev_memory_states)

        observations, actions, rewards, _, is_alive, _ = pool.interact(step, n_steps=n_steps)

        env.load_sessions(observations, actions, rewards, is_alive, memories_before)

    # load first sessions
    update_pool(env, pool, replay_seq_len)

    # ----- training via experience replay -----
    # replay the stored sessions through the agent; only the estimator outputs are used below
    _, _, _, _, estimators = agent.get_sessions(
        env,
        session_length=replay_seq_len,
        batch_size=env.batch_size,
        optimize_experience_replay=True,
    )
    q_values_sequence, policy_sequence, value_sequence = estimators

    # rewards are used unscaled
    scaled_reward_seq = env.rewards

    def q_objective(algo, **extra_kwargs):
        """Elementwise objective of `algo` on the replayed Q-values, gamma fixed at 0.99."""
        return algo.get_elementwise_objective(q_values_sequence,
                                              env.actions[0],
                                              scaled_reward_seq,
                                              env.is_alive,
                                              gamma_or_gammas=0.99,
                                              **extra_kwargs)

    elwise_mse_loss = 0.

    # 1-step algorithms
    for algo in (qlearning, sarsa):
        elwise_mse_loss += q_objective(algo)

    # n-step q-learning with horizons below, at and above the session length, plus None
    for horizon in (1, 3, replay_seq_len - 1, replay_seq_len, replay_seq_len + 1, None):
        elwise_mse_loss += q_objective(qlearning_n_step, n_steps=horizon)

    # n-step a2c on the policy / state-value heads
    elwise_mse_loss += a2c_n_step.get_elementwise_objective(
        policy_sequence,
        value_sequence[:, :, 0],
        env.actions[0],
        scaled_reward_seq,
        env.is_alive,
        gamma_or_gammas=0.99,
        n_steps=3,
    )

    # compute mean over "alive" fragments
    mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

    # L2 penalty on the network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10 ** -4

    loss = mse_loss + reg_l2

    updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01)

    # mean session reward
    mean_session_reward = env.rewards.sum(axis=1).mean()

    # ----- compile train and evaluation functions -----
    print('compiling')
    train_fun = theano.function([], [loss, mean_session_reward], updates=updates)
    evaluation_fun = theano.function([], [loss, mse_loss, reg_l2, mean_session_reward])
    print("I've compiled!")

    # ----- training loop -----
    for epoch_counter in range(10):
        update_pool(env, pool, replay_seq_len)
        train_loss, train_reward = train_fun()
        full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun()

        print("epoch %i,loss %.5f, rewards: %.5f " % (
            epoch_counter, full_loss, avg_reward_current))
        print("rec %.3f reg %.3f" % (q_loss, l2_penalty))
Пример #11
0
# action resolver built on top of q_eval; `q_eval` and `epsilon` are defined outside this excerpt
resolver = EpsilonGreedyResolver(q_eval,epsilon=epsilon,name="resolver")



#we need to define the new input map because concatenated_memory is a ConcatLayer and does not have default one

def custom_input_map(last_hidden,observation):
    """Map the concatenated memory state and the observation to their input layers.

    The first n_hid_1 columns of last_hidden go to _prev_gru1_layer, the next
    n_hid_2 columns go to _prev_gru2_layer, and observation is passed to
    _observation_layer unchanged.
    """
    gru1_end = n_hid_1
    gru2_end = n_hid_1 + n_hid_2

    input_map = {}
    input_map[_prev_gru1_layer] = last_hidden[:, 0:gru1_end]
    input_map[_prev_gru2_layer] = last_hidden[:, gru1_end:gru2_end]
    input_map[_observation_layer] = observation
    return input_map

# all together: the agent is assembled from the concatenated memory layer, the Q-value
# layer and the resolver; custom_input_map is passed as the explicit input map
agent = Agent(concatenated_memory,q_eval,resolver,input_map=custom_input_map
             )




## load weights from snapshot

# local file the snapshot is looked for at
snapshot_path ="./demo_stand.qlearning_3_step.epoch60000.pcl"
# remote copy of the same snapshot (presumably fetched when the local file is missing;
# the download code is cut off in this excerpt)
snapshot_url = "https://www.dropbox.com/s/vz4hz5tpm0u2zkw/demo_stand.qlearning_3_step.epoch60000.pcl?dl=1"

from agentnet.utils import load
if not os.path.isfile(snapshot_path):
    print "loading snapshot..."
    if sys.version_info[0] == 2:
        from urllib import urlretrieve
    else:
Пример #12
0
def test_memory(game_title='SpaceInvaders-v0',
                n_parallel_games=3,
                replay_seq_len=2,
                ):
    """Smoke test: one agent carrying every available kind of memory at once.

    The agent gets a frame window, a stack, an RNN cell, a GRU cell, a
    GRUMemoryLayer and two LSTM cells (with and without peepholes). Their
    outputs are concatenated into one vector that feeds a small Q-network,
    which is then trained with q-learning on replayed sessions for 10
    iterations.

    :param game_title: name of atari game in Gym
    :param n_parallel_games: how many games we run in parallel
    :param replay_seq_len: how long is one replay session from a batch
    """

    # a throwaway game instance, only used to read the game parameters
    probe = gym.make(game_title)
    probe.reset()

    n_actions = probe.action_space.n
    observation_shape = (None,) + probe.observation_space.shape
    action_names = probe.get_action_meanings()
    del probe

    # ----- agent observations -----
    # image observation at the current tick goes here
    observation_layer = InputLayer(observation_shape, name="images input")

    # reorder axes to [batch, color, x, y] so that convolutional layers would work correctly
    observation_reshape = DimshuffleLayer(observation_layer, (0, 3, 1, 2))

    # ----- agent memory states -----
    # ordered mapping: new memory state -> input layer that receives it on the next tick
    memory_dict = OrderedDict()

    # --- window ---
    window_size = 3
    window_shape = (None, window_size) + tuple(observation_reshape.output_shape[1:])

    prev_window = InputLayer(window_shape, name="previous window state")
    window = WindowAugmentation(observation_reshape,
                                prev_window,
                                name="new window state")

    # pixel-wise maximum over the temporal window (to avoid flickering)
    window_max = ExpressionLayer(
        window,
        lambda frames: frames.max(axis=1),
        output_shape=(None,) + window.output_shape[2:],
    )

    memory_dict[window] = prev_window

    # --- stack ---
    stack_w = 4
    stack_h = 5

    # NOTE(review): both dense layers below carry the name "prev_stack", although
    # neither is the previous stack state - looks like a copy-paste leftover
    stack_inputs = DenseLayer(observation_reshape, stack_w, name="prev_stack")
    stack_controls = DenseLayer(observation_reshape,
                                3,
                                nonlinearity=lasagne.nonlinearities.softmax,
                                name="prev_stack")

    prev_stack = InputLayer((None, stack_h, stack_w),
                            name="previous stack state")
    stack = StackAugmentation(stack_inputs, prev_stack, stack_controls)
    memory_dict[stack] = prev_stack

    # index 0 along axis 1 of the stack
    stack_top = lasagne.layers.SliceLayer(stack, 0, 1)

    # --- RNN cell ---
    prev_rnn = InputLayer((None, 16), name="previous RNN state")
    new_rnn = RNNCell(prev_rnn, observation_reshape)
    memory_dict[new_rnn] = prev_rnn

    # --- GRU cell ---
    prev_gru = InputLayer((None, 16), name="previous GRUcell state")
    new_gru = GRUCell(prev_gru, observation_reshape)
    memory_dict[new_gru] = prev_gru

    # --- GRUMemoryLayer ---
    prev_gru1 = InputLayer((None, 15), name="previous GRUcell state")
    new_gru1 = GRUMemoryLayer(15, observation_reshape, prev_gru1)
    memory_dict[new_gru1] = prev_gru1

    # --- LSTM with peepholes ---
    prev_lstm0_cell = InputLayer(
        (None, 13),
        name="previous LSTMCell hidden state [with peepholes]")
    prev_lstm0_out = InputLayer(
        (None, 13),
        name="previous LSTMCell output state [with peepholes]")

    new_lstm0_cell, new_lstm0_out = LSTMCell(
        prev_lstm0_cell,
        prev_lstm0_out,
        input_or_inputs=observation_reshape,
        peepholes=True,
        name="newLSTM1 [with peepholes]")

    memory_dict[new_lstm0_cell] = prev_lstm0_cell
    memory_dict[new_lstm0_out] = prev_lstm0_out

    # --- LSTM without peepholes ---
    prev_lstm1_cell = InputLayer(
        (None, 14),
        name="previous LSTMCell hidden state [no peepholes]")
    prev_lstm1_out = InputLayer(
        (None, 14),
        name="previous LSTMCell output state [no peepholes]")

    new_lstm1_cell, new_lstm1_out = LSTMCell(
        prev_lstm1_cell,
        prev_lstm1_out,
        input_or_inputs=observation_reshape,
        peepholes=False,
        name="newLSTM1 [no peepholes]")

    memory_dict[new_lstm1_cell] = prev_lstm1_cell
    memory_dict[new_lstm1_out] = prev_lstm1_out

    # --- concat everything ---
    # debug output: shapes of the non-LSTM memory outputs
    for memory_output in (flatten(window_max), stack_top, new_rnn, new_gru, new_gru1):
        print(memory_output.output_shape)

    all_memory = concat([
        flatten(window_max),
        stack_top,
        new_rnn,
        new_gru,
        new_gru1,
        new_lstm0_out,
        new_lstm1_out,
    ])

    # ----- network body -----
    # a single dense layer on top of the concatenated memory
    hidden = DenseLayer(all_memory, num_units=50, name='dense0')

    # Q-values, one per action
    q_eval = DenseLayer(hidden,
                        num_units=n_actions,
                        nonlinearity=lasagne.nonlinearities.linear,
                        name="QEvaluator")

    resolver = EpsilonGreedyResolver(q_eval, epsilon=0.1, name="resolver")

    agent = Agent(observation_layer,
                  memory_dict,
                  q_eval,
                  resolver)

    # everything hangs off one lasagne graph, so its trainable params can be collected at once
    weights = lasagne.layers.get_all_params(resolver, trainable=True)

    # ----- one-step react function -----
    print('compiling react')
    applier_fun = agent.get_react_function()

    def step(observation, prev_memories='zeros', batch_size=n_parallel_games):
        """Run one agent tick; return (action, new memory states).

        Passing the string 'zeros' as prev_memories starts from all-zero
        float32 memory states, one per entry of agent.agent_states."""
        if prev_memories == 'zeros':
            prev_memories = []
            for state_layer in agent.agent_states:
                blank_shape = (batch_size,) + tuple(state_layer.output_shape[1:])
                prev_memories.append(np.zeros(blank_shape, dtype='float32'))

        outputs = applier_fun(np.array(observation), *prev_memories)
        return outputs[0], outputs[1:]

    # ----- pool of parallel game sessions -----
    pool = GamePool(game_title, n_parallel_games)

    _, action_log, _, _, _, _ = pool.interact(step, 50)

    # show what the agent did: first 3 rows, first 5 columns of the action log
    names = np.array(action_names)
    print(names[np.array(action_log)[:3, :5]])

    # ----- experience replay environment, all default parameters -----
    env = SessionPoolEnvironment(observations=observation_layer,
                                 actions=resolver,
                                 agent_memories=agent.agent_states)

    def update_pool(env, pool, n_steps=100):
        """Play n_steps fresh ticks and load them into env,
        discarding whatever sessions env held before."""
        # memory states as they were before the new interaction starts
        memories_before = list(pool.prev_memory_states)

        observations, actions, rewards, _, is_alive, _ = pool.interact(step, n_steps=n_steps)

        env.load_sessions(observations, actions, rewards, is_alive, memories_before)

    # load first sessions
    update_pool(env, pool, replay_seq_len)

    # ----- training via experience replay -----
    # replay the stored sessions through the agent; only the Q-values are used below
    _, _, _, _, q_values_sequence = agent.get_sessions(
        env,
        session_length=replay_seq_len,
        batch_size=env.batch_size,
        optimize_experience_replay=True,
    )

    # rewards are used unscaled
    scaled_reward_seq = env.rewards

    elwise_mse_loss = qlearning.get_elementwise_objective(
        q_values_sequence,
        env.actions[0],
        scaled_reward_seq,
        env.is_alive,
        gamma_or_gammas=0.99,
    )

    # compute mean over "alive" fragments
    mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

    # L2 penalty on the network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10 ** -4

    loss = mse_loss + reg_l2

    updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01)

    # mean session reward
    mean_session_reward = env.rewards.sum(axis=1).mean()

    # ----- compile train and evaluation functions -----
    print('compiling')
    train_fun = theano.function([], [loss, mean_session_reward], updates=updates)
    evaluation_fun = theano.function([], [loss, mse_loss, reg_l2, mean_session_reward])
    print("I've compiled!")

    # ----- training loop -----
    for epoch_counter in range(10):
        update_pool(env, pool, replay_seq_len)
        train_loss, train_reward = train_fun()
        full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun()

        print("epoch %i,loss %.5f, rewards: %.5f " % (
            epoch_counter, full_loss, avg_reward_current))
        print("rec %.3f reg %.3f" % (q_loss, l2_penalty))
Пример #13
0
def test_reasoning_value_based(n_parallel_games=25,
                               algo = qlearning,
                               n_steps=1
                  ):
    """Train a small recurrent Q-agent on the boolean reasoning experiment until it converges.

    The agent is an RNN cell followed by a linear Q-value layer and an
    epsilon-greedy resolver. Training runs for at most 5000 epochs; every
    100 epochs the greedy (epsilon = 0) mean session reward is measured and
    the test stops as soon as it exceeds 2.

    :param n_parallel_games: batch size passed to env.generate_new_data_batch on every epoch
    :param algo: training algorithm to use (module exposing get_elementwise_objective)
    :param n_steps: kept for signature compatibility; the body does not use it
        (NOTE(review): it is never forwarded to `algo` - confirm whether it should be)
    :raises ValueError: if the greedy reward never exceeds 2 within 5000 epochs
    """
    # instantiate an experiment environment with default parameters
    env = experiment.BooleanReasoningEnvironment()

    # hidden neurons
    n_hidden_neurons = 64

    observation_size = (None,) + tuple(env.observation_shapes)

    observation_layer = lasagne.layers.InputLayer(observation_size, name="observation_input")
    prev_state_layer = lasagne.layers.InputLayer([None, n_hidden_neurons], name="prev_state_input")

    # memory layer (this isn't the same as lasagne recurrent units)
    rnn = RNNCell(prev_state_layer, observation_layer, name="rnn0")

    # q_values (estimated using very simple neural network)
    q_values = lasagne.layers.DenseLayer(rnn,
                                         num_units=env.n_actions,
                                         nonlinearity=lasagne.nonlinearities.linear,
                                         name="QEvaluator")

    # resolver uses epsilon - parameter which defines a probability of randomly taken action.
    epsilon = theano.shared(np.float32(0.1), name="e-greedy.epsilon")
    resolver = EpsilonGreedyResolver(q_values, epsilon=epsilon, name="resolver")

    # packing this into agent
    agent = Agent(observation_layer,
                  agent_states={rnn:prev_state_layer},
                  policy_estimators=q_values,
                  action_layers=resolver)

    # Since it's a lasagne network, one can get its weights, output, etc
    weights = lasagne.layers.get_all_params(resolver,trainable=True)

    # produce interaction sequences of length <= 10
    (state_seq,), observation_seq, agent_state, action_seq, qvalues_seq = agent.get_sessions(
        env,
        session_length=10,
        batch_size=env.batch_size,
    )

    # not used below; the lookup is kept so the test still checks that the
    # returned agent state can be indexed by the memory layer
    hidden_seq = agent_state[rnn]

    # get rewards for all actions
    rewards_seq = env.get_reward_sequences(state_seq, action_seq)

    # get indicator whether session is still active
    is_alive_seq = env.get_whether_alive(observation_seq)

    # gamma - delayed reward coefficient - what fraction of reward is retained if it is obtained one tick later
    gamma = theano.shared(np.float32(0.99), name='q_learning_gamma')

    squarred_Qerror = algo.get_elementwise_objective(
        qvalues_seq,
        action_seq,
        rewards_seq,
        is_alive_seq,
        gamma_or_gammas=gamma)

    # take sum over steps, average over sessions
    mse_Qloss = squarred_Qerror.sum(axis=1).mean()

    # impose l2 regularization on network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10**-3

    loss = mse_Qloss + reg_l2

    # compute weight updates
    updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.1)

    # take sum over steps, average over sessions
    mean_session_reward = rewards_seq.sum(axis=1).mean()

    train_fun = theano.function([], [loss, mean_session_reward], updates=updates)

    compute_mean_session_reward = theano.function([], mean_session_reward)

    score_log = Metrics()

    for epoch in range(5000):

        # update resolver's epsilon (chance of random action instead of optimal one)
        # epsilon decreases over time
        current_epsilon = 0.05 + 0.95 * np.exp(-epoch / 2500.)
        resolver.epsilon.set_value(np.float32(current_epsilon))

        # train; the numeric results get their own names so that the symbolic
        # `loss` expression above is not rebound
        env.generate_new_data_batch(n_parallel_games)
        train_loss, avg_reward = train_fun()

        # show current learning progress
        if epoch % 100 == 0:
            # plain call: a trailing comma here would only build a throwaway tuple on Python 3
            print(epoch)

            # estimate reward for epsilon-greedy strategy
            avg_reward_current = compute_mean_session_reward()
            score_log["expected epsilon-greedy reward"][epoch] = avg_reward_current

            # estimating the reward under assumption of greedy strategy;
            # float32 keeps the dtype consistent with the decay update above
            # (epsilon is restored at the start of the next epoch)
            resolver.epsilon.set_value(np.float32(0))
            avg_reward_greedy = compute_mean_session_reward()
            score_log["expected greedy reward"][epoch] = avg_reward_greedy

            if avg_reward_greedy > 2:
                print("converged")
                break
    else:
        # the loop ran through all epochs without hitting the convergence break
        print("diverged")
        raise ValueError("Algorithm diverged")
            
Пример #14
0
    def __init__(
        self,
        observation_shape,
        n_actions,
        n_goals=32,
        metacontroller_period=5,
        window_size=3,
        embedding_size=128,
    ):
        """Build the goal-conditioned Q-network agent and compile its react function.

        The network takes an image observation and an integer goal. Images are
        average-pooled, kept in a window of `window_size` frames, max-reduced
        over the window and passed through three batch-normalised convolutions
        and a 256-unit dense layer (`self.dnn_output`). The goal is embedded
        and concatenated with that dense output before the linear Q-value layer.

        :param observation_shape: shape given to the image input layer
        :param n_actions: number of units in the Q-value layer
        :param n_goals: first size argument of the goal EmbeddingLayer
        :param metacontroller_period: stored on the instance; not used here
        :param window_size: number of frames kept in the window memory
        :param embedding_size: second size argument of the goal EmbeddingLayer
        """
        # plain configuration, stored as given
        self.observation_shape = observation_shape
        self.n_actions = n_actions
        self.n_goals = n_goals
        self.metacontroller_period = metacontroller_period
        self.window_size = window_size
        self.embedding_size = embedding_size

        # image observation at current tick goes here
        self.observation_layer = InputLayer(observation_shape,
                                            name="images input")

        # reorder axes to [batch, color, x, y] so that convolutional layers would work correctly
        frames = DimshuffleLayer(self.observation_layer, (0, 3, 1, 2))

        # 2x2 pooling in 'average_inc_pad' mode
        frames = lasagne.layers.Pool2DLayer(frames, (2, 2),
                                            mode='average_inc_pad')

        # window memory: previous-state input and its updated counterpart
        window_shape = (None, window_size) + tuple(frames.output_shape[1:])
        prev_window = InputLayer(window_shape, name="previous window state")
        window = WindowAugmentation(frames,
                                    prev_window,
                                    name="new window state")

        # pixel-wise maximum over the temporal window (to avoid flickering)
        window_max = ExpressionLayer(
            window,
            lambda stacked: stacked.max(axis=1),
            output_shape=(None, ) + window.output_shape[2:])

        memory_dict = {window: prev_window}

        # a simple convolutional stack: (filters, filter size, stride, name) per layer
        conv_specs = (
            (16, 8, (4, 4), 'cnn0'),
            (32, 4, (2, 2), 'cnn1'),
            (64, 4, (2, 2), 'cnn2'),
        )
        features = window_max
        for n_filters, filter_size, stride, layer_name in conv_specs:
            features = batch_norm(
                Conv2DLayer(features,
                            n_filters,
                            filter_size=filter_size,
                            stride=stride,
                            name=layer_name))

        #features = DropoutLayer(features,name = "dropout", p=0.05) #will get deterministic during evaluation
        self.dnn_output = DenseLayer(features, num_units=256, name='dense1')

        # integer goal input and its embedding
        self.goal_layer = InputLayer((None, ), T.ivector(), name='boss goal')
        self.goal_layer.output_dtype = 'int32'
        goal_emb = EmbeddingLayer(self.goal_layer, n_goals, embedding_size)

        goal_and_features = lasagne.layers.ConcatLayer(
            [goal_emb, self.dnn_output])

        # q_eval
        q_eval = DenseLayer(goal_and_features,
                            num_units=n_actions,
                            nonlinearity=lasagne.nonlinearities.linear,
                            name="QEvaluator")

        # resolver
        self.resolver = EpsilonGreedyResolver(q_eval, name="resolver")

        # all together; the dense features are passed next to the resolver in the last argument
        self.agent = Agent(
            [self.observation_layer, self.goal_layer],
            memory_dict,
            q_eval,
            [self.resolver, self.dnn_output],
        )

        self.applier_fun = self.agent.get_react_function()

        self.weights = lasagne.layers.get_all_params(self.resolver,
                                                     trainable=True)
Пример #15
0
                                                        #Replacing with gru1 or gru2 would mean taking one
                                   num_units = n_actions,
                                   nonlinearity=lasagne.nonlinearities.linear,name="QEvaluator")
# resolver: epsilon lives in a shared float32 variable (initialised to 0.0) so that
# it can be changed later without rebuilding the graph
epsilon = theano.shared(np.float32(0.0),"e-greedy.epsilon")

resolver = EpsilonGreedyResolver(q_eval,epsilon=epsilon,name="resolver")




from collections import OrderedDict
# all together: each new GRU state is paired with the input layer that holds its previous
# value; both q_eval and the concatenated memory are passed in the third argument
agent = Agent(_observation_layer,
              OrderedDict([
                    (gru1,_prev_gru1_layer),
                    (gru2,_prev_gru2_layer)
              ]),
              [q_eval,concatenated_memory],resolver)





## load weights from snapshot

# local file the snapshot is looked for at, and the remote copy of the same snapshot
snapshot_path ="./demo_stand.qlearning_3_step.epoch60000.pcl"
snapshot_url = "https://www.dropbox.com/s/vz4hz5tpm0u2zkw/demo_stand.qlearning_3_step.epoch60000.pcl?dl=1"

# `load` is not used in the lines visible here (presumably it reads the snapshot further down)
from agentnet.utils import load
if not os.path.isfile(snapshot_path):
    # parenthesised call: valid on Python 2 and Python 3 alike
    # (the bare `print "..."` statement is a SyntaxError on Python 3)
    print("loading snapshot...")
Пример #16
0
# one action per entry of feature_names
n_actions = len(feature_names)
# linear Q-value layer with one output unit per action
q_eval = lasagne.layers.DenseLayer(
    concatenated_memory,  # taking both memories.
    # Replacing with gru1 or gru2 would mean taking one
    num_units=n_actions,
    nonlinearity=lasagne.nonlinearities.linear,
    name="QEvaluator")
# resolver: epsilon lives in a shared float32 variable (initialised to 0.0) so that
# it can be changed later without rebuilding the graph
epsilon = theano.shared(np.float32(0.0), "e-greedy.epsilon")

resolver = EpsilonGreedyResolver(q_eval, epsilon=epsilon, name="resolver")

from collections import OrderedDict
# all together: each new GRU state is paired with the input layer that holds its previous
# value; both q_eval and the concatenated memory are passed in the third argument
agent = Agent(
    _observation_layer,
    OrderedDict([(gru1, _prev_gru1_layer), (gru2, _prev_gru2_layer)]),
    [q_eval, concatenated_memory], resolver)

## load weights from snapshot

# local file the snapshot is looked for at, and the URL it is downloaded from when missing
snapshot_path = "./demo_stand.qlearning_3_step.epoch60000.pcl"
snapshot_url = "https://www.dropbox.com/s/vz4hz5tpm0u2zkw/demo_stand.qlearning_3_step.epoch60000.pcl?dl=1"

# `load` is not used in the lines visible here (presumably it reads the snapshot further down)
from agentnet.utils import load
if not os.path.isfile(snapshot_path):
    # parenthesised call: valid on Python 2 and Python 3 alike. The bare
    # `print "..."` statement is a SyntaxError on Python 3, which made the
    # version branch below unreachable there.
    print("loading snapshot...")
    # urlretrieve lives in a different module on Python 2 and Python 3
    if sys.version_info[0] == 2:
        from urllib import urlretrieve
    else:
        from urllib.request import urlretrieve
    urlretrieve(snapshot_url, snapshot_path)