class MetaController:
    def __init__(self, controller, gru0_size=128):
        # image observation at current tick goes here
        self.observed_state = InputLayer(controller.dnn_output.output_shape, name="cnn output")

        prev_gru0 = InputLayer((None, gru0_size), name='prev gru0')
        self.gru0 = GRUCell(prev_state=prev_gru0, input_or_inputs=self.observed_state)

        memory_dict = {self.gru0: prev_gru0}

        # q_eval
        q_eval = DenseLayer(self.gru0,
                            num_units=controller.n_goals,
                            nonlinearity=lasagne.nonlinearities.linear,
                            name="QEvaluator")
        # resolver
        self.resolver = EpsilonGreedyResolver(q_eval, name="resolver")

        # all together
        self.agent = Agent(self.observed_state, memory_dict, q_eval, [self.resolver, q_eval])

        self.controller = controller
        self.observation_shape = controller.dnn_output.output_shape
        self.n_goals = controller.n_goals
        self.period = controller.metacontroller_period

        self.applier_fun = self.agent.get_react_function()

        self.weights = lasagne.layers.get_all_params(self.resolver, trainable=True)

    def step(self, observation, prev_memories, batch_size):
        """Returns actions and new memory states given an observation and the previous state.
        In this setup the previous state should be [prev gru0 state,]."""
        # default to zeros
        if prev_memories == 'zeros':
            prev_memories = [np.zeros((batch_size,) + tuple(mem.output_shape[1:]), dtype='float32')
                             for mem in self.agent.agent_states]

        res = self.applier_fun(np.array(observation), *prev_memories)
        action, q_eval = res[:2]
        memories = res[2:]
        return action, memories, q_eval.max(axis=-1)

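# Usage sketch (illustrative, not part of the original code): one way the two
# levels of the hierarchy could be wired together. It assumes a surrounding
# Controller class like the goal-conditioned __init__ shown further below, and
# that `cnn_features` holds the controller's dnn_output activations for the
# current batch of frames; the class name, shapes and variable names here are
# assumptions, not the library's API.
#
# controller = Controller(observation_shape=(None, 210, 160, 3), n_actions=6)
# meta = MetaController(controller, gru0_size=128)
#
# goal, meta_memories, goal_value = meta.step(cnn_features,
#                                             prev_memories='zeros',
#                                             batch_size=n_parallel_games)
# # the controller then acts for meta.period ticks conditioned on `goal`
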
def build_model(self):
    # reshape to [batch, color, x, y] to allow for convolution layers to work correctly
    observation_reshape = DimshuffleLayer(self.observation_layer, (0, 3, 1, 2))
    observation_reshape = Pool2DLayer(observation_reshape, pool_size=(2, 2))

    # memory
    window_size = 5

    # prev state input
    prev_window = InputLayer((None, window_size) + tuple(observation_reshape.output_shape[1:]),
                             name="previous window state")

    # our window
    memory_layer = WindowAugmentation(observation_reshape, prev_window, name="new window state")

    memory_dict = {memory_layer: prev_window}

    # pixel-wise maximum over the temporal window (to avoid flickering)
    memory_layer = ExpressionLayer(memory_layer,
                                   lambda a: a.max(axis=1),
                                   output_shape=(None,) + memory_layer.output_shape[2:])

    # neural network body
    nn = batch_norm(lasagne.layers.Conv2DLayer(memory_layer, num_filters=16, filter_size=(8, 8), stride=(4, 4)))
    nn = batch_norm(lasagne.layers.Conv2DLayer(nn, num_filters=32, filter_size=(4, 4), stride=(2, 2)))
    nn = batch_norm(lasagne.layers.DenseLayer(nn, num_units=256))

    # q_eval
    policy_layer = DenseLayer(nn,
                              num_units=self.n_actions,
                              nonlinearity=lasagne.nonlinearities.linear,
                              name="QEvaluator")
    # resolver
    resolver = EpsilonGreedyResolver(policy_layer, name="resolver")

    # all together
    agent = Agent(self.observation_layer, memory_dict, policy_layer, resolver)

    return resolver, agent

def build_agent(action_shape, state_shape):
    observation_layer = InputLayer((None, *state_shape))

    net = DenseLayer(observation_layer, 10,
                     nonlinearity=lasagne.nonlinearities.sigmoid,
                     name='dense1')
    # net = DenseLayer(net, 256, name='dense2')

    # a layer that predicts Q-values
    policy_layer_flattened = DenseLayer(net,
                                        num_units=np.prod(action_shape),
                                        nonlinearity=lasagne.nonlinearities.softmax,
                                        name="q-evaluator layer")
    policy_layer = ReshapeLayer(policy_layer_flattened, ([0], *action_shape))

    V_layer = DenseLayer(net, 1, nonlinearity=None, name="state values")

    # pick actions at random, proportionally to the probabilities
    action_layer = MultiProbabilisticResolver(policy_layer,
                                              name="e-greedy action picker",
                                              assume_normalized=True)

    # print("ActionL: ", action_layer.output_shape)
    # action_layer = ActionEncoder(action_layer, base=3)
    # print("ActionL': ", action_layer.output_shape)
    # action_layer = T.printing.Print("A")(action_layer)

    # all together
    agent = Agent(observation_layers=observation_layer,
                  policy_estimators=(policy_layer_flattened, V_layer),
                  action_layers=action_layer)

    return agent, action_layer, V_layer

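# A minimal usage sketch for build_agent (the shapes are assumptions, not taken
# from the original code): build the agent, compile its one-step react function
# the same way the other snippets do, and collect the trainable weights.
#
# agent, action_layer, V_layer = build_agent(action_shape=(2, 2), state_shape=(8,))
# react = agent.get_react_function()
# weights = lasagne.layers.get_all_params([action_layer, V_layer], trainable=True)
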
print(get_output_shape(qvalues_layer))

# baseline for all qvalues
# qvalues_layer = DenseLayer(dense, n_actions, nonlinearity=None, name='qval')

# pick actions epsilon-greedily over the Q-values
from agentnet.resolver import EpsilonGreedyResolver
action_layer = EpsilonGreedyResolver(qvalues_layer)

from agentnet.target_network import TargetNetwork
targetnet = TargetNetwork(qvalues_layer)
qvalues_old = targetnet.output_layers

from agentnet.agent import Agent

# all together
agent = Agent(observation_layers=observation_layer,
              policy_estimators=(qvalues_layer, qvalues_old),
              # agent_states={conv_rec: in_conv_rec},
              action_layers=action_layer)

# Since it's a single lasagne network, one can get its weights, output, etc.
weights = lasagne.layers.get_all_params(qvalues_layer, trainable=True)
weights

from agentnet.experiments.openai_gym.pool import EnvPool
pool = EnvPool(agent, make_env, N_AGENTS)

## %%time
## interact for 10 ticks
# _, action_log, reward_log, _, _, _ = pool.interact(10)

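# A hedged follow-up to the commented interaction above (the reward summary
# assumes reward_log comes back as a [n_agents, n_steps] array, as the
# sum(axis=1) pattern elsewhere in these snippets suggests):
#
# _, action_log, reward_log, _, _, _ = pool.interact(10)
# print("mean 10-step reward per agent:", reward_log.sum(axis=1).mean())
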
def test_space_invaders(game_title='SpaceInvaders-v0',
                        n_parallel_games=3,
                        replay_seq_len=2):
    """
    :param game_title: name of atari game in Gym
    :param n_parallel_games: how many games we run in parallel
    :param replay_seq_len: how long is one replay session from a batch
    """
    atari = gym.make(game_title)
    atari.reset()

    # Game Parameters
    n_actions = atari.action_space.n
    observation_shape = (None,) + atari.observation_space.shape
    action_names = atari.get_action_meanings()
    del atari

    # ##### Agent observations

    # image observation at current tick goes here
    observation_layer = InputLayer(observation_shape, name="images input")

    # reshape to [batch, color, x, y] to allow for convolutional layers to work correctly
    observation_reshape = DimshuffleLayer(observation_layer, (0, 3, 1, 2))

    # Agent memory states
    window_size = 3

    # prev state input
    prev_window = InputLayer((None, window_size) + tuple(observation_reshape.output_shape[1:]),
                             name="previous window state")

    # our window
    window = WindowAugmentation(observation_reshape, prev_window, name="new window state")

    memory_dict = {window: prev_window}

    # ##### Neural network body
    # you may use any other lasagne layers, including convolutions, batch_norms, maxout, etc.

    # pixel-wise maximum over the temporal window (to avoid flickering)
    window_max = ExpressionLayer(window,
                                 lambda a: a.max(axis=1),
                                 output_shape=(None,) + window.output_shape[2:])

    # a simple lasagne network (try replacing with any other lasagne network and see what works best)
    nn = DenseLayer(window_max, num_units=50, name='dense0')

    # Agent policy and action picking
    q_eval = DenseLayer(nn,
                        num_units=n_actions,
                        nonlinearity=lasagne.nonlinearities.linear,
                        name="QEvaluator")

    # fakes for a2c
    policy_eval = DenseLayer(nn,
                             num_units=n_actions,
                             nonlinearity=lasagne.nonlinearities.softmax,
                             name="a2c action probas")
    state_value_eval = DenseLayer(nn,
                                  num_units=1,
                                  nonlinearity=None,
                                  name="a2c state values")

    # resolver
    resolver = ProbabilisticResolver(policy_eval, name="resolver")

    # agent
    agent = Agent(observation_layer,
                  memory_dict,
                  (q_eval, policy_eval, state_value_eval),
                  resolver)

    # Since it's a single lasagne network, one can get its weights, output, etc.
    weights = lasagne.layers.get_all_params(resolver, trainable=True)

    # Agent step function
    print('compiling react')
    applier_fun = agent.get_react_function()

    # a nice pythonic interface
    def step(observation, prev_memories='zeros', batch_size=n_parallel_games):
        """Returns actions and new states given observation and previous state.
        Prev state in default setup should be [prev window,]."""
        # default to zeros
        if prev_memories == 'zeros':
            prev_memories = [np.zeros((batch_size,) + tuple(mem.output_shape[1:]), dtype='float32')
                             for mem in agent.agent_states]
        res = applier_fun(np.array(observation), *prev_memories)
        action = res[0]
        memories = res[1:]
        return action, memories

    # # Create and manage a pool of atari sessions to play with
    pool = GamePool(game_title, n_parallel_games)

    observation_log, action_log, reward_log, _, _, _ = pool.interact(step, 50)

    print(np.array(action_names)[np.array(action_log)[:3, :5]])

    # # experience replay pool

    # Create an environment with all default parameters
    env = SessionPoolEnvironment(observations=observation_layer,
                                 actions=resolver,
                                 agent_memories=agent.agent_states)

    def update_pool(env, pool, n_steps=100):
        """A function that creates new sessions and adds them into the pool,
        throwing the old ones away entirely for simplicity."""
        preceding_memory_states = list(pool.prev_memory_states)

        # get interaction sessions
        observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = pool.interact(step, n_steps=n_steps)

        # load them into experience replay environment
        env.load_sessions(observation_tensor, action_tensor, reward_tensor,
                          is_alive_tensor, preceding_memory_states)

    # load first sessions
    update_pool(env, pool, replay_seq_len)

    # A more sophisticated way of training is to store a large pool of sessions and train on random batches of them.

    # ### Training via experience replay

    # get agent's Q-values, policy, etc. obtained via experience replay
    _env_states, _observations, _memories, _imagined_actions, estimators = agent.get_sessions(
        env,
        session_length=replay_seq_len,
        batch_size=env.batch_size,
        optimize_experience_replay=True,
    )
    (q_values_sequence, policy_sequence, value_sequence) = estimators

    # Evaluating loss function
    scaled_reward_seq = env.rewards
    # For SpaceInvaders, however, not scaling rewards is at least working

    elwise_mse_loss = 0.

    # 1-step algos
    for algo in (qlearning, sarsa):
        elwise_mse_loss += algo.get_elementwise_objective(q_values_sequence,
                                                          env.actions[0],
                                                          scaled_reward_seq,
                                                          env.is_alive,
                                                          gamma_or_gammas=0.99)

    # qlearning_n_step
    for n in (1, 3, replay_seq_len - 1, replay_seq_len, replay_seq_len + 1, None):
        elwise_mse_loss += qlearning_n_step.get_elementwise_objective(q_values_sequence,
                                                                      env.actions[0],
                                                                      scaled_reward_seq,
                                                                      env.is_alive,
                                                                      gamma_or_gammas=0.99,
                                                                      n_steps=n)

    # a2c n_step
    elwise_mse_loss += a2c_n_step.get_elementwise_objective(policy_sequence,
                                                            value_sequence[:, :, 0],
                                                            env.actions[0],
                                                            scaled_reward_seq,
                                                            env.is_alive,
                                                            gamma_or_gammas=0.99,
                                                            n_steps=3)

    # compute mean over "alive" fragments
    mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

    # regularize network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10**-4

    loss = mse_loss + reg_l2

    # Compute weight updates
    updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01)

    # mean session reward
    mean_session_reward = env.rewards.sum(axis=1).mean()

    # # Compile train and evaluation functions
    print('compiling')
    train_fun = theano.function([], [loss, mean_session_reward], updates=updates)
    evaluation_fun = theano.function([], [loss, mse_loss, reg_l2, mean_session_reward])
    print("I've compiled!")

    # # Training loop
    for epoch_counter in range(10):
        update_pool(env, pool, replay_seq_len)
        loss, avg_reward = train_fun()

        full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun()

        print("epoch %i, loss %.5f, rewards: %.5f" % (epoch_counter, full_loss, avg_reward_current))
        print("rec %.3f reg %.3f" % (q_loss, l2_penalty))

def test_memory(game_title='SpaceInvaders-v0',
                n_parallel_games=3,
                replay_seq_len=2):
    """
    :param game_title: name of atari game in Gym
    :param n_parallel_games: how many games we run in parallel
    :param replay_seq_len: how long is one replay session from a batch
    """
    atari = gym.make(game_title)
    atari.reset()

    # Game Parameters
    n_actions = atari.action_space.n
    observation_shape = (None,) + atari.observation_space.shape
    action_names = atari.get_action_meanings()
    del atari

    # ##### Agent observations

    # image observation at current tick goes here
    observation_layer = InputLayer(observation_shape, name="images input")

    # reshape to [batch, color, x, y] to allow for convolutional layers to work correctly
    observation_reshape = DimshuffleLayer(observation_layer, (0, 3, 1, 2))

    # Agent memory states
    memory_dict = OrderedDict([])

    ### Window
    window_size = 3

    # prev state input
    prev_window = InputLayer((None, window_size) + tuple(observation_reshape.output_shape[1:]),
                             name="previous window state")

    # our window
    window = WindowAugmentation(observation_reshape, prev_window, name="new window state")

    # pixel-wise maximum over the temporal window (to avoid flickering)
    window_max = ExpressionLayer(window,
                                 lambda a: a.max(axis=1),
                                 output_shape=(None,) + window.output_shape[2:])

    memory_dict[window] = prev_window

    ### Stack
    # prev stack
    stack_w, stack_h = 4, 5
    stack_inputs = DenseLayer(observation_reshape, stack_w, name="prev_stack")
    stack_controls = DenseLayer(observation_reshape, 3,
                                nonlinearity=lasagne.nonlinearities.softmax,
                                name="prev_stack")
    prev_stack = InputLayer((None, stack_h, stack_w), name="previous stack state")
    stack = StackAugmentation(stack_inputs, prev_stack, stack_controls)
    memory_dict[stack] = prev_stack

    stack_top = lasagne.layers.SliceLayer(stack, 0, 1)

    ### RNN preset
    prev_rnn = InputLayer((None, 16), name="previous RNN state")
    new_rnn = RNNCell(prev_rnn, observation_reshape)
    memory_dict[new_rnn] = prev_rnn

    ### GRU preset
    prev_gru = InputLayer((None, 16), name="previous GRUcell state")
    new_gru = GRUCell(prev_gru, observation_reshape)
    memory_dict[new_gru] = prev_gru

    ### GRUmemorylayer
    prev_gru1 = InputLayer((None, 15), name="previous GRUcell state")
    new_gru1 = GRUMemoryLayer(15, observation_reshape, prev_gru1)
    memory_dict[new_gru1] = prev_gru1

    # LSTM with peepholes
    prev_lstm0_cell = InputLayer((None, 13),
                                 name="previous LSTMCell hidden state [with peepholes]")
    prev_lstm0_out = InputLayer((None, 13),
                                name="previous LSTMCell output state [with peepholes]")
    new_lstm0_cell, new_lstm0_out = LSTMCell(prev_lstm0_cell,
                                             prev_lstm0_out,
                                             input_or_inputs=observation_reshape,
                                             peepholes=True,
                                             name="newLSTM1 [with peepholes]")
    memory_dict[new_lstm0_cell] = prev_lstm0_cell
    memory_dict[new_lstm0_out] = prev_lstm0_out

    # LSTM without peepholes
    prev_lstm1_cell = InputLayer((None, 14),
                                 name="previous LSTMCell hidden state [no peepholes]")
    prev_lstm1_out = InputLayer((None, 14),
                                name="previous LSTMCell output state [no peepholes]")
    new_lstm1_cell, new_lstm1_out = LSTMCell(prev_lstm1_cell,
                                             prev_lstm1_out,
                                             input_or_inputs=observation_reshape,
                                             peepholes=False,
                                             name="newLSTM1 [no peepholes]")
    memory_dict[new_lstm1_cell] = prev_lstm1_cell
    memory_dict[new_lstm1_out] = prev_lstm1_out

    ## concat everything
    for i in [flatten(window_max), stack_top, new_rnn, new_gru, new_gru1]:
        print(i.output_shape)

    all_memory = concat([flatten(window_max),
                         stack_top,
                         new_rnn,
                         new_gru,
                         new_gru1,
                         new_lstm0_out,
                         new_lstm1_out])

    # ##### Neural network body
    # you may use any other lasagne layers, including convolutions, batch_norms, maxout, etc.
    # a simple lasagne network (try replacing with any other lasagne network and see what works best)
    nn = DenseLayer(all_memory, num_units=50, name='dense0')

    # Agent policy and action picking
    q_eval = DenseLayer(nn,
                        num_units=n_actions,
                        nonlinearity=lasagne.nonlinearities.linear,
                        name="QEvaluator")

    # resolver
    resolver = EpsilonGreedyResolver(q_eval, epsilon=0.1, name="resolver")

    # agent
    agent = Agent(observation_layer, memory_dict, q_eval, resolver)

    # Since it's a single lasagne network, one can get its weights, output, etc.
    weights = lasagne.layers.get_all_params(resolver, trainable=True)

    # Agent step function
    print('compiling react')
    applier_fun = agent.get_react_function()

    # a nice pythonic interface
    def step(observation, prev_memories='zeros', batch_size=n_parallel_games):
        """Returns actions and new states given observation and previous state.
        Prev state in default setup should be [prev window,]."""
        # default to zeros
        if prev_memories == 'zeros':
            prev_memories = [np.zeros((batch_size,) + tuple(mem.output_shape[1:]), dtype='float32')
                             for mem in agent.agent_states]
        res = applier_fun(np.array(observation), *prev_memories)
        action = res[0]
        memories = res[1:]
        return action, memories

    # # Create and manage a pool of atari sessions to play with
    pool = GamePool(game_title, n_parallel_games)

    observation_log, action_log, reward_log, _, _, _ = pool.interact(step, 50)

    print(np.array(action_names)[np.array(action_log)[:3, :5]])

    # # experience replay pool

    # Create an environment with all default parameters
    env = SessionPoolEnvironment(observations=observation_layer,
                                 actions=resolver,
                                 agent_memories=agent.agent_states)

    def update_pool(env, pool, n_steps=100):
        """A function that creates new sessions and adds them into the pool,
        throwing the old ones away entirely for simplicity."""
        preceding_memory_states = list(pool.prev_memory_states)

        # get interaction sessions
        observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = pool.interact(step, n_steps=n_steps)

        # load them into experience replay environment
        env.load_sessions(observation_tensor, action_tensor, reward_tensor,
                          is_alive_tensor, preceding_memory_states)

    # load first sessions
    update_pool(env, pool, replay_seq_len)

    # A more sophisticated way of training is to store a large pool of sessions and train on random batches of them.
    # ### Training via experience replay

    # get agent's Q-values obtained via experience replay
    _env_states, _observations, _memories, _imagined_actions, q_values_sequence = agent.get_sessions(
        env,
        session_length=replay_seq_len,
        batch_size=env.batch_size,
        optimize_experience_replay=True,
    )

    # Evaluating loss function
    scaled_reward_seq = env.rewards
    # For SpaceInvaders, however, not scaling rewards is at least working

    elwise_mse_loss = qlearning.get_elementwise_objective(q_values_sequence,
                                                          env.actions[0],
                                                          scaled_reward_seq,
                                                          env.is_alive,
                                                          gamma_or_gammas=0.99)

    # compute mean over "alive" fragments
    mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

    # regularize network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10**-4

    loss = mse_loss + reg_l2

    # Compute weight updates
    updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01)

    # mean session reward
    mean_session_reward = env.rewards.sum(axis=1).mean()

    # # Compile train and evaluation functions
    print('compiling')
    train_fun = theano.function([], [loss, mean_session_reward], updates=updates)
    evaluation_fun = theano.function([], [loss, mse_loss, reg_l2, mean_session_reward])
    print("I've compiled!")

    # # Training loop
    for epoch_counter in range(10):
        update_pool(env, pool, replay_seq_len)
        loss, avg_reward = train_fun()

        full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun()

        print("epoch %i, loss %.5f, rewards: %.5f" % (epoch_counter, full_loss, avg_reward_current))
        print("rec %.3f reg %.3f" % (q_loss, l2_penalty))

# we need to define a new input map because concatenated_memory is a ConcatLayer and does not have a default one
def custom_input_map(last_hidden, observation):
    """just a function that maps memory states to respective inputs"""
    return {
        _prev_gru1_layer: last_hidden[:, 0:n_hid_1],
        _prev_gru2_layer: last_hidden[:, n_hid_1:n_hid_1 + n_hid_2],
        _observation_layer: observation
    }

# all together
agent = Agent(concatenated_memory, q_eval, resolver, input_map=custom_input_map)

## load weights from snapshot
snapshot_path = "./demo_stand.qlearning_3_step.epoch60000.pcl"
snapshot_url = "https://www.dropbox.com/s/vz4hz5tpm0u2zkw/demo_stand.qlearning_3_step.epoch60000.pcl?dl=1"

from agentnet.utils import load

if not os.path.isfile(snapshot_path):
    print("loading snapshot...")
    if sys.version_info[0] == 2:
        from urllib import urlretrieve
    else:
        from urllib.request import urlretrieve
    urlretrieve(snapshot_url, snapshot_path)

def __init__(self,
             observation_shape,
             n_actions,
             n_goals=32,
             metacontroller_period=5,
             window_size=3,
             embedding_size=128):
    # image observation at current tick goes here
    self.observation_layer = InputLayer(observation_shape, name="images input")

    # reshape to [batch, color, x, y] to allow for convolutional layers to work correctly
    observation_reshape = DimshuffleLayer(self.observation_layer, (0, 3, 1, 2))
    observation_reshape = lasagne.layers.Pool2DLayer(observation_reshape, (2, 2), mode='average_inc_pad')

    # prev state input
    prev_window = InputLayer((None, window_size) + tuple(observation_reshape.output_shape[1:]),
                             name="previous window state")

    # our window
    window = WindowAugmentation(observation_reshape, prev_window, name="new window state")

    # pixel-wise maximum over the temporal window (to avoid flickering)
    window_max = ExpressionLayer(window,
                                 lambda a: a.max(axis=1),
                                 output_shape=(None,) + window.output_shape[2:])

    memory_dict = {window: prev_window}

    # a simple lasagne network (try replacing with any other lasagne network and see what works best)
    nn = batch_norm(Conv2DLayer(window_max, 16, filter_size=8, stride=(4, 4), name='cnn0'))
    nn = batch_norm(Conv2DLayer(nn, 32, filter_size=4, stride=(2, 2), name='cnn1'))
    nn = batch_norm(Conv2DLayer(nn, 64, filter_size=4, stride=(2, 2), name='cnn2'))

    # nn = DropoutLayer(nn, name="dropout", p=0.05)  # will get deterministic during evaluation

    self.dnn_output = nn = DenseLayer(nn, num_units=256, name='dense1')

    self.goal_layer = InputLayer((None,), T.ivector(), name='boss goal')
    self.goal_layer.output_dtype = 'int32'

    goal_emb = EmbeddingLayer(self.goal_layer, n_goals, embedding_size)

    nn = lasagne.layers.ConcatLayer([goal_emb, nn])

    # q_eval
    q_eval = DenseLayer(nn,
                        num_units=n_actions,
                        nonlinearity=lasagne.nonlinearities.linear,
                        name="QEvaluator")

    # resolver
    self.resolver = EpsilonGreedyResolver(q_eval, name="resolver")

    # all together
    self.agent = Agent([self.observation_layer, self.goal_layer],
                       memory_dict,
                       q_eval,
                       [self.resolver, self.dnn_output])

    self.observation_shape = observation_shape
    self.n_actions = n_actions
    self.n_goals = n_goals
    self.metacontroller_period = metacontroller_period
    self.window_size = window_size
    self.embedding_size = embedding_size

    self.applier_fun = self.agent.get_react_function()

    self.weights = lasagne.layers.get_all_params(self.resolver, trainable=True)

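# Illustrative sketch (assumptions, not from the original code): constructing
# the goal-conditioned controller and calling its compiled react function. The
# class name `Controller`, and the argument/return ordering (observations and
# goals in the order the Agent's observation layers were declared, followed by
# memory states) are assumptions about this setup, not documented API.
#
# controller = Controller(observation_shape=(None, 210, 160, 3), n_actions=6)
# action, dnn_features, new_window = controller.applier_fun(observations,
#                                                           goals,
#                                                           prev_window_state)
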
def test_reasoning_value_based(n_parallel_games=25, algo=qlearning, n_steps=1):
    """
    :param n_parallel_games: how many games we run in parallel
    :param algo: training algorithm to use (module)
    """
    # instantiate an experiment environment with default parameters
    env = experiment.BooleanReasoningEnvironment()

    # hidden neurons
    n_hidden_neurons = 64

    observation_size = (None,) + tuple(env.observation_shapes)

    observation_layer = lasagne.layers.InputLayer(observation_size, name="observation_input")
    prev_state_layer = lasagne.layers.InputLayer([None, n_hidden_neurons], name="prev_state_input")

    # memory layer (this isn't the same as lasagne recurrent units)
    rnn = RNNCell(prev_state_layer, observation_layer, name="rnn0")

    # q_values (estimated using a very simple neural network)
    q_values = lasagne.layers.DenseLayer(rnn,
                                         num_units=env.n_actions,
                                         nonlinearity=lasagne.nonlinearities.linear,
                                         name="QEvaluator")

    # the resolver uses epsilon - a parameter which defines the probability of taking a random action
    epsilon = theano.shared(np.float32(0.1), name="e-greedy.epsilon")
    resolver = EpsilonGreedyResolver(q_values, epsilon=epsilon, name="resolver")

    # packing this into an agent
    agent = Agent(observation_layer,
                  agent_states={rnn: prev_state_layer},
                  policy_estimators=q_values,
                  action_layers=resolver)

    # Since it's a lasagne network, one can get its weights, output, etc.
    weights = lasagne.layers.get_all_params(resolver, trainable=True)

    # produce interaction sequences of length <= 10
    (state_seq,), observation_seq, agent_state, action_seq, qvalues_seq = agent.get_sessions(
        env,
        session_length=10,
        batch_size=env.batch_size,
    )
    hidden_seq = agent_state[rnn]

    # get rewards for all actions
    rewards_seq = env.get_reward_sequences(state_seq, action_seq)

    # get indicator whether session is still active
    is_alive_seq = env.get_whether_alive(observation_seq)

    # gamma - delayed reward coefficient - what fraction of reward is retained if it is obtained one tick later
    gamma = theano.shared(np.float32(0.99), name='q_learning_gamma')

    squared_Qerror = algo.get_elementwise_objective(qvalues_seq,
                                                    action_seq,
                                                    rewards_seq,
                                                    is_alive_seq,
                                                    gamma_or_gammas=gamma)

    # take sum over steps, average over sessions
    mse_Qloss = squared_Qerror.sum(axis=1).mean()

    # impose l2 regularization on network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10**-3

    loss = mse_Qloss + reg_l2

    # compute weight updates
    updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.1)

    # take sum over steps, average over sessions
    mean_session_reward = rewards_seq.sum(axis=1).mean()

    train_fun = theano.function([], [loss, mean_session_reward], updates=updates)
    compute_mean_session_reward = theano.function([], mean_session_reward)

    score_log = Metrics()

    for epoch in range(5000):
        # update resolver's epsilon (chance of a random action instead of the optimal one)
        # epsilon decreases over time
        current_epsilon = 0.05 + 0.95 * np.exp(-epoch / 2500.)
        resolver.epsilon.set_value(np.float32(current_epsilon))

        # train
        env.generate_new_data_batch(n_parallel_games)
        loss, avg_reward = train_fun()

        # show current learning progress
        if epoch % 100 == 0:
            print(epoch)

            # estimate reward for the epsilon-greedy strategy
            avg_reward_current = compute_mean_session_reward()
            score_log["expected epsilon-greedy reward"][epoch] = avg_reward_current

            # estimate the reward under the assumption of a greedy strategy
            resolver.epsilon.set_value(0)
            avg_reward_greedy = compute_mean_session_reward()
            score_log["expected greedy reward"][epoch] = avg_reward_greedy

            if avg_reward_greedy > 2:
                print("converged")
                break
    else:
        print("diverged")
        raise ValueError("Algorithm diverged")

n_actions = len(feature_names)

q_eval = lasagne.layers.DenseLayer(
    concatenated_memory,  # taking both memories.
    # Replacing with gru1 or gru2 would mean taking one.
    num_units=n_actions,
    nonlinearity=lasagne.nonlinearities.linear,
    name="QEvaluator")

# resolver
epsilon = theano.shared(np.float32(0.0), "e-greedy.epsilon")
resolver = EpsilonGreedyResolver(q_eval, epsilon=epsilon, name="resolver")

from collections import OrderedDict

# all together
agent = Agent(_observation_layer,
              OrderedDict([(gru1, _prev_gru1_layer),
                           (gru2, _prev_gru2_layer)]),
              [q_eval, concatenated_memory],
              resolver)

## load weights from snapshot
snapshot_path = "./demo_stand.qlearning_3_step.epoch60000.pcl"
snapshot_url = "https://www.dropbox.com/s/vz4hz5tpm0u2zkw/demo_stand.qlearning_3_step.epoch60000.pcl?dl=1"

from agentnet.utils import load

if not os.path.isfile(snapshot_path):
    print("loading snapshot...")
    if sys.version_info[0] == 2:
        from urllib import urlretrieve
    else:
        from urllib.request import urlretrieve
    urlretrieve(snapshot_url, snapshot_path)

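# The snippet imports `load` from agentnet.utils but stops short of applying
# the downloaded snapshot. A hedged follow-up, assuming the persistence helper
# takes a (layer, filename) pair; verify the exact signature against your
# agentnet version before relying on it:
#
# load(resolver, snapshot_path)  # restore all parameters below `resolver`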