def __init__(self, state_shape, num_actions, epsilon=1.0, epsilon_min=0.1,
             epsilon_iter=100000, discount=0.99, lrate=1e-3, batch_size=100,
             q_update_iter=1000, capacity=50000):

    # Validate each argument independently; an elif chain would stop checking
    # as soon as one branch is taken (e.g. a valid epsilon_min would skip the
    # capacity check).
    if not isinstance(state_shape, tuple):
        raise AssertionError('state_shape must be of type <tuple>.')
    if len(state_shape) == 0:
        raise AssertionError('No state space dimensions provided.')
    if num_actions == 0:
        raise ValueError('Number of actions must be > 0.')
    if epsilon_min is not None:
        assert epsilon_min < epsilon, 'Epsilon(min) must be < epsilon(max).'
    if capacity < batch_size:
        raise ValueError('Replay capacity must be > batch_size.')

    self.state_shape = state_shape
    self.num_actions = num_actions
    self.q_network = build_network(state_shape, num_actions)
    self.q_targets = build_network(state_shape, num_actions)
    self.epsilon = epsilon
    self.epsilon_max = epsilon  # How greedy the policy is
    self.epsilon_min = epsilon_min
    self.epsilon_iter = float(epsilon_iter)
    self.discount = discount
    self.lr = lrate
    self.batch_size = batch_size  # How many samples to draw from buffer
    self.q_update_iter = q_update_iter  # Update the q_target every C iter
    self.step = 0
    self.replay_buffer = ReplayBuffer(capacity, state_shape)

    # Build training and sampling functions
    s0_sym = nn.get_all_layers(self.q_network)[0].input_var
    s1_sym = nn.get_all_layers(self.q_targets)[0].input_var
    a_sym = T.icol('actions')  # (n, 1)
    r_sym = T.col('rewards')
    t_sym = T.col('terminal_state')
    sym_vars = [s0_sym, a_sym, r_sym, s1_sym, t_sym]

    # Training phase uses non-deterministic mapping
    loss = T.sum(self._build_loss(*sym_vars, deterministic=False))
    params = nn.get_all_params(self.q_network, trainable=True)
    updates = lasagne.updates.adam(loss, params, self.lr, beta1=0.9)
    self.train_fn = theano.function(sym_vars, loss, updates=updates)

    # Build function for sampling from DQN
    pred = nn.get_output(self.q_network, deterministic=True)
    self.pred_fn = theano.function([s0_sym], pred)
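# _build_loss is referenced above but not shown in this snippet. The sketch below
# shows one plausible form of a DQN TD-error loss graph under that assumption;
# the original implementation may differ (e.g. Huber clipping of the TD error).
def _build_loss_sketch(q_network, q_targets, s0, a, r, s1, t,
                       discount=0.99, deterministic=False):
    import theano
    import theano.tensor as T
    import lasagne.layers as nn

    q0 = nn.get_output(q_network, s0, deterministic=deterministic)
    q1 = nn.get_output(q_targets, s1, deterministic=True)

    # Q(s0, a) for the actions actually taken; a is an (n, 1) integer column
    q0_a = q0[T.arange(q0.shape[0]), a.reshape((-1,))].reshape((-1, 1))

    # Bootstrapped target, masked by the terminal flag and held constant
    target = r + (1. - t) * discount * T.max(q1, axis=1, keepdims=True)
    target = theano.gradient.disconnected_grad(target)

    # Per-sample squared TD error; the caller sums this into a scalar loss
    return T.sqr(target - q0_a)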
def __init__(self, state_shape, num_actions, action_scale, lr, tau):

    self.state_shape = state_shape
    self.num_actions = num_actions
    self.action_scale = action_scale
    self.tau = tau

    # Build networks, then initialize their weights to be equal
    sym_s0 = get_symbolic_var(state_shape)('s0')
    sym_s1 = get_symbolic_var(state_shape)('s1')

    self.network = self._build_network(sym_s0)
    self.targets = self._build_network(sym_s1)
    self.update_target(tau=1.0)

    # For making predictions via current and target networks
    a_pred = nn.get_output(self.network)
    self.predict_fn = theano.function([sym_s0], a_pred)
    self.target_fn = theano.function([sym_s1], nn.get_output(self.targets))

    # The policy is updated by following gradients from the critic network.
    # In Theano, this is done by specifying the 'known_grads' parameter
    # in T.grad, without giving an explicit scalar cost.
    action_grads = T.col('action_grads')
    known_grads = {a_pred: action_grads}
    params = nn.get_all_params(self.network, trainable=True)
    grads = [-T.grad(None, p, known_grads=known_grads) for p in params]

    updates = lasagne.updates.adam(grads, params, lr)
    train = theano.function([sym_s0, action_grads], grads, updates=updates)
    self.train_fn = train
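# update_target() is called above but not defined in this snippet. In DDPG-style
# code it is usually a Polyak (soft) update of the target parameters; the sketch
# below follows that convention and assumes self.network and self.targets are
# Lasagne layers with matching parameter lists.
def update_target_sketch(self, tau=None):
    import lasagne.layers as nn
    tau = self.tau if tau is None else tau
    params = nn.get_all_param_values(self.network)
    target_params = nn.get_all_param_values(self.targets)
    mixed = [tau * p + (1. - tau) * tp for p, tp in zip(params, target_params)]
    nn.set_all_param_values(self.targets, mixed)
# Calling update_target(tau=1.0), as done in __init__, therefore copies the
# online weights exactly, which is how both networks start out equal.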
def test_broadcast_arguments(self):
    m = Module()
    m.random = RandomStreams(utt.fetch_seed())
    low = tensor.vector()
    high = tensor.col()
    out = m.random.uniform(low=low, high=high)
    assert out.ndim == 2
    m.f = Method([low, high], out)
    made = m.make()
    made.random.initialize()

    rng_seed = numpy.random.RandomState(utt.fetch_seed()).randint(2**30)
    numpy_rng = numpy.random.RandomState(int(rng_seed))

    low_vals = [
        numpy.asarray([-5, .5, 0, 1], dtype=config.floatX),
        numpy.asarray([.9], dtype=config.floatX),
        numpy.asarray([-5, .5, 0, 1], dtype=config.floatX)
    ]
    high_vals = [
        numpy.asarray([[1.]], dtype=config.floatX),
        numpy.asarray([[1.], [1.1], [1.5]], dtype=config.floatX),
        numpy.asarray([[1.], [1.1], [1.5]], dtype=config.floatX)
    ]

    val0 = made.f(low_vals[0], high_vals[0])
    val1 = made.f(low_vals[1], high_vals[1])
    val2 = made.f(low_vals[2], high_vals[2])

    numpy_val0 = numpy_rng.uniform(low=low_vals[0], high=high_vals[0])
    numpy_val1 = numpy_rng.uniform(low=low_vals[1], high=high_vals[1])
    numpy_val2 = numpy_rng.uniform(low=low_vals[2], high=high_vals[2])

    assert numpy.allclose(val0, numpy_val0)
    assert numpy.allclose(val1, numpy_val1)
    assert numpy.allclose(val2, numpy_val2)
def get_loss_sarsa_function(self):
    # args
    self.states = T.matrix('state')
    self.actions = T.icol('action')
    self.next_states = T.matrix('next_state')
    self.next_actions = T.icol('next_action')
    self.rewards = T.col('reward')

    # q(s, a)
    actionmask = T.eq(T.arange(self.nactions).reshape((1, -1)),
                      self.actions.reshape((-1, 1))).astype(theano.config.floatX)
    q_action = (get_output(self.network, self.states) *
                actionmask).sum(axis=1).reshape((-1, 1))

    # q(s_next, a_next)
    next_actionmask = T.eq(T.arange(self.nactions).reshape((1, -1)),
                           self.next_actions.reshape((-1, 1))).astype(theano.config.floatX)
    next_q_action = (get_output(self.network, self.next_states) *
                     next_actionmask).sum(axis=1).reshape((-1, 1))

    # loss = target - qvalue
    loss = (self.rewards + self.discount * next_q_action - q_action)

    # mse
    mse = 0.5 * loss**2

    # sum loss
    return T.sum(mse)
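# A sketch of how the summed SARSA loss above could be compiled into a training
# step. The optimizer, learning rate, and function name are assumptions for
# illustration; the original class may wire this up differently.
def compile_sarsa_train_sketch(self, learning_rate=1e-3):
    import theano
    import lasagne
    from lasagne.layers import get_all_params

    loss = self.get_loss_sarsa_function()
    params = get_all_params(self.network, trainable=True)
    updates = lasagne.updates.rmsprop(loss, params, learning_rate)

    # One gradient step on a minibatch of (s, a, r, s', a') transitions
    return theano.function(
        [self.states, self.actions, self.rewards,
         self.next_states, self.next_actions],
        loss, updates=updates)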
def __init__(self, args): reward = T.col('r') action = T.icol('a') terminal = T.icol('t') discount = T.scalar('gamma') learningRate = T.scalar('lr') rho = T.scalar('rho') epsilon = T.scalar('eps') rng = np.random.RandomState(42) self.batchNb = args.batchSize #convLayers = [[(8,8),(4,4),64], # [(4,4),(2,2),128], # [(3,3),(1,1),256], # [(3,3),(1,1),512]] #fcl = [1024, 6] convLayers = [[(8,8),(4,4),64], [(4,4),(2,2),128], [(3,3),(1,1),256], [(3,3),(1,1),256]] fcl = [1024, args.actionNb] self.q1 = NetStruct(convLayers, fcl, (4,100,100), rng, args) self.q2 = NetStruct(convLayers, fcl, (4,100,100), rng, args) self.q2.setParams(self.q1) self.states = theano.shared(np.zeros((args.batchSize,4,100,100), dtype='float32')) self.states2 = theano.shared(np.zeros((args.batchSize,4,100,100), dtype='float32')) self.actions = theano.shared(np.zeros((args.batchSize,1), dtype='int32'), broadcastable=(False,True)) self.rewards = theano.shared(np.zeros((args.batchSize,1), dtype='float32'), broadcastable=(False,True)) self.terminals = theano.shared(np.zeros((args.batchSize,1), dtype='int32'), broadcastable=(False,True)) self.learningRate = theano.shared(np.array(args.learningRate, dtype='float32')) self.rho = theano.shared(np.array(args.rmsPropRho, dtype='float32')) self.epsilon = theano.shared(np.array(args.rmsPropEpsilon, dtype='float32')) self.discount = theano.shared(np.array(args.discountFactor, dtype='float32')) loss = self.QLoss(self.q1.output, self.q2.output, action, reward, terminal, discount) params = self.q1.getParams() updates = self.rmsProp(loss, params, rho, epsilon, learningRate) self.train_model = theano.function( [], loss, updates=updates, givens = { self.q1.input: self.states, self.q2.input: self.states2, action: self.actions, reward: self.rewards, terminal: self.terminals, discount: self.discount, learningRate: self.learningRate, rho: self.rho, epsilon: self.epsilon } )
def createGradientFunctions(self): #Create the Theano variables W1,W2,W3,W4,W5,W6,x,eps = T.dmatrices("W1","W2","W3","W4","W5","W6","x","eps") #Create biases as cols so they can be broadcasted for minibatches b1,b2,b3,b4,b5,b6 = T.dcols("b1","b2","b3","b4","b5","b6") z1 = T.col("z1") if self.continuous: #convolve x # no_filters = 100, stride = 4, filter_size = 50 h_encoder = T.tanh(T.dot(W1,x) + b1) #h_encoder = T.dot(W1,x) + b1 else: h_encoder = T.tanh(T.dot(W1,x) + b1) mu_encoder = T.dot(W2,h_encoder) + b2 log_sigma_encoder = 0.5*(T.dot(W3,h_encoder) + b3) mu_encoder = T.dot(W2,h_encoder) + b2 log_sigma_encoder = 0.5*(T.dot(W3,h_encoder) + b3) #Find the hidden variable z z = mu_encoder + T.exp(log_sigma_encoder)*eps prior = 0.5* T.sum(1 + 2*log_sigma_encoder - mu_encoder**2 - T.exp(2*log_sigma_encoder)) #Set up decoding layer if self.continuous: h_decoder = T.nnet.softplus(T.dot(W4,z) + b4) h_dec = T.nnet.softplus(T.dot(W4,z1) + b4) #h_decoder = T.dot(W4,z) + b4 #h_dec = T.dot(W4,z1) + b4 mu_decoder = T.tanh(T.dot(W5,h_decoder) + b5) mu_dec = T.tanh(T.dot(W5,h_dec) + b5) log_sigma_decoder = 0.5*(T.dot(W6,h_decoder) + b6) logpxz = T.sum(-(0.5 * np.log(2 * np.pi) + log_sigma_decoder) - 0.5 * ((x - mu_decoder) / T.exp(log_sigma_decoder))**2) gradvariables = [W1,W2,W3,W4,W5,W6,b1,b2,b3,b4,b5,b6] else: h_decoder = T.tanh(T.dot(W4,z) + b4) y = T.nnet.sigmoid(T.dot(W5,h_decoder) + b5) logpxz = -T.nnet.binary_crossentropy(y,x).sum() gradvariables = [W1,W2,W3,W4,W5,b1,b2,b3,b4,b5] logp = logpxz + prior #Compute all the gradients derivatives = T.grad(logp,gradvariables) #Add the lowerbound so we can keep track of results derivatives.append(logp) self.get_z = th.function(gradvariables+[x,eps],z,on_unused_input='ignore') self.generate = th.function(gradvariables+[z1,x,eps],mu_dec,on_unused_input='ignore') self.predict = th.function(gradvariables+[x,eps],mu_decoder,on_unused_input='ignore') self.gradientfunction = th.function(gradvariables + [x,eps], derivatives, on_unused_input='ignore') self.lowerboundfunction = th.function(gradvariables + [x,eps], logp, on_unused_input='ignore')
def test_ndim_mismatch(self):
    rng = numpy.random.RandomState(utt.fetch_seed())
    data = rng.rand(5).astype(self.dtype)
    x = self.shared(data)
    y = tensor.col('y', self.dtype)
    cond = theano.tensor.iscalar('cond')

    self.assertRaises(TypeError, ifelse, cond, x, y)
    self.assertRaises(TypeError, ifelse, cond, y, x)
def testDataSet(self, dataSet_, dataLabels_):
    dataSet = T.matrix("dataSet")
    labels = T.col("labels")
    svLabels = T.col("svLabels")
    gamma = T.dscalar("gamma")
    svs = T.matrix("supportVectors")
    svAlphas = T.matrix("svAlphas")
    b = T.dscalar("b")

    # we need to transpose the result because the results of the per-row
    # actions are usually columns
    errorVec = theano.scan(
        lambda row, realLabel: self.testDataSet_inner_(svs, row, gamma,
                                                       svLabels, svAlphas,
                                                       b, realLabel),
        sequences=[dataSet, labels])[0]
    errors = T.sum(errorVec)

    inputs = [dataSet, labels, svs, svLabels, gamma, svAlphas, b]
    compErrors = theano.function(inputs=inputs, outputs=errors,
                                 on_unused_input='ignore')

    gamma_ = 1 / (-1 * self.Training.UsedKernel[1]**2)
    numErrors = compErrors(dataSet_, dataLabels_,
                           self.Training.SupportVectors,
                           self.Training.SVLabels, gamma_,
                           self.Training.Alphas[self.Training.SVIndices],
                           self.Training.B.item(0))
    return float(numErrors) / float(dataSet_.shape[0])
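# testDataSet_inner_ is not shown in this snippet. For an RBF-kernel SVM the
# per-row error indicator typically looks like the sketch below; the exact
# decision rule used by the original code is an assumption here (note that
# gamma above is computed as 1 / (-sigma**2), i.e. it is negative).
def testDataSet_inner_sketch(self, svs, row, gamma, svLabels, svAlphas, b, realLabel):
    import theano.tensor as T
    # Kernel values between one test row and every support vector:
    # exp(gamma * ||row - sv||^2) == exp(-||row - sv||^2 / sigma^2)
    k = T.exp(gamma * T.sum((svs - row)**2, axis=1)).reshape((-1, 1))
    decision = T.sum(svAlphas * svLabels * k) + b
    # 1 if the predicted sign disagrees with the true label, else 0
    return T.neq(T.sgn(decision), realLabel)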
def __init__(self):
    self.dt = 1
    self.xdim = 1
    self.udim = 1
    self.r = 1
    self.delay = T.bscalar()
    self.delay2 = T.bscalar()
    self.x = T.matrix()
    self.u = T.col()
    self.xu_flat = T.concatenate([T.flatten(self.x.T), T.flatten(self.u)])
def __init__(self, numpy_rng, theano_rng = None, first_layer_type = 'bernoulli', mean_doc_size = 1, n_ins = 784, mid_layer_sizes=[200], inner_code_length = 10): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input (and autoencoder output, y) of the SMH :type n_code_length: int :param n_code_length: how many codes to squash down to in the middle layer """ self.first_layer_type = first_layer_type; self.mean_doc_size = mean_doc_size; self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_ins = n_ins self.inner_code_length = inner_code_length self.mid_layer_sizes = list(mid_layer_sizes) self.numpy_rng = numpy_rng self.theano_rng = RandomStreams(numpy_rng.randint(2**30)) # allocate symbolic variables for the data if (theano.config.floatX == "float32"): self.x = T.matrix('x') # self.x_sums = T.col('x_sums') self.y = T.matrix('y') # the output (after finetuning) should /look the same as the input else: if (theano.config.floatX == "float64"): self.x = T.dmatrix('x') # self.x_sums = T.dcol('x_sums') self.y = T.dmatrix('y') # the output (after finetuning) should look the same as the input else: raise Exception #not sure whats up here.. # The SMH is an MLP, for which all weights of intermediate layers are shared with a # different RBM. We will first construct the SMH as a deep multilayer perceptron, and # when constructing each sigmoidal layer we also construct an RBM that shares weights # with that layer. During pretraining we will train these RBMs (which will lead # to chainging the weights of the MLP as well) During finetuning we will finish # training the SMH by doing stochastic gradient descent on the MLP. self.init_layers()
def __init__(self, input_width, input_height, output_dim, num_frames, batch_size): self.input_width = input_width self.input_height = input_height self.output_dim = output_dim self.num_frames = num_frames self.batch_size = batch_size self.gamma = 0.99 # discount factor self.rho = 0.99 self.lr = 0.00025 # learning rate self.momentum = 0.95 self.freeze_targets = True self.l_out = self.build_network(input_width, input_height, output_dim, num_frames, batch_size) if self.freeze_targets: self.next_l_out = self.build_network(input_width, input_height, output_dim, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') # terminals = T.icol('terminals') self.states_shared = theano.shared(np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared(np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared(np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False,True)) self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False,True)) # self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False,True)) q_vals = self.l_out.get_output(states / 255.0) if self.freeze_targets: next_q_vals = self.next_l_out.get_output(next_states / 255.0) else: next_q_vals = self.l_out.get_output(next_states / 255.0) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = rewards + self.gamma * T.max(next_q_vals, axis=1, keepdims=True) diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1,1)) loss = T.mean(diff ** 2) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, # terminals: self.terminals_shared } if self.momentum > 0: updates = rmsprop_nesterov(loss, params, self.lr, self.rho, self.momentum, 1e-2) else: updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, 1e-6) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={ states: self.states_shared })
def test_ndim_mismatch(self):
    rng = np.random.RandomState(utt.fetch_seed())
    data = rng.rand(5).astype(self.dtype)
    x = self.shared(data)
    y = tensor.col("y", self.dtype)
    cond = theano.tensor.iscalar("cond")

    with pytest.raises(TypeError):
        ifelse(cond, x, y)
    with pytest.raises(TypeError):
        ifelse(cond, y, x)
def neural_tensor_network():
    # tensor params
    subj = T.col('e_1')
    targets = T.matrix('e_2')
    W = T.tensor3('W')

    # neural net params
    u = T.col('u')
    V = T.matrix('V')
    b = T.col('b')

    # tensor
    h = subj.T.dot(W).dot(targets)

    # neural net
    d = subj.shape[0]
    V_subj = V[:, :d].dot(subj)
    V_targ = V[:, d:].dot(targets)

    activations = T.tanh(h + V_subj + V_targ + b)
    score = u.T.dot(activations).reshape((-1, 1))

    margins = score[0] - score[1:]
    cost = T.min(T.concatenate((T.ones_like(margins), margins), axis=1),
                 axis=1).mean()

    gsubj, gtargets, gW, gu, gV, gb = T.grad(cost, [subj, targets, W, u, V, b])

    print 'Compiling NTN score'
    score = theano.function([subj, W, targets, u, V, b], score,
                            name='NTN Score', mode='FAST_RUN')
    print 'Compiling NTN fprop'
    fprop = theano.function([subj, W, targets, u, V, b], cost,
                            name='NTN fprop', mode='FAST_RUN')
    print 'Compiling NTN bprop'
    bprop = theano.function([subj, W, targets, u, V, b],
                            outputs=[gsubj, gW, gtargets, gu, gV, gb],
                            name='NTN bprop', mode='FAST_RUN')

    return {'score': score, 'fprop': fprop, 'bprop': bprop}
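# Usage sketch for the compiled NTN functions above. The shapes are assumptions
# chosen for illustration: d-dimensional entity vectors, k tensor slices, and one
# positive target followed by (m - 1) negatives in the columns of e_2.
import numpy as np
import theano

_floatX = theano.config.floatX
_d, _k, _m = 5, 3, 4
_rng = np.random.RandomState(0)

_subj = _rng.randn(_d, 1).astype(_floatX)        # e_1, a column vector
_targets = _rng.randn(_d, _m).astype(_floatX)    # e_2, one column per candidate
_W = _rng.randn(_k, _d, _d).astype(_floatX)      # one (d x d) bilinear slice per unit
_u = _rng.randn(_k, 1).astype(_floatX)
_V = _rng.randn(_k, 2 * _d).astype(_floatX)
_b = _rng.randn(_k, 1).astype(_floatX)

_ntn = neural_tensor_network()
_scores = _ntn['score'](_subj, _W, _targets, _u, _V, _b)   # (m, 1) column of scores
_loss = _ntn['fprop'](_subj, _W, _targets, _u, _V, _b)
_grads = _ntn['bprop'](_subj, _W, _targets, _u, _V, _b)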
def build_finetune_functions(self, batch_size, learning_rate): '''Generates a function `train` that implements one step of finetuning, a function `validate` that computes the error on a batch from the validation set, and a function `test` that computes the error on a batch from the testing set :type batch_size: int :param batch_size: size of a minibatch :type learning_rate: float :param learning_rate: learning rate used during finetune stage ''' train_set_x = T.matrix('train_set_x') train_set_x_sums = T.col('train_set_x_sums') valid_set_x = T.matrix('valid_set_x') valid_set_x_sums = T.col('valid_set_x_sums') test_set_x = T.matrix('test_set_x') test_set_x_sums = T.col('test_set_x_sums') # compute the gradients with respect to the model parameters gparams = T.grad(self.finetune_cost, self.params) # compute list of fine-tuning updates updates = {} for param, gparam in zip(self.params, gparams): updates[param] = param - gparam*learning_rate train_fn = theano.function(inputs = [train_set_x, train_set_x_sums], outputs = self.finetune_cost, updates = updates, givens = { self.x : train_set_x, self.x_sums : train_set_x_sums }) valid_score_i = theano.function([valid_set_x, valid_set_x_sums], self.finetune_cost, givens = { self.x : valid_set_x, self.x_sums : valid_set_x_sums }) test_score_i = theano.function([test_set_x, test_set_x_sums], self.finetune_cost, givens = { self.x : test_set_x, self.x_sums : test_set_x_sums }) return train_fn, valid_score_i, test_score_i
def setup_theano(self):
    # for numpy optimization
    oneCol = T.col("oneCol")
    pi_t = T.col("pi_t")
    z_t = T.col("z_t")
    z_t1 = z_t.reshape((self.numKeypoints, 2))
    pts = T.concatenate((z_t1, oneCol), axis=1)

    A_t_ = T.matrix("A_t_")
    r_t_ = T.dot(A_t_, pts.transpose()).transpose()
    r_t1_ = r_t_[:, 0:2].reshape((2 * self.numKeypoints, 1))

    diff_ = pi_t * (r_t1_ - self.mu)
    difft_ = diff_.reshape((1, 2 * self.numKeypoints))
    cost_1 = T.dot(difft_, diff_)
    # cost_1 = theano.printing.Print('cost is:')(cost_1)
    cost_ = T.max(cost_1)

    A_t_grad_ = T.grad(cost=cost_, wrt=A_t_)
    A_t_grad_ = T.basic.set_subtensor(A_t_grad_[2, :], 0)

    self.cost = theano.function(inputs=[A_t_, pi_t, z_t, oneCol],
                                outputs=[cost_, A_t_grad_])
def __init__(self, state_shape, num_actions, discount, lr, tau, l2_decay):

    self.state_shape = state_shape
    self.num_actions = num_actions
    self.discount = discount
    self.tau = tau

    # Initialize some symbolic variables to interface with graphs easier
    sym_s0 = get_symbolic_var(state_shape)('s0')
    sym_a0 = T.col('policy_actions')
    sym_s1 = get_symbolic_var(state_shape)('s1')
    sym_a1 = T.col('target_actions')
    sym_r = T.col('rewards')
    sym_t = T.col('terminal_state')
    sym_vars = [sym_s0, sym_a0, sym_s1, sym_a1, sym_r, sym_t]

    self.network = self._build_network(sym_s0, sym_a0)
    self.targets = self._build_network(sym_s1, sym_a1)
    self.update_target(tau=1.0)

    # Functions for sampling from current and target Q-functions
    q_pred = nn.get_output(self.network)
    q_target = nn.get_output(self.targets)
    self.predict_fn = theano.function([sym_s0, sym_a0], q_pred)
    self.target_fn = theano.function([sym_s1, sym_a1], q_target)

    # Calculate action gradients for updating actor / policy
    grads = T.grad(T.mean(q_pred), sym_a0)
    self.action_grads = theano.function([sym_s0, sym_a0], grads)

    # Build critic training function; loss is similar to DQN, where
    # it's the mean squared error between Q and target Q values
    yi = sym_r + (1. - sym_t) * self.discount * q_target
    loss = T.mean(T.sqr(yi - q_pred))
    loss += regularize_network_params(self.network, l2) * l2_decay

    params = nn.get_all_params(self.network, trainable=True)
    updates = lasagne.updates.adam(loss, params, lr)
    self.train_fn = theano.function(sym_vars, loss, updates=updates)
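# Sketch of how the action_grads function above is typically combined with the
# actor from the earlier DDPG-style snippet. The attribute names (actor, critic)
# and the transition layout are assumptions made for illustration.
def ddpg_update_sketch(actor, critic, s0, a0, r, s1, t):
    # Critic target uses the target actor's action at the next state
    a1 = actor.target_fn(s1)
    critic_loss = critic.train_fn(s0, a0, s1, a1, r, t)

    # Actor follows dQ/da evaluated at its own on-policy actions
    policy_actions = actor.predict_fn(s0)
    dq_da = critic.action_grads(s0, policy_actions)
    actor.train_fn(s0, dq_da)

    # Soft-update both target networks toward the online networks
    actor.update_target(tau=actor.tau)
    critic.update_target(tau=critic.tau)
    return critic_loss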
def setup_theano(self):
    # for numpy optimization
    oneCol = T.col('oneCol')
    pi_t = T.col('pi_t')
    z_t = T.col('z_t')
    z_t1 = z_t.reshape((self.numKeypoints, 2))
    pts = T.concatenate((z_t1, oneCol), axis=1)

    A_t_ = T.matrix('A_t_')
    r_t_ = T.dot(A_t_, pts.transpose()).transpose()
    r_t1_ = r_t_[:, 0:2].reshape((2 * self.numKeypoints, 1))

    diff_ = pi_t * (r_t1_ - self.mu)
    difft_ = diff_.reshape((1, 2 * self.numKeypoints))
    cost_1 = T.dot(difft_, diff_)
    # cost_1 = theano.printing.Print('cost is:')(cost_1)
    cost_ = T.max(cost_1)

    A_t_grad_ = T.grad(cost=cost_, wrt=A_t_)
    A_t_grad_ = T.basic.set_subtensor(A_t_grad_[2, :], 0)

    self.cost = theano.function(inputs=[A_t_, pi_t, z_t, oneCol],
                                outputs=[cost_, A_t_grad_])
def pretraining_functions(self, batch_size, method, pretrain_lr, k): ''' Generates a list of functions, for performing one step of gradient descent at a given layer. The function will require as input a minibatch of data, and to train an RBM you just need to iterate, calling the corresponding function on all minibatches. :type batch_size: int :param batch_size: size of a [mini]batch :type method: string :param method: type of Gibbs sampling to perform: 'cd' (default) or 'pcd' :type k: int :param k: number of Gibbs steps to do in CD-k / PCD-k ;type finetune_lr: float ;param finetune_lr: the 'learning rate' to use during finetuning phase ''' learning_rate = T.scalar('lr') # learning rate to use #learning_rate.value = pretrain_lr # i *think* the following is equivalent to above.. doing this because i can't see where lr gets a value at all #learning_rate = theano.shared(pretrain_lr, 'learning_rate') train_set_x = T.matrix('train_set_x') train_set_x_sums = T.col('train_set_x_sums') pretrain_fns = [] for rbm in self.rbm_layers: if method == 'pcd': # initialize storage for the persistent chain (state = hidden layer of chain) persistent_chain = theano.shared(numpy.zeros((batch_size,rbm.n_hidden),dtype=theano.config.floatX)) # get the cost and the gradient corresponding to one step of PCD-k cost,updates = rbm.get_cost_updates(lr=learning_rate, persistent=persistent_chain, k=k) else: # default = use CD instead cost,updates = rbm.get_cost_updates(lr=learning_rate) # compile the theano function fn = theano.function(inputs = [train_set_x,train_set_x_sums, theano.Param(learning_rate, default = 0.1)], outputs = cost, updates = updates, givens = {self.x:train_set_x, self.x_sums:train_set_x_sums} # uncomment the following line to perform debugging: # ,mode=theano.compile.debugmode.DebugMode(stability_patience=5) ) # append `fn` to the list of functions pretrain_fns.append(fn) return pretrain_fns
def transE_model():
    '''
    Note X_s is a column and X_t is a matrix so that broadcasting occurs
    across the columns of X_t (this allows batching X_t with negatives,
    for example).
    '''
    # construct theano expression graph
    X_s = T.col('X_s')
    W = T.matrix('W')
    X_t = T.matrix('X_t')

    rels = W[:, :, None].transpose(1, 0, 2)

    # Computes x_{r_1} + x_{r_2} + ... + x_{r_n} - X_t
    results, updates = theano.scan(fn=lambda rel, v: rel + v,
                                   outputs_info=-X_t,
                                   sequences=[rels])

    # score is always a column vector
    score = T.sum((X_s + results[-1])**2, axis=0).reshape((-1, 1))

    margins = 1. + score[0] - score[1:]
    # zero out negative entries
    pos_parts = margins * (margins > 0)
    # we are using an online Maximizer, so the objective is negated
    cost = -pos_parts.mean()

    gX_s, gW, gX_t = T.grad(cost, [X_s, W, X_t])

    print 'Compiling TransE score'
    # return negative score since this is a ranking
    score = theano.function([X_s, W, X_t], -score,
                            name='transE Score', mode='FAST_RUN')
    score.trust_input = True
    print 'Compiling TransE fprop'
    fprop = theano.function([X_s, W, X_t], cost,
                            name='transE fprop', mode='FAST_RUN')
    fprop.trust_input = True
    print 'Compiling TransE bprop'
    bprop = theano.function([X_s, W, X_t], outputs=[gX_s, gW, gX_t],
                            name='transE bprop', mode='FAST_RUN')
    bprop.trust_input = True

    return {'score': score, 'fprop': fprop, 'bprop': bprop}
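# Usage sketch for the compiled TransE functions above. Shapes are assumptions:
# d-dimensional embeddings, a path of relation vectors stored as the columns of
# W, and the first column of X_t as the positive target followed by negatives.
import numpy as np
import theano

_floatX = theano.config.floatX
_d, _n_rels, _m = 10, 2, 5
_rng = np.random.RandomState(0)

_X_s = _rng.randn(_d, 1).astype(_floatX)       # source entity (column)
_W = _rng.randn(_d, _n_rels).astype(_floatX)   # one relation embedding per column
_X_t = _rng.randn(_d, _m).astype(_floatX)      # candidate targets (positive first)

_transE = transE_model()
_neg_scores = _transE['score'](_X_s, _W, _X_t)   # (m, 1); higher is better (negated distance)
_loss = _transE['fprop'](_X_s, _W, _X_t)
_gX_s, _gW, _gX_t = _transE['bprop'](_X_s, _W, _X_t)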
def test_wrong_broadcast(self):
    a = tt.col()
    increment = tt.vector()

    # These symbolic graphs are legitimate, as long as increment has exactly
    # one element. So it should fail at runtime, not at compile time.
    rng = numpy.random.RandomState(utt.fetch_seed())

    def rng_randX(*shape):
        return rng.rand(*shape).astype(theano.config.floatX)

    for op in (tt.set_subtensor, tt.inc_subtensor):
        for base in (a[:], a[0]):
            out = op(base, increment)
            f = theano.function([a, increment], out)
            # This one should work
            f(rng_randX(3, 1), rng_randX(1))
            # These ones should not
            self.assertRaises(ValueError, f, rng_randX(3, 1), rng_randX(2))
            self.assertRaises(ValueError, f, rng_randX(3, 1), rng_randX(3))
            self.assertRaises(ValueError, f, rng_randX(3, 1), rng_randX(0))
def __init__(self, env, args, rng, name = "DQNLasagne"): """ Initializes a network based on the Lasagne Theano framework. Args: env (AtariEnv): The envirnoment in which the agent actuates. args (argparse.Namespace): All settings either with a default value or set via command line arguments. rng (mtrand.RandomState): Initialized Mersenne Twister pseudo-random number generator. name (str): The name of the network object. Note: This function should always call the base class first to initialize the common values for the networks. """ _logger.info("Initialize object of type " + str(type(self).__name__)) super(DQNLasagne, self).__init__(env, args, rng, name) self.input_shape = (self.batch_size, self.sequence_length, args.frame_width, args.frame_height) self.dummy_batch = np.zeros(self.input_shape, dtype=np.uint8) lasagne.random.set_rng(self.rng) self.network = self._create_layer() # TODO: Load weights from pretrained network?! if not self.args.load_weights == None: self.load_weights(self.args.load_weights) if self.target_update_frequency > 0: self.target_network = self._create_layer() self._copy_theta() states = T.tensor4('states') followup_states = T.tensor4('followup_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') self.states_shared = theano.shared( np.zeros(self.input_shape, dtype=theano.config.floatX) ) self.followup_states_shared = theano.shared( np.zeros(self.input_shape, dtype=theano.config.floatX) ) self.rewards_shared = theano.shared( np.zeros((self.batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True) ) self.actions_shared = theano.shared( np.zeros((self.batch_size, 1), dtype='int32'), broadcastable=(False, True) ) self.terminals_shared = theano.shared( np.zeros((self.batch_size, 1), dtype='int32'), broadcastable=(False, True) ) qvalues = lasagne.layers.get_output( self.network, self._prepare_network_input(states) ) if self.target_update_frequency > 0: qvalues_followup_states = lasagne.layers.get_output( self.target_network, self._prepare_network_input(followup_states) ) else: qvalues_followup_states = lasagne.layers.get_output( self.network, self._prepare_network_input(followup_states) ) qvalues_followup_states = theano.gradient.disconnected_grad(qvalues_followup_states) targets = (rewards + (T.ones_like(terminals) - terminals) * self.discount_rate * T.max(qvalues_followup_states, axis=1, keepdims=True) ) errors = targets - qvalues[ T.arange(self.batch_size), actions.reshape((-1,))].reshape((-1, 1)) if self.clip_error > 0: quadratic_part = T.minimum(abs(errors), self.clip_error) linear_part = abs(errors) - quadratic_part cost_function = T.sum(0.5 * quadratic_part ** 2 + self.clip_error * linear_part) else: cost_function = T.sum(0.5 * errors ** 2) self.params = lasagne.layers.helper.get_all_params(self.network) self.observations = { states: self.states_shared, followup_states: self.followup_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } self._set_optimizer(cost_function) if self.momentum > 0: self.optimizer = lasagne.updates.apply_momentum( self.optimizer, None, self.momentum ) _logger.debug("Compiling _theano_train") self._theano_train = theano.function( [], [cost_function, qvalues], updates=self.optimizer, givens=self.observations) _logger.debug("Compiling _theano_get_Q") self._theano_get_Q = theano.function( [], qvalues, givens={states: self.states_shared}) self.callback = None _logger.debug("%s" % self)
def __init__(self, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_): super(DeepNNDropoutCritic, self).__init__(n_in, n_out, state_bounds, action_bounds, reward_bound, settings_) self._dropout_p = settings_['dropout_p'] # data types for model self._State = T.matrix("State") self._State.tag.test_value = np.random.rand(self._batch_size, self._state_length) self._ResultState = T.matrix("ResultState") self._ResultState.tag.test_value = np.random.rand( self._batch_size, self._state_length) self._Reward = T.col("Reward") self._Reward.tag.test_value = np.random.rand(self._batch_size, 1) self._Action = T.matrix("Action") self._Action.tag.test_value = np.random.rand(self._batch_size, self._action_length) # create a small convolutional neural network input = lasagne.layers.InputLayer((None, self._state_length), self._State) self._stateInputVar = input.input_var # network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) """ network = lasagne.layers.DenseLayer( network, num_units=256, nonlinearity=lasagne.nonlinearities.leaky_rectify) network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) """ network = lasagne.layers.DenseLayer( input, num_units=128, nonlinearity=lasagne.nonlinearities.leaky_rectify) network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) network = lasagne.layers.DenseLayer( network, num_units=64, nonlinearity=lasagne.nonlinearities.leaky_rectify) network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) network = lasagne.layers.DenseLayer( network, num_units=32, nonlinearity=lasagne.nonlinearities.leaky_rectify) network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) network = lasagne.layers.DenseLayer( network, num_units=16, nonlinearity=lasagne.nonlinearities.leaky_rectify) # network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) self._critic = lasagne.layers.DenseLayer( network, num_units=1, nonlinearity=lasagne.nonlinearities.linear) # self._b_o = init_b_weights((n_out,)) # networkAct = lasagne.layers.InputLayer((None, self._state_length), self._State) # networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True) """ networkAct = lasagne.layers.DenseLayer( networkAct, num_units=256, nonlinearity=lasagne.nonlinearities.leaky_rectify) network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) """ networkAct = lasagne.layers.DenseLayer( input, num_units=128, nonlinearity=lasagne.nonlinearities.leaky_rectify) # networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True) networkAct = lasagne.layers.DenseLayer( networkAct, num_units=64, nonlinearity=lasagne.nonlinearities.leaky_rectify) # networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True) networkAct = lasagne.layers.DenseLayer( networkAct, num_units=32, nonlinearity=lasagne.nonlinearities.leaky_rectify) # networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True) self._actor = lasagne.layers.DenseLayer( networkAct, num_units=self._action_length, nonlinearity=lasagne.nonlinearities.linear) # self._b_o = init_b_weights((n_out,)) # print "Initial W " + str(self._w_o.get_value()) self._states_shared = theano.shared( np.zeros((self._batch_size, self._state_length), dtype=theano.config.floatX)) self._next_states_shared = theano.shared( np.zeros((self._batch_size, self._state_length), dtype=theano.config.floatX)) self._rewards_shared = 
theano.shared(np.zeros( (self._batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self._actions_shared = theano.shared( np.zeros((self._batch_size, self._action_length), dtype=theano.config.floatX), )
def __init__(self, n_in, n_out): batch_size = 32 state_length = n_in action_length = n_out # data types for model State = T.dmatrix("State") State.tag.test_value = np.random.rand(batch_size, state_length) ResultState = T.dmatrix("ResultState") ResultState.tag.test_value = np.random.rand(batch_size, state_length) Reward = T.col("Reward") Reward.tag.test_value = np.random.rand(batch_size, 1) Action = T.dmatrix("Action") Action.tag.test_value = np.random.rand(batch_size, action_length) # create a small convolutional neural network inputLayerA = lasagne.layers.InputLayer((None, state_length), State) l_hid1A = lasagne.layers.DenseLayer( inputLayerA, num_units=128, nonlinearity=lasagne.nonlinearities.rectify) l_hid2A = lasagne.layers.DenseLayer( l_hid1A, num_units=64, nonlinearity=lasagne.nonlinearities.rectify) l_hid3A = lasagne.layers.DenseLayer( l_hid2A, num_units=32, nonlinearity=lasagne.nonlinearities.rectify) self._l_outA = lasagne.layers.DenseLayer( l_hid3A, num_units=1, nonlinearity=lasagne.nonlinearities.linear) # self._b_o = init_b_weights((n_out,)) inputLayerActA = lasagne.layers.InputLayer((None, state_length), State) l_hid1ActA = lasagne.layers.DenseLayer( inputLayerActA, num_units=128, nonlinearity=lasagne.nonlinearities.leaky_rectify) l_hid2ActA = lasagne.layers.DenseLayer( l_hid1ActA, num_units=64, nonlinearity=lasagne.nonlinearities.leaky_rectify) l_hid3ActA = lasagne.layers.DenseLayer( l_hid2ActA, num_units=32, nonlinearity=lasagne.nonlinearities.leaky_rectify) self._l_outActA = lasagne.layers.DenseLayer( l_hid3ActA, num_units=action_length, nonlinearity=lasagne.nonlinearities.linear) # self._b_o = init_b_weights((n_out,)) # self.updateTargetModel() inputLayerB = lasagne.layers.InputLayer((None, state_length), State) l_hid1B = lasagne.layers.DenseLayer( inputLayerB, num_units=128, nonlinearity=lasagne.nonlinearities.rectify) l_hid2B = lasagne.layers.DenseLayer( l_hid1B, num_units=64, nonlinearity=lasagne.nonlinearities.rectify) l_hid3B = lasagne.layers.DenseLayer( l_hid2B, num_units=32, nonlinearity=lasagne.nonlinearities.rectify) self._l_outB = lasagne.layers.DenseLayer( l_hid3B, num_units=1, nonlinearity=lasagne.nonlinearities.linear) inputLayerActB = lasagne.layers.InputLayer((None, state_length), State) l_hid1ActB = lasagne.layers.DenseLayer( inputLayerActB, num_units=128, nonlinearity=lasagne.nonlinearities.leaky_rectify) l_hid2ActB = lasagne.layers.DenseLayer( l_hid1ActB, num_units=64, nonlinearity=lasagne.nonlinearities.leaky_rectify) l_hid3ActB = lasagne.layers.DenseLayer( l_hid2ActB, num_units=32, nonlinearity=lasagne.nonlinearities.leaky_rectify) self._l_outActB = lasagne.layers.DenseLayer( l_hid3ActB, num_units=n_out, nonlinearity=lasagne.nonlinearities.linear) # print ("Initial W " + str(self._w_o.get_value()) ) self._learning_rate = 0.001 self._discount_factor = 0.8 self._rho = 0.95 self._rms_epsilon = 0.001 self._weight_update_steps = 5000 self._updates = 0 self._states_shared = theano.shared( np.zeros((batch_size, state_length), dtype=theano.config.floatX)) self._next_states_shared = theano.shared( np.zeros((batch_size, state_length), dtype=theano.config.floatX)) self._rewards_shared = theano.shared(np.zeros( (batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self._actions_shared = theano.shared( np.zeros((batch_size, n_out), dtype=theano.config.floatX), ) self._q_valsA = lasagne.layers.get_output(self._l_outA, State) self._q_valsB = lasagne.layers.get_output(self._l_outB, ResultState) self._q_valsActA = 
lasagne.layers.get_output(self._l_outActA, State) self._q_valsActB = lasagne.layers.get_output(self._l_outActB, State) self._q_func = self._q_valsA self._q_funcAct = self._q_valsActA # self._q_funcAct = theano.function(inputs=[State], outputs=self._q_valsActA, allow_input_downcast=True) target = (Reward + self._discount_factor * self._q_valsB) diff = target - self._q_valsA loss = 0.5 * diff**2 + ( 1e-6 * lasagne.regularization.regularize_network_params( self._l_outA, lasagne.regularization.l2)) loss = T.mean(loss) params = lasagne.layers.helper.get_all_params(self._l_outA) actionParams = lasagne.layers.helper.get_all_params(self._l_outActA) givens_ = { State: self._states_shared, ResultState: self._next_states_shared, Reward: self._rewards_shared, # Action: self._actions_shared, } actGivens = { State: self._states_shared, # ResultState: self._next_states_shared, # Reward: self._rewards_shared, Action: self._actions_shared, } # SGD update #updates_ = lasagne.updates.rmsprop(loss, params, self._learning_rate, self._rho, # self._rms_epsilon) # TD update updates_ = lasagne.updates.rmsprop( T.mean(self._q_func) + (1e-6 * lasagne.regularization.regularize_network_params( self._l_outA, lasagne.regularization.l2)), params, self._learning_rate * -T.mean(diff), self._rho, self._rms_epsilon) # actDiff1 = (Action - self._q_valsActB) #TODO is this correct? # actDiff = (actDiff1 - (Action - self._q_valsActA)) actDiff = ((Action - self._q_valsActA) ) # Target network does not work well here? actLoss = 0.5 * actDiff**2 + ( 1e-4 * lasagne.regularization.regularize_network_params( self._l_outActA, lasagne.regularization.l2)) actLoss = T.sum(actLoss) / float(batch_size) # actionUpdates = lasagne.updates.rmsprop(actLoss + # (1e-4 * lasagne.regularization.regularize_network_params( # self._l_outActA, lasagne.regularization.l2)), actionParams, # self._learning_rate * 0.01 * (-actLoss), self._rho, self._rms_epsilon) actionUpdates = lasagne.updates.rmsprop( T.mean(self._q_funcAct) + (1e-4 * lasagne.regularization.regularize_network_params( self._l_outActA, lasagne.regularization.l2)), actionParams, self._learning_rate * 0.5 * (-T.sum(actDiff) / float(batch_size)), self._rho, self._rms_epsilon) self._train = theano.function([], [loss, self._q_valsA], updates=updates_, givens=givens_) self._trainActor = theano.function([], [actLoss, self._q_valsActA], updates=actionUpdates, givens=actGivens) self._q_val = theano.function([], self._q_valsA, givens={State: self._states_shared}) self._q_action = theano.function([], self._q_valsActA, givens={State: self._states_shared}) self._bellman_error = theano.function( inputs=[State, Reward, ResultState], outputs=diff, allow_input_downcast=True)
def __init__(self, num_actions, phi_length, width, height, discount=.9, learning_rate=.01, batch_size=32, approximator='none'): self._batch_size = batch_size self._num_input_features = phi_length self._phi_length = phi_length self._img_width = width self._img_height = height self._discount = discount self.num_actions = num_actions self.learning_rate = learning_rate self.scale_input_by = 255.0 print "neural net initialization, lr is: ", self.learning_rate, approximator # CONSTRUCT THE LAYERS self.q_layers = [] self.q_layers.append( layers.Input2DLayer(self._batch_size, self._num_input_features, self._img_height, self._img_width, self.scale_input_by)) if approximator == 'cuda_conv': self.q_layers.append( cc_layers.ShuffleBC01ToC01BLayer(self.q_layers[-1])) self.q_layers.append( cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1], n_filters=16, filter_size=8, stride=4, weights_std=.01, init_bias_value=0.1)) self.q_layers.append( cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1], n_filters=32, filter_size=4, stride=2, weights_std=.01, init_bias_value=0.1)) self.q_layers.append( cc_layers.ShuffleC01BToBC01Layer(self.q_layers[-1])) elif approximator == 'conv': self.q_layers.append( layers.StridedConv2DLayer(self.q_layers[-1], n_filters=16, filter_width=8, filter_height=8, stride_x=4, stride_y=4, weights_std=.01, init_bias_value=0.01)) self.q_layers.append( layers.StridedConv2DLayer(self.q_layers[-1], n_filters=32, filter_width=4, filter_height=4, stride_x=2, stride_y=2, weights_std=.01, init_bias_value=0.01)) if approximator == 'cuda_conv' or approximator == 'conv': self.q_layers.append( layers.DenseLayer(self.q_layers[-1], n_outputs=256, weights_std=0.01, init_bias_value=0.1, dropout=0, nonlinearity=layers.rectify)) self.q_layers.append( layers.DenseLayer(self.q_layers[-1], n_outputs=num_actions, weights_std=0.01, init_bias_value=0.1, dropout=0, nonlinearity=layers.identity)) if approximator == 'none': self.q_layers.append(\ layers.DenseLayerNoBias(self.q_layers[-1], n_outputs=num_actions, weights_std=0.00, dropout=0, nonlinearity=layers.identity)) self.q_layers.append(layers.OutputLayer(self.q_layers[-1])) for i in range(len(self.q_layers) - 1): print self.q_layers[i].get_output_shape() # Now create a network (using the same weights) # for next state q values self.next_layers = copy_layers(self.q_layers) self.next_layers[0] = layers.Input2DLayer(self._batch_size, self._num_input_features, self._img_width, self._img_height, self.scale_input_by) self.next_layers[1].input_layer = self.next_layers[0] self.rewards = T.col() self.actions = T.icol() # Build the loss function ... print "building loss funtion" q_vals = self.q_layers[-1].predictions() next_q_vals = self.next_layers[-1].predictions() next_maxes = T.max(next_q_vals, axis=1, keepdims=True) target = self.rewards + discount * next_maxes target = theano.gradient.consider_constant(target) diff = target - q_vals # Zero out all entries for actions that were not chosen... 
mask = build_mask(T.zeros_like(diff), self.actions, 1.0) diff_masked = diff * mask error = T.mean(diff_masked**2) self._loss = error * diff_masked.shape[1] # self._parameters = layers.all_parameters(self.q_layers[-1]) self._idx = T.lscalar('idx') # CREATE VARIABLES FOR INPUT AND OUTPUT self.states_shared = theano.shared( np.zeros((1, 1, 1, 1), dtype=theano.config.floatX)) self.states_shared_next = theano.shared( np.zeros((1, 1, 1, 1), dtype=theano.config.floatX)) self.rewards_shared = theano.shared(np.zeros( (1, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared(np.zeros((1, 1), dtype='int32'), broadcastable=(False, True)) self._givens = \ {self.q_layers[0].input_var: self.states_shared[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :, :, :], self.next_layers[0].input_var: self.states_shared_next[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :, :, :], self.rewards: self.rewards_shared[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :], self.actions: self.actions_shared[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :] } self._updates = layers.gen_updates_rmsprop_and_nesterov_momentum(\ self._loss, self._parameters, learning_rate=self.learning_rate, rho=0.9, momentum=0.9, epsilon=1e-6) self._train = theano.function([self._idx], self._loss, givens=self._givens, updates=self._updates) self._compute_loss = theano.function([self._idx], self._loss, givens=self._givens) self._compute_q_vals = \ theano.function([self.q_layers[0].input_var], self.q_layers[-1].predictions(), on_unused_input='ignore')
def __init__(self, num_actions): # remember parameters self.num_actions = num_actions self.batch_size = BATCH_SIZE self.discount_rate = DISCOUNT_RATE self.history_length = HISTORY_LENGTH self.screen_dim = DIMS self.img_height = SCREEN_HEIGHT self.img_width = SCREEN_WIDTH self.clip_error = CLIP_ERROR self.input_color_scale = COLOR_SCALE self.target_steps = TARGET_STEPS self.train_iterations = TRAIN_STEPS self.train_counter = 0 self.momentum = MOMENTUM self.update_rule = UPDATE_RULE self.learning_rate = LEARNING_RATE self.rms_decay = RMS_DECAY self.rms_epsilon = RMS_EPSILON self.rng = np.random.RandomState(RANDOM_SEED) # set seed lasagne.random.set_rng(self.rng) # prepare tensors once and reuse them states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') # terminals are bool for our case terminals = T.bcol('terminals') # create shared theano variables self.states_shared = theano.shared( np.zeros((self.batch_size, self.history_length, self.img_height, self.img_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros((self.batch_size, self.history_length, self.img_height, self.img_width), dtype=theano.config.floatX)) # !broadcast ? self.rewards_shared = theano.shared( np.zeros((self.batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((self.batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( #np.zeros((self.batch_size, 1), dtype='int32'), np.zeros((self.batch_size, 1), dtype='int8'), broadcastable=(False, True)) # can add multiple nets here self.l_primary = self.build_network() if self.target_steps > 0: self.l_secondary = self.build_network() self.copy_to_secondary() """ # input scale i.e. division can be applied to input directly also to normalize """ # define output symbols q_vals = lasagne.layers.get_output(self.l_primary, states / self.input_color_scale) if self.target_steps > 0: q_vals_secondary = lasagne.layers.get_output(self.l_secondary, next_states / self.input_color_scale) else: # why this ? q_vals_secondary = lasagne.layers.get_output(self.l_primary, next_states / self.input_color_scale) q_vals_secondary = theano.gradient.disconnected_grad(q_vals_secondary) # target = r + max target = (rewards + (T.ones_like(terminals) - terminals) * self.discount_rate * T.max(q_vals_secondary, axis=1, keepdims=True)) """ # check what this does """ diff = target - q_vals[T.arange(self.batch_size), actions.reshape((-1,))].reshape((-1, 1)) # print shape ? if self.clip_error > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. 
quadratic_part = T.minimum(abs(diff), self.clip_error) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_error * linear_part else: loss = 0.5 * diff ** 2 loss = T.sum(loss) params = lasagne.layers.helper.get_all_params(self.l_primary) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } g_time = time.time() logger.info("graph compiling") if self.update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.learning_rate, self.rms_decay, self.rms_epsilon) elif self.update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.learning_rate, self.rms_decay, self.rms_epsilon) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared}) logger.info("Theano Graph Compiled !! %f", time.time() - g_time)
def __init__(self, num_actions, phi_length, width, height, discount, learning_rate, decay, momentum=0, batch_size=32, approximator='none'): self._batch_size = batch_size self._num_input_features = phi_length self._phi_length = phi_length self._img_width = width self._img_height = height self._discount = discount self.num_actions = num_actions self.learning_rate = learning_rate self.decay = decay self.momentum = momentum self.scale_input_by = 255.0 # CONSTRUCT THE LAYERS self.q_layers = [] self.q_layers.append(layers.Input2DLayer(self._batch_size, self._num_input_features, self._img_height, self._img_width, self.scale_input_by)) if approximator == 'cuda_conv': self.q_layers.append(cc_layers.ShuffleBC01ToC01BLayer( self.q_layers[-1])) self.q_layers.append( cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1], n_filters=16, filter_size=8, stride=4, weights_std=.01, init_bias_value=0.1)) self.q_layers.append( cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1], n_filters=32, filter_size=4, stride=2, weights_std=.01, init_bias_value=0.1)) self.q_layers.append(cc_layers.ShuffleC01BToBC01Layer( self.q_layers[-1])) elif approximator == 'conv': self.q_layers.append(layers.StridedConv2DLayer(self.q_layers[-1], n_filters=16, filter_width=8, filter_height=8, stride_x=4, stride_y=4, weights_std=.01, init_bias_value=0.01)) self.q_layers.append(layers.StridedConv2DLayer(self.q_layers[-1], n_filters=32, filter_width=4, filter_height=4, stride_x=2, stride_y=2, weights_std=.01, init_bias_value=0.01)) if approximator == 'cuda_conv' or approximator == 'conv': self.q_layers.append(layers.DenseLayer(self.q_layers[-1], n_outputs=256, weights_std=0.01, init_bias_value=0.1, dropout=0, nonlinearity=layers.rectify)) self.q_layers.append( layers.DenseLayer(self.q_layers[-1], n_outputs=num_actions, weights_std=0.01, init_bias_value=0.1, dropout=0, nonlinearity=layers.identity)) if approximator == 'none': self.q_layers.append(\ layers.DenseLayerNoBias(self.q_layers[-1], n_outputs=num_actions, weights_std=0.00, dropout=0, nonlinearity=layers.identity)) self.q_layers.append(layers.OutputLayer(self.q_layers[-1])) for i in range(len(self.q_layers)-1): print self.q_layers[i].get_output_shape() # Now create a network (using the same weights) # for next state q values self.next_layers = copy_layers(self.q_layers) self.next_layers[0] = layers.Input2DLayer(self._batch_size, self._num_input_features, self._img_width, self._img_height, self.scale_input_by) self.next_layers[1].input_layer = self.next_layers[0] self.rewards = T.col() self.actions = T.icol() # Build the loss function ... q_vals = self.q_layers[-1].predictions() next_q_vals = self.next_layers[-1].predictions() next_maxes = T.max(next_q_vals, axis=1, keepdims=True) target = self.rewards + discount * next_maxes target = theano.gradient.consider_constant(target) diff = target - q_vals # Zero out all entries for actions that were not chosen... 
mask = build_mask(T.zeros_like(diff), self.actions, 1.0) diff_masked = diff * mask error = T.mean(diff_masked ** 2) self._loss = error * diff_masked.shape[1] # self._parameters = layers.all_parameters(self.q_layers[-1]) self._idx = T.lscalar('idx') # CREATE VARIABLES FOR INPUT AND OUTPUT self.states_shared = theano.shared( np.zeros((1, 1, 1, 1), dtype=theano.config.floatX)) self.states_shared_next = theano.shared( np.zeros((1, 1, 1, 1), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((1, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((1, 1), dtype='int32'), broadcastable=(False, True)) self._givens = \ {self.q_layers[0].input_var: self.states_shared[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :, :, :], self.next_layers[0].input_var: self.states_shared_next[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :, :, :], self.rewards: self.rewards_shared[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :], self.actions: self.actions_shared[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :] } if self.momentum != 0: self._updates = layers.gen_updates_rmsprop_and_nesterov_momentum(\ self._loss, self._parameters, learning_rate=self.learning_rate, rho=self.decay, momentum=self.momentum, epsilon=1e-6) else: self._updates = layers.gen_updates_rmsprop(self._loss, self._parameters, learning_rate=self.learning_rate, rho=self.decay, epsilon=1e-6) self._train = theano.function([self._idx], self._loss, givens=self._givens, updates=self._updates) self._compute_loss = theano.function([self._idx], self._loss, givens=self._givens) self._compute_q_vals = \ theano.function([self.q_layers[0].input_var], self.q_layers[-1].predictions(), on_unused_input='ignore')
def __init__(self, batchSize, numFrames, inputHeight, inputWidth, numActions, discountRate, learningRate, rho, rms_epsilon, momentum, networkUpdateDelay, useSARSAUpdate, kReturnLength, networkType="conv", updateRule="deepmind_rmsprop", batchAccumulator="sum", clipDelta=1.0, inputScale=255.0): self.batchSize = batchSize self.numFrames = numFrames self.inputWidth = inputWidth self.inputHeight = inputHeight self.inputScale = inputScale self.numActions = numActions self.discountRate = discountRate self.learningRate = learningRate self.rho = rho self.rms_epsilon = rms_epsilon self.momentum = momentum self.networkUpdateDelay = networkUpdateDelay self.useSARSAUpdate = useSARSAUpdate self.kReturnLength = kReturnLength self.networkType = networkType self.updateRule = updateRule self.batchAccumulator = batchAccumulator self.clipDelta = clipDelta self.updateCounter = 0 states = T.tensor4("states") nextStates = T.tensor4("nextStates") rewards = T.col("rewards") actions = T.icol("actions") nextActions = T.icol("nextActions") terminals = T.icol("terminals") self.statesShared = theano.shared( np.zeros((self.batchSize, self.numFrames, self.inputHeight, self.inputWidth), dtype=theano.config.floatX)) self.nextStatesShared = theano.shared( np.zeros((self.batchSize, self.numFrames, self.inputHeight, self.inputWidth), dtype=theano.config.floatX)) self.rewardsShared = theano.shared(np.zeros( (self.batchSize, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actionsShared = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True)) self.nextActionsShared = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True)) self.terminalsShared = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True)) self.qValueNetwork = DeepNetworks.buildDeepQNetwork( self.batchSize, self.numFrames, self.inputHeight, self.inputWidth, self.numActions, self.networkType) qValues = lasagne.layers.get_output(self.qValueNetwork, states / self.inputScale) if self.networkUpdateDelay > 0: self.nextQValueNetwork = DeepNetworks.buildDeepQNetwork( self.batchSize, self.numFrames, self.inputHeight, self.inputWidth, self.numActions, self.networkType) self.resetNextQValueNetwork() nextQValues = lasagne.layers.get_output( self.nextQValueNetwork, nextStates / self.inputScale) else: nextQValues = lasagne.layers.get_output( self.qValueNetwork, nextStates / self.inputScale) nextQValues = theano.gradient.disconnected_grad(nextQValues) if self.useSARSAUpdate: target = rewards + terminals * ( self.discountRate** self.kReturnLength) * nextQValues[T.arange(self.batchSize), nextActions.reshape( (-1, ))].reshape((-1, 1)) else: target = rewards + terminals * ( self.discountRate**self.kReturnLength) * T.max( nextQValues, axis=1, keepdims=True) targetDifference = target - qValues[T.arange(self.batchSize), actions.reshape((-1, ))].reshape( (-1, 1)) quadraticPart = T.minimum(abs(targetDifference), self.clipDelta) linearPart = abs(targetDifference) - quadraticPart # if self.clipDelta > 0: # targetDifference = targetDifference.clip(-1.0 * self.clipDelta, self.clipDelta) if self.batchAccumulator == "sum": # loss = T.sum(targetDifference ** 2) loss = T.sum(0.5 * quadraticPart**2 + self.clipDelta * linearPart) elif self.batchAccumulator == "mean": # loss = T.mean(targetDifference ** 2) loss = T.mean(0.5 * quadraticPart**2 + self.clipDelta * linearPart) else: raise ValueError("Bad Network Accumulator. 
{sum, mean} expected") networkParameters = lasagne.layers.helper.get_all_params( self.qValueNetwork) if self.updateRule == "deepmind_rmsprop": updates = DeepNetworks.deepmind_rmsprop(loss, networkParameters, self.learningRate, self.rho, self.rms_epsilon) elif self.updateRule == "rmsprop": updates = lasagne.updates.rmsprop(loss, networkParameters, self.learningRate, self.rho, self.rms_epsilon) elif self.updateRule == "sgd": updates = lasagne.updates.sgd(loss, networkParameters, self.learningRate) else: raise ValueError( "Bad update rule. {deepmind_rmsprop, rmsprop, sgd} expected") if self.momentum > 0: updates.lasagne.updates.apply_momentum(updates, None, self.momentum) lossGivens = { states: self.statesShared, nextStates: self.nextStatesShared, rewards: self.rewardsShared, actions: self.actionsShared, nextActions: self.nextActionsShared, terminals: self.terminalsShared } self.__trainNetwork = theano.function([], [loss, qValues], updates=updates, givens=lossGivens, on_unused_input='warn') self.__computeQValues = theano.function( [], qValues, givens={states: self.statesShared})
def initialize_network(self): """ :description: this method initializes the network, updates, and theano functions for training and retrieving q values. Here's an outline: 1. build the q network and target q network 2. initialize theano symbolic variables used for compiling functions 3. initialize the theano numeric variables used as input to functions 4. formulate the symbolic loss 5. formulate the symbolic updates 6. compile theano functions for training and for getting q_values """ batch_size, input_shape = self.batch_size, self.input_shape lasagne.random.set_rng(self.rng) # 1. build the q network and target q network self.l_out = self.build_network(input_shape, self.num_actions, batch_size) self.next_l_out = self.build_network(input_shape, self.num_actions, batch_size) self.reset_target_network() # 2. initialize theano symbolic variables used for compiling functions states = T.tensor4('states') actions = T.icol('actions') rewards = T.col('rewards') next_states = T.tensor4('next_states') # terminals are used to indicate a terminal state in the episode and hence a mask over the future # q values i.e., Q(s',a') terminals = T.icol('terminals') # 3. initialize the theano numeric variables used as input to functions self.states_shape = (batch_size,) + (1,) + input_shape self.states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX)) self.next_states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX)) self.rewards_shared = theano.shared(np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # 4. formulate the symbolic loss q_vals = lasagne.layers.get_output(self.l_out, states) next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) # reshape((-1,)) == 'make a row vector', reshape((-1, 1) == 'make a column vector' diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) # a lot of the deepmind work clips the td error at 1 so we do that here # the problem is that gradient backpropagating through this minimum node # will be zero if diff is larger then 1.0 (because changing params before # the minimum does not impact the output of the minimum). To account for # this we take the part of the td error (magnitude) greater than 1.0 and simply # add it to the loss, which allows gradient to backprop but just linearly # in the td error rather than quadratically quadratic_part = T.minimum(abs(diff), 1.0) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + linear_part loss = T.mean(loss) + self.regularization * regularize_network_params(self.l_out, l2) # 5. formulate the symbolic updates params = lasagne.layers.helper.get_all_params(self.l_out) updates = self.initialize_updates(self.update_rule, loss, params, self.learning_rate) # 6. compile theano functions for training and for getting q_values givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._get_q_values = theano.function([], q_vals, givens={states: self.states_shared})
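# reset_target_network() is called in step 1 of initialize_network above but is
# not shown. In Lasagne-based DQN code it usually just copies the online
# network's weights into the target network; a sketch of that convention,
# assuming self.l_out and self.next_l_out have identical architectures:
def reset_target_network_sketch(self):
    import lasagne
    all_params = lasagne.layers.helper.get_all_param_values(self.l_out)
    lasagne.layers.helper.set_all_param_values(self.next_l_out, all_params)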
def __init__(self, model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_): super(QPropKeras, self).__init__(model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_) ## primary network self._model = model ## Target network for DPG self._modelTarget = copy.deepcopy(model) ## Target network for PPO self._modelTarget2 = copy.deepcopy(model) # self._modelTarget = model self._learning_rate = self.getSettings()['learning_rate'] self._discount_factor = self.getSettings()['discount_factor'] self._rho = self.getSettings()['rho'] self._rms_epsilon = self.getSettings()['rms_epsilon'] self._q_valsActA = self._model.getActorNetwork()( self._model._stateInput)[:, :self._action_length] self._q_valsActASTD = self._model.getActorNetwork()( self._model._stateInput)[:, self._action_length:] self._q_valsActTarget_State = self._modelTarget2.getActorNetwork()( self._model._stateInput)[:, :self._action_length] # self._q_valsActTarget_State = self._modelTarget.getActorNetwork()(self._model._stateInput)[:,:self._action_length] # self._q_valsActTargetSTD = self._modelTarget.getActorNetwork()(self._model._stateInput)[:,self._action_length:] self._q_valsActASTD = (T.ones_like( self._q_valsActA)) * self.getSettings()['exploration_rate'] self._q_valsActTargetSTD = (T.ones_like(self._q_valsActTarget_State) ) * self.getSettings()['exploration_rate'] self._Advantage = T.col("Advantage") self._Advantage.tag.test_value = np.zeros( (self._batch_size, 1), dtype=np.dtype(self.getSettings()['float_type'])) self._advantage_shared = theano.shared(np.zeros( (self._batch_size, 1), dtype=self.getSettings()['float_type']), broadcastable=(False, True)) self._LEARNING_PHASE = T.scalar( dtype='uint8', name='keras_learning_phase') # 0 = test, 1 = train self._QProp_N = T.col("QProp_N") self._QProp_N.tag.test_value = np.zeros( (self._batch_size, 1), dtype=np.dtype(self.getSettings()['float_type'])) self._QProp_N_shared = theano.shared(np.zeros( (self._batch_size, 1), dtype=self.getSettings()['float_type']), broadcastable=(False, True)) self._q_function = self._model.getCriticNetwork()( [self._model._stateInput, self._q_valsActA]) self._q_function_Target = self._model.getCriticNetwork()( [self._model._stateInput, self._model._actionInput]) # self._value = self._model.getCriticNetwork()([self._model._stateInput, K.learning_phase()]) self._value_Target = self._modelTarget2.getValueFunction()( [self._model._stateInput]) self._value = self._model.getValueFunction()([self._model._stateInput]) # self._value = self._model.getCriticNetwork()([self._model._stateInput]) self._actor_entropy = 0.5 * T.mean((2 * np.pi * self._q_valsActASTD)) ## Compute on-policy policy gradient self._prob = likelihood(self._model._actionInput, self._q_valsActA, self._q_valsActASTD, self._action_length) ### How should this work if the target network is very odd, as in not a slightly outdated copy. 
self._prob_target = likelihood(self._model._actionInput, self._q_valsActTarget_State, self._q_valsActTargetSTD, self._action_length) ## This does the sum already self._r = (self._prob / self._prob_target) self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)( (self._r), self._Advantage) ppo_epsilon = self.getSettings()['kl_divergence_threshold'] self._actLoss_2 = theano.tensor.elemwise.Elemwise(theano.scalar.mul)( theano.tensor.clip(self._r, 1.0 - ppo_epsilon, 1 + ppo_epsilon), self._Advantage) self._actLoss_ = theano.tensor.minimum((self._actLoss_), (self._actLoss_2)) # self._actLoss = ((T.mean(self._actLoss_) )) + -self._actor_regularization # self._actLoss = (-1.0 * (T.mean(self._actLoss_) + (self.getSettings()['std_entropy_weight'] * self._actor_entropy ))) self._actLoss = -1.0 * (T.mean(self._actLoss_) + T.mean(self._QProp_N * self._q_function)) self._actLoss_PPO = -1.0 * (T.mean(self._actLoss_)) # self._policy_grad = T.grad(self._actLoss , self._actionParams) QPropKeras.compile(self)
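# Illustrative sketch (plain NumPy, not from the original file): the surrogate assembled
# above is the PPO clipped objective, min(r * A, clip(r, 1 - eps, 1 + eps) * A), where r is
# the likelihood ratio between the current and target policies and A is the advantage.
# A tiny numeric version with made-up values, assuming eps = 0.2:
import numpy as np

def ppo_clipped_surrogate(ratio, advantage, eps=0.2):
    unclipped = ratio * advantage
    clipped = np.clip(ratio, 1.0 - eps, 1.0 + eps) * advantage
    # The elementwise minimum removes any incentive to push the ratio outside [1-eps, 1+eps].
    return np.minimum(unclipped, clipped)

ratio = np.array([0.5, 1.0, 1.5])
advantage = np.array([1.0, 1.0, -1.0])
print(ppo_clipped_surrogate(ratio, advantage))  # -> 0.5, 1.0, -1.5
# The actor loss above is the negative mean of this quantity plus the Q-Prop control
# variate term, so minimizing the loss ascends the clipped surrogate.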
def __init__(self, mean_doc_size, input, input_sums, n_visible=784, n_hidden=500, W = None, hbias = None, vbias = None, numpy_rng = None, theano_rng = None): """ RBM constructor. Defines the parameters of the model along with basic operations for inferring hidden from visible (and vice-versa), as well as for performing CD updates. :param input: None for standalone RBMs or symbolic variable if RBM is part of a larger graph. :param n_visible: number of visible units :param n_hidden: number of hidden units :param W: None for standalone RBMs or symbolic variable pointing to a shared weight matrix in case RBM is part of a DBN network; in a DBN, the weights are shared between RBMs and layers of a MLP :param hbias: None for standalone RBMs or symbolic variable pointing to a shared hidden units bias vector in case RBM is part of a different network :param vbias: None for standalone RBMs or a symbolic variable pointing to a shared visible units bias """ self.mean_doc_size = mean_doc_size self.n_visible = n_visible self.n_hidden = n_hidden if numpy_rng is None: # create a number generator numpy_rng = numpy.random.RandomState(1234) if theano_rng is None : theano_rng = RandomStreams(numpy_rng.randint(2**30)) if W is None : # W is initialized with `initial_W` which is uniformly sampled # from -4*sqrt(6./(n_visible+n_hidden)) and 4*sqrt(6./(n_hidden+n_visible)) # the output of uniform is converted using asarray to dtype # theano.config.floatX so that the code is runnable on GPU initial_W = numpy.asarray( numpy_rng.uniform( low = -4.*numpy.sqrt(6./(n_hidden+n_visible)), high = 4.*numpy.sqrt(6./(n_hidden+n_visible)), size = (n_visible, n_hidden)), dtype = theano.config.floatX) initial_W *= 1/self.mean_doc_size # theano shared variables for weights and biases W = theano.shared(value = initial_W, name = 'W') if hbias is None : # create shared variable for hidden units bias hbias = theano.shared(value = numpy.zeros(n_hidden, dtype = theano.config.floatX), name='hbias') if vbias is None : # create shared variable for visible units bias vbias = theano.shared(value =numpy.zeros(n_visible, dtype = theano.config.floatX),name='vbias') # initialize input layer for standalone RBM or layer0 of DBN self.input = input self.input_sums = input_sums if not input: self.input = T.matrix('input') self.input_sums = T.col('input_sums') self.binomial_approx_val = theano.shared(value = float(100000), name = 'binomial_approx_val') self.W = W self.hbias = hbias self.vbias = vbias self.theano_rng = theano_rng # **** WARNING: It is not a good idea to put things in this list # other than shared variables created in this function. self.params = [self.W, self.hbias, self.vbias]
def col(name): return T.col(name)
from __future__ import print_function __author__ = 'frankhe' import theano.tensor as T from theano.compile import function import numpy as np num_actions = 3 batch_size = None q_s = T.matrix() a = T.col() # batch_size = q_s.shape[0] # out = q_s[range(batch_size), a.reshape(batch_size)] mask = T.eq(T.arange(num_actions).reshape((1, -1)), a.reshape((-1, 1))) out_t = q_s * mask out = T.sum(out_t, axis=1, keepdims=True) f = function([q_s, a], out) q_s_ = np.random.rand(5, num_actions) a_ = np.array([1, 0, 2, 1, 2]).reshape((5, 1)) print(f(q_s_, a_))
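# Follow-up note (plain NumPy, continuing the toy example above; not in the original
# script): the mask trick is useful because the batch size is symbolic (None), but with
# concrete arrays the same per-row Q(s, a) selection can be checked by direct indexing:
q_direct = q_s_[np.arange(q_s_.shape[0]), a_.reshape(-1)].reshape(-1, 1)
mask_np = np.arange(num_actions).reshape(1, -1) == a_.reshape(-1, 1)
q_masked = (q_s_ * mask_np).sum(axis=1, keepdims=True)
print(np.allclose(q_direct, q_masked))  # True: both pick exactly one Q-value per row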
def __init__(self, state_length, action_length, state_bounds, action_bounds, settings_): super(ForwardDynamicsDenseNetworkDropoutTesting,self).__init__(state_length, action_length, state_bounds, action_bounds, 0, settings_) batch_size=32 # data types for model self._State = T.matrix("State") self._State.tag.test_value = np.random.rand(batch_size,self._state_length) self._ResultState = T.matrix("ResultState") self._ResultState.tag.test_value = np.random.rand(batch_size,self._state_length) self._Reward = T.col("Reward") self._Reward.tag.test_value = np.random.rand(self._batch_size,1) self._Action = T.matrix("Action") self._Action.tag.test_value = np.random.rand(batch_size, self._action_length) # create a small convolutional neural network input = lasagne.layers.InputLayer((None, self._state_length), self._State) self._stateInputVar = input.input_var actionInput = lasagne.layers.InputLayer((None, self._action_length), self._Action) self._actionInputVar = actionInput.input_var insert_action_later = True double_insert_action = False add_layers_after_action = False if (not insert_action_later or (double_insert_action)): input = lasagne.layers.ConcatLayer([input, actionInput]) ## Activation types # activation_type = elu_mine # activation_type=lasagne.nonlinearities.tanh activation_type=lasagne.nonlinearities.leaky_rectify # activation_type=lasagne.nonlinearities.rectify # network = lasagne.layers.DropoutLayer(input, p=self._dropout_p, rescale=True) """ network = lasagne.layers.DenseLayer( input, num_units=128, nonlinearity=activation_type) network = weight_norm(network) network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) layersAct = [network] network = lasagne.layers.DenseLayer( network, num_units=64, nonlinearity=activation_type) network = weight_norm(network) network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) layersAct.append(network) network = lasagne.layers.ConcatLayer([layersAct[1], layersAct[0]]) network = lasagne.layers.DenseLayer( network, num_units=32, nonlinearity=activation_type) network = weight_norm(network) network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) layersAct.append(network) network = lasagne.layers.ConcatLayer([layersAct[2], layersAct[1], layersAct[0]]) ## This can be used to model the reward function self._reward_net = lasagne.layers.DenseLayer( network, num_units=1, nonlinearity=lasagne.nonlinearities.linear) # print ("Initial W " + str(self._w_o.get_value()) ) """ network = lasagne.layers.DenseLayer( input, num_units=128, nonlinearity=activation_type) # network = weight_norm(network) network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) # layersAct = [network] if ( insert_action_later ): ### Lets try adding the action input later on in the network if ( add_layers_after_action ): networkA = lasagne.layers.DenseLayer( actionInput, num_units=32, nonlinearity=activation_type) network = lasagne.layers.ConcatLayer([network, networkA]) else: network = lasagne.layers.ConcatLayer([network, actionInput]) network = lasagne.layers.DenseLayer( network, num_units=64, nonlinearity=activation_type) # network = weight_norm(network) network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) # layersAct.append(network) # network = lasagne.layers.ConcatLayer([layersAct[1], layersAct[0]]) network = lasagne.layers.DenseLayer( network, num_units=32, nonlinearity=activation_type) # network = weight_norm(network) network = lasagne.layers.DropoutLayer(network, 
p=self._dropout_p, rescale=True) # layersAct.append(network) # network = lasagne.layers.ConcatLayer([layersAct[2], layersAct[1], layersAct[0]]) # network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) network = lasagne.layers.DenseLayer( network, num_units=8, nonlinearity=activation_type) network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) """ network = lasagne.layers.DenseLayer( network, num_units=8, nonlinearity=activation_type) """ ## This can be used to model the reward function self._reward_net = lasagne.layers.DenseLayer( network, num_units=1, nonlinearity=lasagne.nonlinearities.linear) # print ("Initial W " + str(self._w_o.get_value()) ) # networkAct = lasagne.layers.DropoutLayer(input, p=self._dropout_p, rescale=True) networkAct = lasagne.layers.DenseLayer( input, num_units=256, nonlinearity=activation_type) networkAct = weight_norm(networkAct) networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True) layersAct = [networkAct] networkAct = lasagne.layers.DenseLayer( networkAct, num_units=128, nonlinearity=activation_type) networkAct = weight_norm(networkAct) networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True) if ( insert_action_later ): ### Lets try adding the action input later on in the network if ( add_layers_after_action ): networkActA = lasagne.layers.DenseLayer( actionInput, num_units=64, nonlinearity=activation_type) networkAct = lasagne.layers.ConcatLayer([networkAct, networkActA]) else: networkAct = lasagne.layers.ConcatLayer([networkAct, actionInput]) layersAct.append(networkAct) networkAct = lasagne.layers.ConcatLayer([layersAct[1], layersAct[0]]) networkAct = lasagne.layers.DenseLayer( networkAct, num_units=128, nonlinearity=activation_type) networkAct = weight_norm(networkAct) networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True) layersAct.append(networkAct) networkAct = lasagne.layers.ConcatLayer([layersAct[2], layersAct[1], layersAct[0]]) self._forward_dynamics_net = lasagne.layers.DenseLayer( networkAct, num_units=self._state_length, nonlinearity=lasagne.nonlinearities.linear) # print ("Initial W " + str(self._w_o.get_value()) ) if (('use_stochastic_forward_dynamics' in self._settings) and self._settings['use_stochastic_forward_dynamics']): with_std = lasagne.layers.DenseLayer( networkAct, num_units=self._state_length, nonlinearity=theano.tensor.nnet.softplus) self._forward_dynamics_net = lasagne.layers.ConcatLayer([self._forward_dynamics_net, with_std], axis=1) self._states_shared = theano.shared( np.zeros((batch_size, self._state_length), dtype=theano.config.floatX)) self._next_states_shared = theano.shared( np.zeros((batch_size, self._state_length), dtype=theano.config.floatX)) self._actions_shared = theano.shared( np.zeros((batch_size, self._action_length), dtype=theano.config.floatX), ) self._rewards_shared = theano.shared( np.zeros((self._batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True))
def __init__(self, environment, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batchSize, network_type, update_rule, batch_accumulator, randomState, frame_scale=255.0): """ Initialize environment Arguments: environment - the environment (class Env) num_elements_in_batch - list of k integers for the number of each element kept as belief state num_actions - int discount - float learning_rate - float rho, rms_epsilon, momentum - float, float, float ... network_type - string ... """ self._environment = environment self._batchSize = batchSize self._inputDimensions = self._environment.inputDimensions() self._nActions = self._environment.nActions() self._df = 0 self.rho = rho self._lr = 0 self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self._randomState = randomState lasagne.random.set_rng(self._randomState) self.update_counter = 0 states=[] # list of symbolic variables for each of the k elements in the belief state # --> [ T.tensor4 if observation of element=matrix, T.tensor3 if vector, T.matrix if scalar ] next_states=[] # same as states, but at t+1 self.states_shared=[] # list of shared variables for each of the k elements in the belief state self.next_states_shared=[] # same as self.states_shared, but at t+1 for i, dim in enumerate(self._inputDimensions): if len(dim) == 3: states.append(T.tensor4("%s_%s" % ("state", i))) next_states.append(T.tensor4("%s_%s" % ("next_state", i))) elif len(dim) == 2: states.append(T.tensor3("%s_%s" % ("state", i))) next_states.append(T.tensor3("%s_%s" % ("next_state", i))) elif len(dim) == 1: states.append( T.matrix("%s_%s" % ("state", i)) ) next_states.append( T.matrix("%s_%s" % ("next_state", i)) ) self.states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX) , borrow=False)) self.next_states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX) , borrow=False)) print("Number of observations per state: {}".format(len(self.states_shared))) print("For each observation, historySize + ponctualObs_i.shape: {}".format(self._inputDimensions)) rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX) thelr = T.scalar(name='thelr', dtype=theano.config.floatX) self.l_out, self.l_outs_conv, shape_after_conv = self._build(network_type, states) print("Number of neurons after spatial and temporal convolution layers: {}".format(shape_after_conv)) self.next_l_out, self.next_l_outs_conv, shape_after_conv = self._build(network_type, next_states) self._resetQHat() self.rewards_shared = theano.shared( np.zeros((batchSize, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batchSize, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batchSize, 1), dtype='int32'), broadcastable=(False, True)) q_vals = lasagne.layers.get_output(self.l_out) next_q_vals = lasagne.layers.get_output(self.next_l_out) max_next_q_vals=T.max(next_q_vals, axis=1, keepdims=True) not_terminals=T.ones_like(terminals) - terminals target = rewards + not_terminals * thediscount * max_next_q_vals q_val=q_vals[T.arange(batchSize), actions.reshape((-1,))].reshape((-1, 1)) diff = target - q_val if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds.
To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) for conv_param in self.l_outs_conv: for p in lasagne.layers.helper.get_all_params(conv_param): params.append(p) givens = { rewards: self.rewards_shared, actions: self.actions_shared, ## actions not needed! terminals: self.terminals_shared } for i, x in enumerate(self.states_shared): givens[ states[i] ] = x for i, x in enumerate(self.next_states_shared): givens[ next_states[i] ] = x if update_rule == 'deepmind_rmsprop': grads = get_or_compute_grads(loss, params) updates = deepmind_rmsprop(loss, params, grads, thelr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, thelr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, thelr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([thediscount, thelr], [loss, q_vals], updates=updates, givens=givens, on_unused_input='warn') givens2={} for i, x in enumerate(self.states_shared): givens2[ states[i] ] = x self._q_vals = theano.function([], q_vals, givens=givens2, on_unused_input='warn')
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng # print "NETWORK---------------------------" # print "input width ", self.input_width # print "input height", self.input_height # print "num actiuons", self.num_actions # print "num frames", self.num_frames # print "batch size", self.batch_size # print "discount", self.discount # print "rho", self.rho # print "lr", self.lr # print "rms_epsilon", self.rms_epsilon # print "momentum", self.momentum # print "clip_delta", self.clip_delta # print "freeze_ intercal", self.freeze_interval # print "rng", self.rng lasagne.random.set_rng(self.rng) self.update_counter = 0 self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') # Shared variables for training from a minibatch of replayed state transitions, # each consisting of num_frames + 1 (due to overlap) images, along with # the chosen action and resulting reward and termnial status. self.imgs_shared = theano.shared( np.zeros((batch_size, num_frames + 1, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # Shared variable for a single state, to calculate q_vals self.state_shared = theano.shared( np.zeros((num_frames, input_height, input_width), dtype=theano.config.floatX)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) terminalsX = terminals.astype(theano.config.floatX) actionmask = T.eq(T.arange(num_actions).reshape((1, -1)), actions.reshape((-1, 1))).astype(theano.config.floatX) target = (rewards + (T.ones_like(terminalsX) - terminalsX) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1)) diff = target - output if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. 
# # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) train_givens = { states: self.imgs_shared[:, :-1], next_states: self.imgs_shared[:, 1:], rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss], updates=updates, givens=train_givens) q_givens = { states: self.state_shared.reshape((1, self.num_frames, self.input_height, self.input_width)) } self._q_vals = theano.function([], q_vals[0], givens=q_givens)
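# Illustrative sketch (plain NumPy, toy sizes; not part of the original class): the single
# imgs_shared buffer above stores num_frames + 1 stacked images per transition, so the
# state and the next state used in train_givens are just two overlapping slices of the
# same array and no second copy of the frames is needed.
import numpy as np

batch_size, num_frames, height, width = 2, 4, 3, 3
imgs = np.arange(batch_size * (num_frames + 1) * height * width,
                 dtype=np.float32).reshape(batch_size, num_frames + 1, height, width)

states = imgs[:, :-1]       # frames 0 .. num_frames - 1
next_states = imgs[:, 1:]   # frames 1 .. num_frames, shifted by one step
print(states.shape, next_states.shape)                      # (2, 4, 3, 3) (2, 4, 3, 3)
print(np.array_equal(states[:, 1:], next_states[:, :-1]))   # True: the slices overlap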
def __init__(self, input_width, input_height, num_channels, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, network_params, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_channels = num_channels self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng self.lstm = None self.next_lstm = None logging.debug('network parameters', network_params) self.network_params = network_params lasagne.random.set_rng(self.rng) self.update_counter = 0 networks = self.build_network(network_type, num_channels, input_width, input_height, num_actions, num_frames, None) if isinstance(networks, tuple): self.l_out = networks[0] self.lstm = networks[1] else: self.l_out = networks # theano.compile.function_dump('network.dump', self.l_out) if self.freeze_interval > 0: next_networks = self.build_network(network_type, num_channels, input_width, input_height, num_actions, num_frames, None) if isinstance(next_networks, tuple): self.next_l_out = next_networks[0] self.next_lstm = next_networks[1] else: self.next_l_out = next_networks self.reset_q_hat() # This really really needs to be floats for now. # It makes sense if they use it for computations btensor5 = T.TensorType(theano.config.floatX, (False,) * 5) states = btensor5('states') next_states = btensor5('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') # Apparently needed for some layers with a variable input size # Weird, because the others just allow a None batch size, # but let's just play safe for now # For now, it should always look exactly like states # (n_batch, n_time_steps) # mask = T.imatrix('mask') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, num_channels, input_height, input_width), dtype=theano.config.floatX), name='states') self.next_states_shared = theano.shared( np.zeros((batch_size, num_frames, num_channels, input_height, input_width), dtype=theano.config.floatX), name='next_states') self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True), name='rewards') self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True), name='actions') self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # self.mask_shared = theano.shared(np.ones((batch_size, num_frames), # dtype='int32')) # lstmout = lasagne.layers.get_output(self.lstm, states / input_scale) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) # mask_input=mask) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale ) # mask_input=mask) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale ) # mask_input=mask) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) diff = target - q_vals[T.arange(target.shape[0]), actions.reshape((-1,))].reshape((-1, 1)) if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the 
gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) # print params givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': update_for = lambda params: deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': update_for = lambda params: lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': update_for = lambda params: lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) updates = update_for(params) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) # # Super mega shady stuff # # Somehow an update sneaks in for cell and hid. Kill it with fire if self.lstm: delete_keys = [k for k, v in updates.items() if k.name in ['cell', 'hid']] # print delete_keys for key in delete_keys: del updates[key] self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
def __init__(self, batchSize, numFrames, inputHeight, inputWidth, numActions, discountRate, learningRate, rho, rms_epsilon, momentum, networkUpdateDelay, useSARSAUpdate, kReturnLength, networkType = "conv", updateRule = "deepmind_rmsprop", batchAccumulator = "sum", clipDelta = 1.0, inputScale = 255.0): self.batchSize = batchSize self.numFrames = numFrames self.inputWidth = inputWidth self.inputHeight = inputHeight self.inputScale = inputScale self.numActions = numActions self.discountRate = discountRate self.learningRate = learningRate self.rho = rho self.rms_epsilon = rms_epsilon self.momentum = momentum self.networkUpdateDelay = networkUpdateDelay self.useSARSAUpdate = useSARSAUpdate self.kReturnLength = kReturnLength self.networkType = networkType self.updateRule = updateRule self.batchAccumulator = batchAccumulator self.clipDelta = clipDelta self.updateCounter = 0 states = T.tensor4("states") nextStates = T.tensor4("nextStates") rewards = T.col("rewards") actions = T.icol("actions") nextActions= T.icol("nextActions") terminals = T.icol("terminals") self.statesShared = theano.shared(np.zeros((self.batchSize, self.numFrames, self.inputHeight, self.inputWidth), dtype=theano.config.floatX)) self.nextStatesShared = theano.shared(np.zeros((self.batchSize, self.numFrames, self.inputHeight, self.inputWidth), dtype=theano.config.floatX)) self.rewardsShared = theano.shared(np.zeros((self.batchSize, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actionsShared = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True)) self.nextActionsShared = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True)) self.terminalsShared = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True)) self.qValueNetwork = DeepNetworks.buildDeepQNetwork( self.batchSize, self.numFrames, self.inputHeight, self.inputWidth, self.numActions, self.networkType) qValues = lasagne.layers.get_output(self.qValueNetwork, states / self.inputScale) if self.networkUpdateDelay > 0: self.nextQValueNetwork = DeepNetworks.buildDeepQNetwork( self.batchSize, self.numFrames, self.inputHeight, self.inputWidth, self.numActions, self.networkType) self.resetNextQValueNetwork() nextQValues = lasagne.layers.get_output(self.nextQValueNetwork, nextStates / self.inputScale) else: nextQValues = lasagne.layers.get_output(self.qValueNetwork, nextStates / self.inputScale) nextQValues = theano.gradient.disconnected_grad(nextQValues) if self.useSARSAUpdate: target = rewards + terminals * (self.discountRate ** self.kReturnLength) * nextQValues[T.arange(self.batchSize), nextActions.reshape((-1,))].reshape((-1, 1)) else: target = rewards + terminals * (self.discountRate ** self.kReturnLength) * T.max(nextQValues, axis = 1, keepdims = True) targetDifference = target - qValues[T.arange(self.batchSize), actions.reshape((-1,))].reshape((-1, 1)) quadraticPart = T.minimum(abs(targetDifference), self.clipDelta) linearPart = abs(targetDifference) - quadraticPart # if self.clipDelta > 0: # targetDifference = targetDifference.clip(-1.0 * self.clipDelta, self.clipDelta) if self.batchAccumulator == "sum": # loss = T.sum(targetDifference ** 2) loss = T.sum(0.5 * quadraticPart ** 2 + self.clipDelta * linearPart) elif self.batchAccumulator == "mean": # loss = T.mean(targetDifference ** 2) loss = T.mean(0.5 * quadraticPart ** 2 + self.clipDelta * linearPart) else: raise ValueError("Bad Network Accumulator. 
{sum, mean} expected") networkParameters = lasagne.layers.helper.get_all_params(self.qValueNetwork) if self.updateRule == "deepmind_rmsprop": updates = DeepNetworks.deepmind_rmsprop(loss, networkParameters, self.learningRate, self.rho, self.rms_epsilon) elif self.updateRule == "rmsprop": updates = lasagne.updates.rmsprop(loss, networkParameters, self.learningRate, self.rho, self.rms_epsilon) elif self.updateRule == "sgd": updates = lasagne.updates.sgd(loss, networkParameters, self.learningRate) else: raise ValueError("Bad update rule. {deepmind_rmsprop, rmsprop, sgd} expected") if self.momentum > 0: updates.lasagne.updates.apply_momentum(updates, None, self.momentum) lossGivens = { states: self.statesShared, nextStates: self.nextStatesShared, rewards:self.rewardsShared, actions: self.actionsShared, nextActions: self.nextActionsShared, terminals: self.terminalsShared } self.__trainNetwork = theano.function([], [loss, qValues], updates=updates, givens=lossGivens, on_unused_input='warn') self.__computeQValues = theano.function([], qValues, givens={states: self.statesShared})
def __init__(self, input, n_in, n_out): hidden_size = 36 batch_size = 32 self._w_h = init_weights((n_in, hidden_size)) self._b_h = init_b_weights((1, hidden_size)) # self._b_h = init_b_weights((hidden_size,)) self._w_h2 = init_weights((hidden_size, hidden_size)) self._b_h2 = init_b_weights((1, hidden_size)) # self._b_h2 = init_b_weights((hidden_size,)) # self._w_o = init_tanh(hidden_size, n_out) self._w_o = init_weights((hidden_size, n_out)) self._b_o = init_b_weights((1, n_out)) # self._b_o = init_b_weights((n_out,)) self.updateTargetModel() self._w_h_old = init_weights((n_in, hidden_size)) self._w_h2_old = init_weights((hidden_size, hidden_size)) self._w_o_old = init_tanh(hidden_size, n_out) # print ("Initial W " + str(self._w_o.get_value()) ) self._learning_rate = 0.00025 self._discount_factor = 0.99 self._weight_update_steps = 5000 self._updates = 0 # data types for model State = T.dmatrix("State") State.tag.test_value = np.random.rand(batch_size, 2) ResultState = T.dmatrix("ResultState") ResultState.tag.test_value = np.random.rand(batch_size, 2) Reward = T.col("Reward") Reward.tag.test_value = np.random.rand(batch_size, 1) Action = T.icol("Action") Action.tag.test_value = np.zeros((batch_size, 1), dtype=np.dtype('int32')) # Q_val = T.fmatrix() # model = T.nnet.sigmoid(T.dot(State, self._w) + self._b.reshape((1, -1))) # self._model = theano.function(inputs=[State], outputs=model, allow_input_downcast=True) _py_xA = self.model(State, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.0, 0.0) _py_xB = self.model(State, self._w_h_old, self._b_h_old, self._w_h2_old, self._b_h2_old, self._w_o_old, self._b_o_old, 0.0, 0.0) self._y_predA = T.argmax(_py_xA, axis=1) self._y_predB = T.argmax(_py_xB, axis=1) self._q_funcA = T.mean( (self.model(State, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.0, 0.0))[T.arange(batch_size), Action.reshape((-1, ))].reshape((-1, 1))) self._q_funcB = T.mean( (self.model(State, self._w_h_old, self._b_h_old, self._w_h2_old, self._b_h2_old, self._w_o_old, self._b_o_old, 0.0, 0.0))[T.arange(batch_size), Action.reshape((-1, ))].reshape((-1, 1))) # q_val = py_x # noisey_q_val = self.model(ResultState, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.2, 0.5) # L1 norm ; one regularization option is to enforce L1 norm to # be small self._L1_A = (abs(self._w_h).sum() + abs(self._w_h2).sum() + abs(self._w_o).sum()) self._L1_B = (abs(self._w_h_old).sum() + abs(self._w_h2_old).sum() + abs(self._w_o_old).sum()) self._L1_reg = 0.0 self._L2_reg = 0.001 # L2 norm ; one regularization option is to enforce # L2 norm to be small self._L2_A = ((self._w_h**2).sum() + (self._w_h2**2).sum() + (self._w_o**2).sum()) self._L2_B = ((self._w_h_old**2).sum() + (self._w_h2_old**2).sum() + (self._w_o_old**2).sum()) # cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y)) # delta = ((Reward.reshape((-1, 1)) + (self._discount_factor * T.max(self.model(ResultState), axis=1, keepdims=True)) ) - self.model(State)) deltaA = ((Reward + (self._discount_factor * T.max(self.model( ResultState, self._w_h_old, self._b_h_old, self._w_h2_old, self._b_h2_old, self._w_o_old, self._b_o_old, 0.2, 0.5), axis=1, keepdims=True))) - (self.model(State, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.2, 0.5))[T.arange(Action.shape[0]), Action.reshape((-1, ))].reshape((-1, 1))) deltaB = ( (Reward + (self._discount_factor * T.max(self.model(ResultState, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.2, 0.5), 
axis=1, keepdims=True))) - (self.model(State, self._w_h_old, self._b_h_old, self._w_h2_old, self._b_h2_old, self._w_o_old, self._b_o_old, 0.2, 0.5))[T.arange(Action.shape[0]), Action.reshape((-1, ))].reshape((-1, 1))) # bellman_cost = T.mean( 0.5 * ((delta) ** 2 )) bellman_costA = T.mean(0.5 * ((deltaA)**2)) + ( self._L2_reg * self._L2_A) + (self._L1_reg * self._L1_A) bellman_costB = T.mean(0.5 * ((deltaB)**2)) + ( self._L2_reg * self._L2_B) + (self._L1_reg * self._L1_B) paramsA = [ self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o ] paramsB = [ self._w_h_old, self._b_h_old, self._w_h2_old, self._b_h2_old, self._w_o_old, self._b_o_old ] # updates = sgd(bellman_cost, params, lr=self._learning_rate) updatesA = rlTDSGD(self._q_funcA, T.mean(deltaA), paramsA, lr=self._learning_rate) updatesB = rlTDSGD(self._q_funcB, T.mean(deltaB), paramsB, lr=self._learning_rate) # updates = RMSprop(bellman_cost, params, lr=self._learning_rate) # updates = RMSpropRL(q_func, T.mean(delta), params, lr=self._learning_rate) # updates = lasagne.updates.rmsprop(bellman_cost, params, self._learning_rate, 0.95, 0.01) # updatesA = lasagne.updates.rmsprop(self._q_funcA, paramsA, self._learning_rate * -T.mean(deltaA), 0.95, 0.01) # updatesB = lasagne.updates.rmsprop(self._q_funcB, paramsB, self._learning_rate * -T.mean(deltaB), 0.95, 0.01) self._trainA = theano.function( inputs=[State, Action, Reward, ResultState], outputs=bellman_costA, updates=updatesA, allow_input_downcast=True) self._trainB = theano.function( inputs=[State, Action, Reward, ResultState], outputs=bellman_costB, updates=updatesB, allow_input_downcast=True) self._bellman_errorA = theano.function( inputs=[State, Action, Reward, ResultState], outputs=deltaA, allow_input_downcast=True) self._bellman_errorB = theano.function( inputs=[State, Action, Reward, ResultState], outputs=deltaB, allow_input_downcast=True) self._q_valuesA = theano.function(inputs=[State], outputs=_py_xA, allow_input_downcast=True) self._q_valuesB = theano.function(inputs=[State], outputs=_py_xB, allow_input_downcast=True) self._py_xA = theano.function(inputs=[State], outputs=_py_xA, allow_input_downcast=True) self._py_xB = theano.function(inputs=[State], outputs=_py_xB, allow_input_downcast=True) x, y = T.matrices('x', 'y') z_lazy = ifelse(T.gt(T.max(x, axis=1)[0], T.max(y, axis=1)[0]), T.argmax(x, axis=1), T.argmax(y, axis=1)) self._f_lazyifelse = theano.function([x, y], z_lazy, mode=theano.Mode(linker='vm'))
def _update_classifier(self, data, labels, w, classes): """Update the classifier parameters theta and bias Parameters ---------- data : list of 2D arrays, element i has shape=[voxels_i, samples_i] Each element in the list contains the fMRI data of one subject for the classification task. labels : list of arrays of int, element i has shape=[samples_i] Each element in the list contains the labels for the data samples in data_sup. w : list of 2D array, element i has shape=[voxels_i, features] The orthogonal transforms (mappings) :math:`W_i` for each subject. classes : int The number of classes in the classifier. Returns ------- theta : array, shape=[features, classes] The MLR parameter for the class planes. bias : array shape=[classes,] The MLR parameter for class biases. """ # Stack the data and labels for training the classifier data_stacked, labels_stacked, weights = \ SSSRM._stack_list(data, labels, w) features = w[0].shape[1] total_samples = weights.size data_th = S.shared(data_stacked.astype(theano.config.floatX)) val_ = S.shared(labels_stacked) total_samples_S = S.shared(total_samples) theta_th = T.matrix(name='theta', dtype=theano.config.floatX) bias_th = T.col(name='bias', dtype=theano.config.floatX) constf2 = S.shared(self.alpha / self.gamma, allow_downcast=True) weights_th = S.shared(weights) log_p_y_given_x = \ T.log(T.nnet.softmax((theta_th.T.dot(data_th.T)).T + bias_th.T)) f = -constf2 * T.sum( (log_p_y_given_x[T.arange(total_samples_S), val_]) / weights_th) + 0.5 * T.sum(theta_th**2) manifold = Product((Euclidean(features, classes), Euclidean(classes, 1))) problem = Problem(manifold=manifold, cost=f, arg=[theta_th, bias_th], verbosity=0) solver = ConjugateGradient(mingradnorm=1e-6) solution = solver.solve(problem) theta = solution[0] bias = solution[1] del constf2 del theta_th del bias_th del data_th del val_ del solver del solution return theta, bias
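# Illustrative sketch (plain NumPy, made-up shapes; not from the original module): the
# Pymanopt cost above is a weighted multinomial logistic regression objective, i.e. the
# per-sample log-softmax of data.dot(theta) + bias.T, scaled by alpha/gamma and divided by
# the per-sample weights, plus an L2 penalty on theta.
import numpy as np

def mlr_cost(theta, bias, data, labels, weights, constf2):
    logits = data.dot(theta) + bias.T                    # shape (samples, classes)
    logits -= logits.max(axis=1, keepdims=True)          # for numerical stability
    log_softmax = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    log_p = log_softmax[np.arange(len(labels)), labels]  # log p(y_i | x_i)
    return -constf2 * np.sum(log_p / weights) + 0.5 * np.sum(theta ** 2)

rng = np.random.RandomState(0)
data = rng.rand(6, 4)           # samples x features (already mapped through the W_i)
labels = rng.randint(0, 3, 6)   # class index per sample
theta = rng.rand(4, 3)          # features x classes
bias = rng.rand(3, 1)           # classes x 1, matching the T.col bias above
weights = rng.rand(6) + 1.0     # stand-in for the per-sample weights from _stack_list
print(mlr_cost(theta, bias, data, labels, weights, constf2=0.5))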
theano_dot = theano.function([theano_matrix1, theano_matrix2], T.dot(theano_matrix1, theano_matrix2), name='theano_dot') theano_scalar = T.fscalar(name='theano_scalar') theano_scale = theano.function([theano_matrix1, theano_scalar], theano_matrix1 * theano_scalar, name='scale') # elementwise product theano_multiply = theano.function([theano_matrix1, theano_matrix2], theano_matrix1 * theano_matrix2, name='theano_multiply') theano_row_vector = T.row(name='theano_row_vector') theano_col_vector = T.col(name='theano_col_vector') theano_subtract_row = theano.function([theano_matrix1, theano_row_vector], theano_matrix1 - theano_row_vector, name='theano_subtract_row') theano_divide_row = theano.function([theano_matrix1, theano_row_vector], theano_matrix1 / theano_row_vector, name='theano_divide_row') theano_subtract_col = theano.function([theano_matrix1, theano_col_vector], theano_matrix1 - theano_col_vector, name='theano_subtract_col') theano_divide_col = theano.function([theano_matrix1, theano_col_vector], theano_matrix1 / theano_col_vector, name='theano_divide_col') theano_var1 = theano.function([theano_matrix1],
def __init__(self, input_width, input_height, avail_actions, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, train_all, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.avail_actions = avail_actions self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng self.train_all = train_all lasagne.random.set_rng(self.rng) self.update_counter = 0 print "num_actions: " + str(num_actions) self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. 
quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
def __init__(self, datasetPaths=None, keyPoints=None): self.xMax = 1024.0 self.yMax = 576.0 self.numKeyPoints = 68 self.loadPicasaTubePickle() loadPrev = 1 if loadPrev == 1: pkl_file = open('faceAlign2.pkl', 'rb') self.pose, self.landmarks, self.poseDict, self.images, self.poseCat = pickle.load( pkl_file) pkl_file.close() else: self.loadData() output = open('faceAlign2.pkl', 'wb') data = (self.pose, self.landmarks, self.poseDict, self.images, self.poseCat) pickle.dump(data, output) output.close() return self.eeta = 0.0000001 self.mu = theano.shared(10 * numpy.random.random( (2 * self.numKeyPoints, 1))) self.S = theano.shared(numpy.eye(2 * self.numKeyPoints)) self.alpha = theano.shared(0.1 * numpy.ones( (2 * self.numKeyPoints, 1))) theano.config.compute_test_value = 'warn' oneCol = T.col('oneCol') oneCol.tag.test_value = numpy.ones((self.numKeyPoints, 1)) pi_t = T.col('pi_t') pi_t.tag.test_value = numpy.random.random((2 * self.numKeyPoints, 1)) temp = numpy.random.random((3, 3)) #temp = numpy.zeros((3,3)) temp[2, :] = [0, 0, 1] self.A_t = theano.shared(temp, name='A_t') #print_A_t = theano.printing.Print('r_t1')(A_t) z_t = T.col('z_t') z_t.tag.test_value = numpy.random.random((2 * self.numKeyPoints, 1)) z_t1 = z_t.reshape((self.numKeyPoints, 2)) pts = T.concatenate((z_t1, oneCol), axis=1) # pts = theano.printing.Print('pts')(pts) r_t = T.dot(self.A_t, pts.transpose()).transpose() r_t1 = r_t[:, 0:2].reshape((2 * self.numKeyPoints, 1)) #pi_tt = theano.printing.Print('pi_t before')(pi_t) diff = pi_t * (r_t1 - self.mu) difft = diff.reshape((1, 2 * self.numKeyPoints)) #diff = theano.printing.Print('diff:')(diff) cost = T.max(T.dot(T.dot(difft, self.S), diff)) #cost = theano.printing.Print('cost:')(cost) A_t_grad = T.grad(cost=cost, wrt=self.A_t) A_t_grad = T.basic.set_subtensor(A_t_grad[2, :], 0) #A_t_grad = theano.printing.Print('r_t1')(A_t_grad) update = (self.A_t, self.A_t - self.eeta * A_t_grad) self.align = theano.function(inputs=[pi_t, z_t, oneCol], outputs=[self.A_t, cost], updates=[update], on_unused_input='warn', allow_input_downcast=True) #for numpy optimization A_t_ = T.matrix('A_t_') #A_t_.tag.test_value = temp #A_t_ = A_t_.reshape((3,3)) A_t_.tag.test_value = temp #print_A_t = theano.printing.Print('r_t1')(A_t) r_t_ = T.dot(A_t_, pts.transpose()).transpose() r_t1_ = r_t_[:, 0:2].reshape((2 * self.numKeyPoints, 1)) #pi_tt = theano.printing.Print('pi_t before')(pi_t) diff_ = pi_t * (r_t1_ - self.mu) difft_ = diff_.reshape((1, 2 * self.numKeyPoints)) #diff = theano.printing.Print('diff:')(diff) cost_1 = T.dot(T.dot(difft_, self.S), diff_) #cost_1 = theano.printing.Print('cost is:')(cost_1) cost_ = T.max(cost_1) A_t_grad_ = T.grad(cost=cost_, wrt=A_t_) A_t_grad_ = T.basic.set_subtensor(A_t_grad_[2, :], 0) #A_t_grad_ = A_t_grad_.reshape((9,1)) self.cost = theano.function(inputs=[A_t_, pi_t, z_t, oneCol], outputs=[cost_, A_t_grad_]) i = T.iscalar('index') i.tag.test_value = 0 subS = self.S[2 * i:2 * i + 2, 2 * i:2 * i + 2] #subS = theano.printing.Print('subS:')(self.S[2*i:2*i+2, 2*i:2*i+2]) det = T.abs_(subS[0, 0] * subS[1, 1] - subS[0, 1] * subS[1, 0]) subDiff = diff[(2 * i):((2 * i) + 2)] subDifft = difft[0][(2 * i):(2 * i + 2)] #intermed = theano.printing.Print('dotProd1:')(T.dot(subDifft,subS)) intermed = T.dot(subDifft, subS) #intermed2 = theano.printing.Print('dotProd2:')(T.dot(intermed,subDiff)) intermed2 = T.dot(intermed, subDiff) numrtr = T.exp(-0.5 * intermed2) k = 2 dnmntr = T.sqrt((2**k) * det) q = numrtr / dnmntr temp = ((1 - self.alpha[2 * i:2 * i + 2]) * q) / (self.alpha[2 
* i:2 * i + 2] + (1 - self.alpha[2 * i:2 * i + 2]) * q) pi_t_out = T.basic.set_subtensor(pi_t[2 * i:2 * i + 2], temp) self.q_pi_update = theano.function(inputs=[i, oneCol, pi_t, z_t], outputs=[q, pi_t_out, r_t1], allow_input_downcast=True) self.train('12')
def __init__(self, input, n_in, n_out): hidden_size = 36 batch_size = 32 self._w_h = init_weights((n_in, hidden_size)) self._b_h = init_b_weights((1, hidden_size)) # self._b_h = init_b_weights((hidden_size,)) self._w_h2 = init_weights((hidden_size, hidden_size)) self._b_h2 = init_b_weights((1, hidden_size)) # self._b_h2 = init_b_weights((hidden_size,)) # self._w_o = init_tanh(hidden_size, n_out) self._w_o = init_weights((hidden_size, n_out)) self._b_o = init_b_weights((1, n_out)) # self._b_o = init_b_weights((n_out,)) self.updateTargetModel() self._w_h_old = init_weights((n_in, hidden_size)) self._w_h2_old = init_weights((hidden_size, hidden_size)) self._w_o_old = init_tanh(hidden_size, n_out) # print ("Initial W " + str(self._w_o.get_value()) ) self._learning_rate = 0.00025 self._discount_factor = 0.99 self._weight_update_steps = 5000 self._updates = 0 # data types for model State = T.dmatrix("State") State.tag.test_value = np.random.rand(batch_size, 2) ResultState = T.dmatrix("ResultState") ResultState.tag.test_value = np.random.rand(batch_size, 2) Reward = T.col("Reward") Reward.tag.test_value = np.random.rand(batch_size, 1) Action = T.icol("Action") Action.tag.test_value = np.zeros((batch_size, 1), dtype=np.dtype('int32')) # Q_val = T.fmatrix() # model = T.nnet.sigmoid(T.dot(State, self._w) + self._b.reshape((1, -1))) # self._model = theano.function(inputs=[State], outputs=model, allow_input_downcast=True) py_x = self.model(State, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.0, 0.0) y_pred = T.argmax(py_x, axis=1) q_func = T.mean((self.model(State, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.0, 0.0))[T.arange(batch_size), Action.reshape((-1, ))].reshape( (-1, 1))) # q_val = py_x # noisey_q_val = self.model(ResultState, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.2, 0.5) # L1 norm ; one regularization option is to enforce L1 norm to # be small self._L1 = (abs(self._w_h).sum() + abs(self._w_h2).sum() + abs(self._w_o).sum()) self._L1_reg = 0.0 self._L2_reg = 0.001 # L2 norm ; one regularization option is to enforce # L2 norm to be small self._L2 = ((self._w_h**2).sum() + (self._w_h2**2).sum() + (self._w_o**2).sum()) # cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y)) # delta = ((Reward.reshape((-1, 1)) + (self._discount_factor * T.max(self.model(ResultState), axis=1, keepdims=True)) ) - self.model(State)) delta = ((Reward + (self._discount_factor * T.max(self.model( ResultState, self._w_h_old, self._b_h_old, self._w_h2_old, self._b_h2_old, self._w_o_old, self._b_o_old, 0.2, 0.5), axis=1, keepdims=True))) - (self.model(State, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.2, 0.5))[T.arange(Action.shape[0]), Action.reshape((-1, ))].reshape((-1, 1))) # bellman_cost = T.mean( 0.5 * ((delta) ** 2 )) bellman_cost = T.mean(0.5 * ((delta)**2)) + ( self._L2_reg * self._L2) + (self._L1_reg * self._L1) params = [ self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o ] # updates = sgd(bellman_cost, params, lr=self._learning_rate) # updates = rlTDSGD(q_func, T.mean(delta), params, lr=self._learning_rate) # updates = RMSprop(bellman_cost, params, lr=self._learning_rate) # updates = RMSpropRL(q_func, T.mean(delta), params, lr=self._learning_rate) # updates = lasagne.updates.rmsprop(bellman_cost, params, self._learning_rate, 0.95, 0.01) updates = lasagne.updates.rmsprop(q_func, params, self._learning_rate * -T.mean(delta), 0.95, 0.01) self._train = theano.function( inputs=[State, Action, 
Reward, ResultState], outputs=bellman_cost, updates=updates, allow_input_downcast=True) self._predict = theano.function(inputs=[State], outputs=y_pred, allow_input_downcast=True) self._q_values = theano.function(inputs=[State], outputs=py_x, allow_input_downcast=True) self._bellman_error = theano.function( inputs=[State, Action, Reward, ResultState], outputs=delta, allow_input_downcast=True)
def _compile_train_function(self): state = T.tensor4(dtype = theano.config.floatX) action = T.col(dtype = 'uint8') reward = T.col(dtype = theano.config.floatX) terminal = T.col(dtype = 'int8') next_state = T.tensor4(dtype = theano.config.floatX) current_values_matrix = lasagne.layers.get_output(self.net, state) action_mask = T.eq(T.arange(self.num_action).reshape((1, -1)) , action.reshape((-1, 1))).astype(theano.config.floatX) current_values = T.sum(current_values_matrix * action_mask , axis = 1).reshape((-1, 1)) if self.algorithm == 'q_learning': if self.tnet is not None: target_values = lasagne.layers.get_output(self.tnet, next_state) else: target_values = lasagne.layers.get_output(self.net, next_state) bootstrap_values = T.max(target_values, axis = 1, keepdims = True) elif self.algorithm == 'double_q_learning': if self.network_type == 'duel': # Get argmax actions from advantage values select_actions = self._get_action_var(self.adv_net, next_state) else: # Get argmax actions from Q values select_actions = self._get_action_var(self.net, next_state) select_mask = T.eq(T.arange(self.num_action).reshape((1, -1)) , select_actions.astype(theano.config.floatX)) if self.tnet is not None: # Evaluate argmax actions on target network eval_values = lasagne.layers.get_output(self.tnet, next_state) else: # Evaluate argmax actions on online network # (the same as q_learning but slower) eval_values = lasagne.layers.get_output(self.net, next_state) bootstrap_values = T.sum(eval_values * select_mask , axis = 1, keepdims = True) terminal_floatX = terminal.astype(theano.config.floatX) target_values = reward + self.discount * \ (T.ones_like(terminal_floatX) - terminal_floatX) * bootstrap_values if self.tnet is None: target_values = theano.gradient.disconnected_grad(target_values) error = target_values - current_values if self.max_error > 0: # From https://github.com/spragunr/deep_q_rl/issues/46 quadratic_term = T.minimum(abs(error), self.max_error) linear_term = abs(error) - quadratic_term loss = T.sum(0.5 * quadratic_term ** 2 + linear_term * self.max_error) else: loss = T.sum(0.5 * error ** 2) net_params = lasagne.layers.get_all_params(self.net) updates = self._get_rmsprop_updates(loss, net_params , lr = Network.LEARNING_RATE, grad_momentum = Network.GRAD_MOMENTUM , sqr_momentum = Network.SGRAD_MOMENTUM , min_grad = Network.MIN_SGRAD) train_givens = { state : self.shared_states[:, :-1, :, :] / Network.INPUT_SCALE, action : self.shared_action, reward : self.shared_reward, terminal : self.shared_terminal, next_state : self.shared_states[:, 1:, :, :] / Network.INPUT_SCALE, } return theano.function([], loss, updates = updates, givens = train_givens)
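# Illustrative sketch (plain NumPy, hypothetical values): the double_q_learning branch
# above decouples action selection from action evaluation. The online network picks the
# argmax action for the next state and the target network supplies that action's value,
# which reduces the overestimation that plain max-bootstrapping tends to produce.
import numpy as np

next_q_online = np.array([[1.0, 3.0, 2.0]])   # online network estimates of Q(s', .)
next_q_target = np.array([[1.5, 0.5, 2.5]])   # target network estimates of Q(s', .)

# Plain Q-learning bootstrap: max over the target network's own estimates.
q_learning_value = next_q_target.max(axis=1, keepdims=True)            # [[2.5]]

# Double Q-learning bootstrap: argmax from the online network, value from the target one.
selected = next_q_online.argmax(axis=1)                                # action 1
double_q_value = next_q_target[np.arange(1), selected].reshape(-1, 1)  # [[0.5]]

print(q_learning_value, double_q_value)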
        if param_name in exp_params.keys():
            param_value = exp_params[param_name]
            break
    return param_value


########################################################################

if __name__ == "__main__":
    ### this block is for testing functions
    W = T.matrix('W')
    x = T.col('x')
    b = T.col('b')
    quadratic_form = energy_function(W, b, x)
    compute_quad_form = theano.function([W, x, b], quadratic_form)
    print(compute_quad_form([[1, 2], [3, 4]], [[1], [1]], [[1], [1]])[0][0] == 12)

    grad_W, grad_b = T.grad(quadratic_form[0][0], [W, b])
    comp_grad_W = theano.function([W, b, x], grad_W)
    comp_grad_b = theano.function([W, b, x], grad_b)
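Assuming energy_function computes E(W, b, x) = x^T W x + b^T x, which is consistent with the value 12 checked above, the gradients taken symbolically here have simple closed forms. The NumPy sketch below (hypothetical helper names, not from the original) verifies both.

import numpy as np

def quad_energy(W, b, x):
    # E(W, b, x) = x^T W x + b^T x  (assumed form of energy_function)
    return (x.T.dot(W).dot(x) + b.T.dot(x))[0, 0]

def quad_energy_grads(W, b, x):
    # dE/dW = x x^T (W not assumed symmetric), dE/db = x
    return x.dot(x.T), x

W = np.array([[1., 2.], [3., 4.]])
x = np.array([[1.], [1.]])
b = np.array([[1.], [1.]])
print(quad_energy(W, b, x))            # 12.0, matching the check above
print(quad_energy_grads(W, b, x)[0])   # [[1. 1.] [1. 1.]]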
def __init__(self, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_): super(DeepCNNDropoutCritic, self).__init__(n_in, n_out, state_bounds, action_bounds, reward_bound, settings_) # data types for model self._dropout_p = settings_['dropout_p'] # data types for model self._State = T.matrix("State") self._State.tag.test_value = np.random.rand(self._batch_size, self._state_length) self._ResultState = T.matrix("ResultState") self._ResultState.tag.test_value = np.random.rand( self._batch_size, self._state_length) self._Reward = T.col("Reward") self._Reward.tag.test_value = np.random.rand(self._batch_size, 1) self._Target = T.col("Target") self._Target.tag.test_value = np.random.rand(self._batch_size, 1) self._Action = T.matrix("Action") self._Action.tag.test_value = np.random.rand(self._batch_size, self._action_length) # create a small convolutional neural network input = lasagne.layers.InputLayer((None, self._state_length), self._State) self._stateInputVar = input.input_var inputAction = lasagne.layers.InputLayer((None, self._action_length), self._Action) self._actionInputVar = inputAction.input_var taskFeatures = lasagne.layers.SliceLayer( input, indices=slice(0, self._settings['num_terrain_features']), axis=1) characterFeatures = lasagne.layers.SliceLayer( input, indices=slice(self._settings['num_terrain_features'], self._state_length), axis=1) print("taskFeatures Shape:", lasagne.layers.get_output_shape(taskFeatures)) print("characterFeatures Shape:", lasagne.layers.get_output_shape(characterFeatures)) print("State length: ", self._state_length) networkAct = lasagne.layers.InputLayer((None, self._state_length), self._State) # taskFeaturesAct = lasagne.layers.SliceLayer(networkAct, indices=slice(0, self._settings['num_terrain_features']), axis=1) # characterFeaturesAct = lasagne.layers.SliceLayer(networkAct, indices=slice(self._settings['num_terrain_features'],self._state_length), axis=1) # taskFeaturesAct = lasagne.layers.DropoutLayer(taskFeaturesAct, p=self._dropout_p, rescale=True) networkAct = lasagne.layers.ReshapeLayer( taskFeatures, (-1, 1, self._settings['num_terrain_features'])) networkAct = lasagne.layers.Conv1DLayer( networkAct, num_filters=16, filter_size=8, nonlinearity=lasagne.nonlinearities.leaky_rectify, W=lasagne.init.GlorotUniform()) # networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True) # network = lasagne.layers.MaxPool1DLayer(network, pool_size=3) """ networkAct = lasagne.layers.Conv1DLayer( networkAct, num_filters=32, filter_size=4, nonlinearity=lasagne.nonlinearities.leaky_rectify, W=lasagne.init.GlorotUniform()) networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True) """ networkAct = lasagne.layers.Conv1DLayer( networkAct, num_filters=8, filter_size=4, nonlinearity=lasagne.nonlinearities.leaky_rectify, W=lasagne.init.GlorotUniform()) # network = lasagne.layers.MaxPool1DLayer(network, pool_size=3) self._actor_task_part = networkAct """ networkAct = lasagne.layers.Conv1DLayer( networkAct, num_filters=32, filter_size=4, nonlinearity=lasagne.nonlinearities.leaky_rectify, W=lasagne.init.GlorotUniform()) networkAct = lasagne.layers.DenseLayer( networkAct, num_units=128, nonlinearity=lasagne.nonlinearities.leaky_rectify) """ networkAct = lasagne.layers.FlattenLayer(networkAct, outdim=2) networkAct = lasagne.layers.ConcatLayer( [networkAct, characterFeatures], axis=1) # networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True) networkAct = lasagne.layers.DenseLayer( 
networkAct, num_units=64, nonlinearity=lasagne.nonlinearities.leaky_rectify) # networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True) networkAct = lasagne.layers.DenseLayer( networkAct, num_units=32, nonlinearity=lasagne.nonlinearities.leaky_rectify) # networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True) self._actor = lasagne.layers.DenseLayer( networkAct, num_units=self._action_length, nonlinearity=lasagne.nonlinearities.linear) if (self._settings['use_stocastic_policy']): with_std = lasagne.layers.DenseLayer( networkAct, num_units=self._action_length, nonlinearity=theano.tensor.nnet.softplus) self._actor = lasagne.layers.ConcatLayer([self._actor, with_std], axis=1) if (settings_['agent_name'] == 'algorithm.DPG.DPG'): characterFeatures = lasagne.layers.ConcatLayer( [characterFeatures, inputAction]) # taskFeatures = lasagne.layers.DropoutLayer(taskFeatures, p=self._dropout_p, rescale=True) network = lasagne.layers.ReshapeLayer( taskFeatures, (-1, 1, self._settings['num_terrain_features'])) network = lasagne.layers.Conv1DLayer( network, num_filters=16, filter_size=8, nonlinearity=lasagne.nonlinearities.leaky_rectify, W=lasagne.init.GlorotUniform()) network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) # network = lasagne.layers.MaxPool1DLayer(network, pool_size=3) """ network = lasagne.layers.Conv1DLayer( network, num_filters=32, filter_size=4, nonlinearity=lasagne.nonlinearities.leaky_rectify, W=lasagne.init.GlorotUniform()) network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) """ network = lasagne.layers.Conv1DLayer( network, num_filters=8, filter_size=4, nonlinearity=lasagne.nonlinearities.leaky_rectify, W=lasagne.init.GlorotUniform()) self._critic_task_part = network """ # network = lasagne.layers.MaxPool1DLayer(network, pool_size=3) network = lasagne.layers.Conv1DLayer( network, num_filters=32, filter_size=4, nonlinearity=lasagne.nonlinearities.leaky_rectify, W=lasagne.init.GlorotUniform()) network = lasagne.layers.DenseLayer( network, num_units=128, nonlinearity=lasagne.nonlinearities.leaky_rectify) """ network = lasagne.layers.FlattenLayer(network, outdim=2) network = lasagne.layers.ConcatLayer([network, characterFeatures], axis=1) # network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) network = lasagne.layers.DenseLayer( network, num_units=64, nonlinearity=lasagne.nonlinearities.leaky_rectify) network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) network = lasagne.layers.DenseLayer( network, num_units=32, nonlinearity=lasagne.nonlinearities.leaky_rectify) network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) network = lasagne.layers.DenseLayer( network, num_units=16, nonlinearity=lasagne.nonlinearities.leaky_rectify) # network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True) self._critic = lasagne.layers.DenseLayer( network, num_units=1, nonlinearity=lasagne.nonlinearities.linear) self._states_shared = theano.shared( np.zeros((self._batch_size, self._state_length), dtype=theano.config.floatX)) self._next_states_shared = theano.shared( np.zeros((self._batch_size, self._state_length), dtype=theano.config.floatX)) self._rewards_shared = theano.shared(np.zeros( (self._batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self._target_shared = theano.shared(np.zeros( (self._batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) 
self._actions_shared = theano.shared( np.zeros((self._batch_size, self._action_length), dtype=theano.config.floatX), )
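The state-splitting pattern used above (slice terrain features from character features, run the terrain slice through a Conv1D branch, then concatenate the branches again) can be seen in isolation in this small standalone Lasagne sketch with made-up sizes.

import lasagne

state_len, terrain_len = 10, 6
inp = lasagne.layers.InputLayer((None, state_len))
terrain = lasagne.layers.SliceLayer(inp, indices=slice(0, terrain_len), axis=1)
character = lasagne.layers.SliceLayer(inp, indices=slice(terrain_len, state_len), axis=1)

conv_in = lasagne.layers.ReshapeLayer(terrain, (-1, 1, terrain_len))
conv = lasagne.layers.Conv1DLayer(conv_in, num_filters=4, filter_size=3)
flat = lasagne.layers.FlattenLayer(conv, outdim=2)
merged = lasagne.layers.ConcatLayer([flat, character], axis=1)

# 4 filters * (terrain_len - filter_size + 1) outputs + 4 character features
print(lasagne.layers.get_output_shape(merged))   # (None, 20)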
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, input_scale=255.0, reward_bias=0.): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.update_counter = 0 self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = (rewards + reward_bias + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) if self.clip_delta > 0: diff = diff.clip(-self.clip_delta, self.clip_delta) if batch_accumulator == 'sum': loss = T.sum(diff ** 2) elif batch_accumulator == 'mean': loss = T.mean(diff ** 2) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
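The training function above takes no explicit inputs because every symbolic variable is bound to a pre-allocated shared buffer through givens. A minimal self-contained Theano sketch of that pattern, using a toy linear predictor rather than the real network, is shown below.

import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX
states = T.matrix('states')
targets = T.col('targets')

w = theano.shared(np.zeros((3, 1), dtype=floatX))
states_shared = theano.shared(np.zeros((4, 3), dtype=floatX))
targets_shared = theano.shared(np.zeros((4, 1), dtype=floatX),
                               broadcastable=(False, True))

pred = T.dot(states, w)
loss = T.mean((targets - pred) ** 2)
updates = [(w, w - 0.1 * T.grad(loss, w))]
# No explicit inputs: the symbolic variables are replaced by the shared buffers.
train = theano.function([], loss, updates=updates,
                        givens={states: states_shared, targets: targets_shared})

states_shared.set_value(np.random.rand(4, 3).astype(floatX))
targets_shared.set_value(np.random.rand(4, 1).astype(floatX))
print(train())   # one gradient step on the staged minibatch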
def __init__(self, model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_): """ In order to get this to work we need to be careful not to update the actor parameters when updating the critic. This can be an issue when the Concatenating networks together. The first first network becomes a part of the second. However you can still access the first network by itself but an updates on the second network will effect the first network. Care needs to be taken to make sure only the parameters of the second network are updated. """ super(DPG, self).__init__(model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_) self._Fallen = T.bcol("Fallen") ## because float64 <= float32 * int32, need to use int16 or int8 self._Fallen.tag.test_value = np.zeros((self._batch_size, 1), dtype=np.dtype('int8')) self._fallen_shared = theano.shared(np.zeros((self._batch_size, 1), dtype='int8'), broadcastable=(False, True)) self._Action = T.matrix("Action2") self._Action.tag.test_value = np.random.rand(self._batch_size, self._action_length) self._Tmp_Target = T.col("Tmp_Target") self._Tmp_Target.tag.test_value = np.zeros( (self._batch_size, 1), dtype=np.dtype(self.getSettings()['float_type'])) self._tmp_target_shared = theano.shared(np.zeros( (self._batch_size, 1), dtype=self.getSettings()['float_type']), broadcastable=(False, True)) self._modelTarget = copy.deepcopy(model) # print ("Initial W " + str(self._w_o.get_value()) ) self._learning_rate = self.getSettings()['learning_rate'] self._discount_factor = self.getSettings()['discount_factor'] self._rho = self.getSettings()['rho'] self._rms_epsilon = self.getSettings()['rms_epsilon'] self._weight_update_steps = self.getSettings( )['steps_until_target_network_update'] self._updates = 0 self._decay_weight = self.getSettings()['regularization_weight'] self._critic_regularization_weight = self.getSettings( )["critic_regularization_weight"] self._critic_learning_rate = self.getSettings()["critic_learning_rate"] # self._q_valsA = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) # self._q_valsA_drop = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) # self._q_valsNextState = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True) # self._q_valsTargetNextState = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True) # self._q_valsTarget = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) # self._q_valsTarget_drop = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_valsActA = lasagne.layers.get_output( self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsActTarget = lasagne.layers.get_output( self._modelTarget.getActorNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True) # self._q_valsActA_drop = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) inputs_1 = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getActionSymbolicVariable(): self._model.getActions() } self._q_valsA = lasagne.layers.get_output( self._model.getCriticNetwork(), inputs_1) 
inputs_1_policy = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getActionSymbolicVariable(): self._q_valsActA } self._q_vals_train_policy = lasagne.layers.get_output( self._model.getCriticNetwork(), inputs_1_policy) inputs_2 = { self._modelTarget.getStateSymbolicVariable(): self._model.getResultStates(), self._modelTarget.getActionSymbolicVariable(): self._model.getActions() } self._q_valsB_ = lasagne.layers.get_output( self._modelTarget.getCriticNetwork(), inputs_2, deterministic=True) self._q_func = self._q_valsA self._q_funcB = self._q_valsB_ # self._q_funcTarget = self._q_valsTarget # self._q_func_drop = self._q_valsA_drop # self._q_funcTarget_drop = self._q_valsTarget_drop self._q_funcAct = self._q_valsActA # self._q_funcAct_drop = self._q_valsActA_drop # self._q_funcAct = theano.function(inputs=[State], outputs=self._q_valsActA, allow_input_downcast=True) # self._target = T.mul(T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsB )), self._Fallen) self._diff = self._Tmp_Target - self._q_func # self._diff_drop = self._target - self._q_func_drop # loss = 0.5 * self._diff ** 2 loss = T.pow(self._diff, 2) self._loss = T.mean(loss) # self._loss_drop = T.mean(0.5 * self._diff_drop ** 2) # assert len(lasagne.layers.helper.get_all_params(self._l_outA)) == 16 # Need to remove the action layers from these params self._params = lasagne.layers.helper.get_all_params( self._model.getCriticNetwork()) print("******Number of Layers is: " + str( len( lasagne.layers.helper.get_all_params( self._model.getCriticNetwork())))) print("******Number of Action Layers is: " + str( len( lasagne.layers.helper.get_all_params( self._model.getActorNetwork())))) self._actionParams = lasagne.layers.helper.get_all_params( self._model.getActorNetwork()) self._givens_ = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getActionSymbolicVariable(): self._model.getActions(), # self._Action: self._q_valsActTarget, # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._Fallen: self._fallen_shared self._Tmp_Target: self._tmp_target_shared } self._actGivens = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._model.getActionSymbolicVariable(): self._model.getActions(), # self._Fallen: self._fallen_shared # self._tmp_diff: self._tmp_diff_shared } self._critic_regularization = ( self._critic_regularization_weight * lasagne.regularization.regularize_network_params( self._model.getCriticNetwork(), lasagne.regularization.l2)) ## MSE update self._value_grad = T.grad(self._loss + self._critic_regularization, self._params) print("Optimizing Value Function with ", self.getSettings()['optimizer'], " method") self._updates_ = lasagne.updates.adam(self._value_grad, self._params, self._critic_learning_rate, beta1=0.9, beta2=0.9, epsilon=self._rms_epsilon) self._givens_grad = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), self._model.getActionSymbolicVariable(): self._model.getActions(), } ## Some cool stuff to backprop action gradients self._action_grad = T.matrix("Action_Grad") 
self._action_grad.tag.test_value = np.zeros( (self._batch_size, self._action_length), dtype=np.dtype(self.getSettings()['float_type'])) self._action_grad_shared = theano.shared( np.zeros((self._batch_size, self._action_length), dtype=self.getSettings()['float_type'])) ### Maximize wrt q function self._action_mean_grads = T.grad( cost=None, wrt=self._actionParams, known_grads={self._q_valsActA: self._action_grad_shared}), print("Action grads: ", self._action_mean_grads[0]) ## When passing in gradients it needs to be a proper list of gradient expressions self._action_mean_grads = list(self._action_mean_grads[0]) # print ("isinstance(self._action_mean_grads, list): ", isinstance(self._action_mean_grads, list)) # print ("Action grads: ", self._action_mean_grads) self._actionGRADUpdates = lasagne.updates.adam( self._action_mean_grads, self._actionParams, self._learning_rate, beta1=0.9, beta2=0.9, epsilon=self._rms_epsilon) self._actGradGivens = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._model.getActionSymbolicVariable(): self._model.getActions(), # self._Fallen: self._fallen_shared, # self._advantage: self._advantage_shared, # self._KL_Weight: self._kl_weight_shared } # theano.gradient.grad_clip(x, lower_bound, upper_bound) # // TODO # self._actionUpdates = lasagne.updates.adam(-T.mean(self._q_vals_train_policy) + # (self._decay_weight * lasagne.regularization.regularize_network_params( # self._model.getActorNetwork(), lasagne.regularization.l2)), self._actionParams, # self._learning_rate, beta1=0.9, beta2=0.9, epsilon=self._rms_epsilon) if ('train_extra_value_function' in self.getSettings() and (self.getSettings()['train_extra_value_function'] == True)): self._valsA = lasagne.layers.get_output( self._model._value_function, self._model.getStateSymbolicVariable(), deterministic=True) self._valsA_drop = lasagne.layers.get_output( self._model._value_function, self._model.getStateSymbolicVariable(), deterministic=False) self._valsNextState = lasagne.layers.get_output( self._model._value_function, self._model.getResultStateSymbolicVariable(), deterministic=True) self._valsTargetNextState = lasagne.layers.get_output( self._modelTarget._value_function, self._model.getResultStateSymbolicVariable(), deterministic=True) self._valsTarget = lasagne.layers.get_output( self._modelTarget._value_function, self._model.getStateSymbolicVariable(), deterministic=True) # self._target = T.mul(T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsB )), self._Fallen) # self._target = self._model.getRewardSymbolicVariable() + ((self._discount_factor * self._q_valsTargetNextState ) * self._NotFallen) + (self._NotFallen - 1) self._v_target = self._model.getRewardSymbolicVariable() + ( self._discount_factor * self._valsTargetNextState) self._v_diff = self._v_target - self._valsA # loss = 0.5 * self._diff ** 2 loss_v = T.pow(self._v_diff, 2) self._v_loss = T.mean(loss_v) self._params_value = lasagne.layers.helper.get_all_params( self._model._value_function) self._givens_value = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._NotFallen: self._NotFallen_shared # self._model.getActionSymbolicVariable(): self._actions_shared, } 
self._value_regularization = ( self._critic_regularization_weight * lasagne.regularization.regularize_network_params( self._model._value_function, lasagne.regularization.l2)) self._value_grad = T.grad( self._v_loss + self._value_regularization, self._params_value) print("Optimizing Value Function with ", self.getSettings()['optimizer'], " method") self._updates_value = lasagne.updates.adam( self._value_grad, self._params_value, self._critic_learning_rate, beta1=0.9, beta2=0.9, epsilon=self._rms_epsilon) ## TD update DPG.compile(self)
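The actor update in this class relies on T.grad with cost=None and known_grads, which back-propagates an externally supplied dQ/da through the policy parameters. A condensed standalone sketch with toy shapes follows; none of these names belong to the class above.

import numpy as np
import theano
import theano.tensor as T
import lasagne

floatX = theano.config.floatX
states = T.matrix('states')
actor = lasagne.layers.InputLayer((None, 3), states)
actor = lasagne.layers.DenseLayer(actor, num_units=2, nonlinearity=None)
actions = lasagne.layers.get_output(actor)

action_grads = T.matrix('action_grads')   # dQ/da supplied by the critic
params = lasagne.layers.get_all_params(actor, trainable=True)
# cost=None plus known_grads back-propagates -dQ/da, so Adam ascends Q.
grads = T.grad(None, params, known_grads={actions: -action_grads})
updates = lasagne.updates.adam(grads, params, learning_rate=1e-3)
step = theano.function([states, action_grads], actions, updates=updates)

out = step(np.random.rand(5, 3).astype(floatX),
           np.random.rand(5, 2).astype(floatX))
print(out.shape)   # (5, 2)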
d_rewards_var = TT.vector('d_rewards')
# policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities
# related to the distribution of the actions. For a Gaussian policy, it contains the mean and
# (log) standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)
surr = TT.sum(
    -dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) * d_rewards_var)

params = policy.get_params(trainable=True)
grad = theano.grad(surr, params)

eval_grad1 = TT.matrix('eval_grad1', dtype=grad[0].dtype)
eval_grad2 = TT.vector('eval_grad2', dtype=grad[1].dtype)
eval_grad3 = TT.col('eval_grad3', dtype=grad[2].dtype)
eval_grad4 = TT.vector('eval_grad4', dtype=grad[3].dtype)
eval_grad5 = TT.vector('eval_grad5', dtype=grad[4].dtype)

f_train = theano.function(
    inputs=[observations_var, actions_var, d_rewards_var],
    outputs=grad)
f_update = theano.function(
    inputs=[eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5],
    outputs=None,
    updates=sgd([eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5],
                params, learning_rate=learning_rate))

alla = []
for i in range(10):
    if (load_policy):
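f_train above returns a list of per-parameter gradients for a single trajectory, and f_update applies an SGD step from such a list. The hedged sketch below (the trajectory data is fabricated) shows how per-trajectory gradients would typically be averaged before being passed to f_update.

import numpy as np

def average_gradients(per_traj_grads):
    """per_traj_grads: list over trajectories, each itself a list of
    per-parameter gradient arrays (the structure f_train returns)."""
    n_params = len(per_traj_grads[0])
    return [np.mean([g[i] for g in per_traj_grads], axis=0) for i in range(n_params)]

# Two fabricated trajectories, two parameter tensors each:
fake = [[np.ones((2, 2)), np.zeros(3)], [3 * np.ones((2, 2)), np.ones(3)]]
avg = average_gradients(fake)
print(avg[0])   # [[2. 2.] [2. 2.]]
# f_update(*avg) would then apply a single SGD step with the averaged gradients.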
def __init__(self, environment, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, randomState, DoubleQ=False, TheQNet=NN): """ Initialize environment """ QNetwork.__init__(self,environment, batch_size) self.rho = rho self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self._DoubleQ = DoubleQ self._randomState = randomState QNet=TheQNet(self._batch_size, self._input_dimensions, self._n_actions, self._randomState) self.update_counter = 0 states=[] # list of symbolic variables for each of the k element in the belief state # --> [ T.tensor4 if observation of element=matrix, T.tensor3 if vector, T.tensor 2 if scalar ] next_states=[] # idem than states at t+1 self.states_shared=[] # list of shared variable for each of the k element in the belief state self.next_states_shared=[] # idem that self.states_shared at t+1 for i, dim in enumerate(self._input_dimensions): if len(dim) == 3: states.append(T.tensor4("%s_%s" % ("state", i))) next_states.append(T.tensor4("%s_%s" % ("next_state", i))) elif len(dim) == 2: states.append(T.tensor3("%s_%s" % ("state", i))) next_states.append(T.tensor3("%s_%s" % ("next_state", i))) elif len(dim) == 1: states.append( T.matrix("%s_%s" % ("state", i)) ) next_states.append( T.matrix("%s_%s" % ("next_state", i)) ) self.states_shared.append(theano.shared(np.zeros((batch_size,) + dim, dtype=theano.config.floatX) , borrow=False)) self.next_states_shared.append(theano.shared(np.zeros((batch_size,) + dim, dtype=theano.config.floatX) , borrow=False)) print("Number of observations per state: {}".format(len(self.states_shared))) print("For each observation, historySize + ponctualObs_i.shape: {}".format(self._input_dimensions)) rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX) thelr = T.scalar(name='thelr', dtype=theano.config.floatX) QNet=TheQNet(self._batch_size, self._input_dimensions, self._n_actions, self._randomState) self.q_vals, self.params, shape_after_conv = QNet._buildDQN(states) print("Number of neurons after spatial and temporal convolution layers: {}".format(shape_after_conv)) self.next_q_vals, self.next_params, shape_after_conv = QNet._buildDQN(next_states) self._resetQHat() self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) if(self._DoubleQ==True): givens_next={} for i, x in enumerate(self.next_states_shared): givens_next[ states[i] ] = x self.next_q_vals_current_qnet=theano.function([], self.q_vals, givens=givens_next) next_q_curr_qnet = theano.clone(self.next_q_vals) argmax_next_q_vals=T.argmax(next_q_curr_qnet, axis=1, keepdims=True) max_next_q_vals=self.next_q_vals[T.arange(batch_size),argmax_next_q_vals.reshape((-1,))].reshape((-1, 1)) else: max_next_q_vals=T.max(self.next_q_vals, axis=1, keepdims=True) T_ones_like=T.ones_like(T.ones_like(terminals) - terminals) target = rewards + T_ones_like * thediscount * max_next_q_vals q_val=self.q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) # Note : Strangely (target - q_val) lead to problems with python 3.5, theano 0.8.0rc and floatX=float32... 
diff = - q_val + target if self.clip_delta > 0: # This loss function implementation is taken from # https://github.com/spragunr/deep_q_rl # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss_ind = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss_ind = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss_ind) elif batch_accumulator == 'mean': loss = T.mean(loss_ind) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) givens = { rewards: self.rewards_shared, actions: self.actions_shared, ## actions not needed! terminals: self.terminals_shared } for i, x in enumerate(self.states_shared): givens[ states[i] ] = x for i, x in enumerate(self.next_states_shared): givens[ next_states[i] ] = x gparams=[] for p in self.params: gparam = T.grad(loss, p) gparams.append(gparam) updates = [] if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, self.params, gparams, thelr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': for i,(p, g) in enumerate(zip(self.params, gparams)): acc = theano.shared(p.get_value() * 0.) acc_new = rho * acc + (1 - self.rho) * g ** 2 gradient_scaling = T.sqrt(acc_new + self.rms_epsilon) g = g / gradient_scaling updates.append((acc, acc_new)) updates.append((p, p - thelr * g)) elif update_rule == 'sgd': for i, (param, gparam) in enumerate(zip(self.params, gparams)): updates.append((param, param - thelr * gparam)) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if(self._DoubleQ==True): self._train = theano.function([thediscount, thelr, next_q_curr_qnet], [loss, loss_ind, self.q_vals], updates=updates, givens=givens, on_unused_input='warn') else: self._train = theano.function([thediscount, thelr], [loss, loss_ind, self.q_vals], updates=updates, givens=givens, on_unused_input='warn') givens2={} for i, x in enumerate(self.states_shared): givens2[ states[i] ] = x self._q_vals = theano.function([], self.q_vals, givens=givens2, on_unused_input='warn')
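The comment above describes the quadratic-plus-linear loss used when clip_delta > 0; the standalone NumPy rendering below makes the piecewise behaviour concrete (the values are arbitrary).

import numpy as np

def clipped_loss(diff, clip_delta):
    # Quadratic inside the clip region, linear outside, so the gradient
    # magnitude never exceeds clip_delta.
    quadratic_part = np.minimum(np.abs(diff), clip_delta)
    linear_part = np.abs(diff) - quadratic_part
    return 0.5 * quadratic_part ** 2 + clip_delta * linear_part

diff = np.array([-3.0, -0.5, 0.2, 2.0])
print(clipped_loss(diff, clip_delta=1.0))   # [2.5   0.125 0.02  1.5  ]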
def __init__(self, datasetPaths = None, keyPoints = None): self.xMax = 1024.0 self.yMax = 576.0 self.numKeyPoints = 68 self.loadPicasaTubePickle() loadPrev = 1 if loadPrev == 1: pkl_file = open('faceAlign2.pkl', 'rb') self.pose, self.landmarks, self.poseDict, self.images, self.poseCat = pickle.load(pkl_file) pkl_file.close() else: self.loadData() output = open('faceAlign2.pkl', 'wb') data = (self.pose, self.landmarks, self.poseDict, self.images, self.poseCat) pickle.dump(data, output) output.close() return self.eeta = 0.0000001 self.mu = theano.shared(10 * numpy.random.random((2*self.numKeyPoints, 1))) self.S = theano.shared(numpy.eye(2 * self.numKeyPoints)) self.alpha = theano.shared(0.1 * numpy.ones((2 * self.numKeyPoints,1))) theano.config.compute_test_value = 'warn' oneCol = T.col('oneCol') oneCol.tag.test_value = numpy.ones((self.numKeyPoints,1)) pi_t = T.col('pi_t') pi_t.tag.test_value = numpy.random.random((2*self.numKeyPoints,1)) temp = numpy.random.random((3,3)) #temp = numpy.zeros((3,3)) temp[2,:] = [0,0,1] self.A_t = theano.shared(temp, name='A_t') #print_A_t = theano.printing.Print('r_t1')(A_t) z_t = T.col('z_t') z_t.tag.test_value = numpy.random.random((2*self.numKeyPoints,1)) z_t1 = z_t.reshape((self.numKeyPoints, 2)) pts = T.concatenate((z_t1, oneCol), axis=1) # pts = theano.printing.Print('pts')(pts) r_t = T.dot(self.A_t, pts.transpose()).transpose() r_t1 = r_t[:,0:2].reshape((2*self.numKeyPoints,1)) #pi_tt = theano.printing.Print('pi_t before')(pi_t) diff = pi_t * (r_t1 - self.mu) difft = diff.reshape((1, 2 * self.numKeyPoints)) #diff = theano.printing.Print('diff:')(diff) cost = T.max(T.dot(T.dot(difft,self.S),diff)) #cost = theano.printing.Print('cost:')(cost) A_t_grad = T.grad(cost=cost, wrt=self.A_t) A_t_grad = T.basic.set_subtensor(A_t_grad[2,:],0) #A_t_grad = theano.printing.Print('r_t1')(A_t_grad) update = (self.A_t, self.A_t - self.eeta * A_t_grad) self.align = theano.function(inputs=[pi_t,z_t, oneCol], outputs=[self.A_t, cost], updates=[update], on_unused_input='warn', allow_input_downcast=True) #for numpy optimization A_t_ = T.matrix('A_t_') #A_t_.tag.test_value = temp #A_t_ = A_t_.reshape((3,3)) A_t_.tag.test_value = temp #print_A_t = theano.printing.Print('r_t1')(A_t) r_t_ = T.dot(A_t_, pts.transpose()).transpose() r_t1_ = r_t_[:,0:2].reshape((2*self.numKeyPoints,1)) #pi_tt = theano.printing.Print('pi_t before')(pi_t) diff_ = pi_t * (r_t1_ - self.mu) difft_ = diff_.reshape((1, 2 * self.numKeyPoints)) #diff = theano.printing.Print('diff:')(diff) cost_1 = T.dot(T.dot(difft_,self.S),diff_) #cost_1 = theano.printing.Print('cost is:')(cost_1) cost_ = T.max(cost_1) A_t_grad_ = T.grad(cost=cost_, wrt=A_t_) A_t_grad_ = T.basic.set_subtensor(A_t_grad_[2,:],0) #A_t_grad_ = A_t_grad_.reshape((9,1)) self.cost = theano.function(inputs=[A_t_, pi_t, z_t, oneCol], outputs=[cost_, A_t_grad_]) i = T.iscalar('index') i.tag.test_value = 0 subS = self.S[2*i:2*i+2, 2*i:2*i+2] #subS = theano.printing.Print('subS:')(self.S[2*i:2*i+2, 2*i:2*i+2]) det = T.abs_(subS[0,0]*subS[1,1] - subS[0,1]*subS[1,0]) subDiff = diff[(2*i):((2*i)+2)] subDifft = difft[0][(2*i):(2*i+2)] #intermed = theano.printing.Print('dotProd1:')(T.dot(subDifft,subS)) intermed = T.dot(subDifft,subS) #intermed2 = theano.printing.Print('dotProd2:')(T.dot(intermed,subDiff)) intermed2 = T.dot(intermed,subDiff) numrtr = T.exp(-0.5 * intermed2) k = 2 dnmntr = T.sqrt((2**k) * det) q = numrtr/dnmntr temp = ((1 - self.alpha[2*i:2*i+2]) * q)/(self.alpha[2*i:2*i+2] + (1 - self.alpha[2*i:2*i+2]) * q) pi_t_out = 
T.basic.set_subtensor(pi_t[2*i:2*i+2], temp) self.q_pi_update = theano.function(inputs = [i, oneCol, pi_t, z_t], outputs = [q,pi_t_out, r_t1], allow_input_downcast=True) self.train('12')
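The alignment objective being minimized above is easier to see in plain NumPy: lift the landmarks to homogeneous coordinates, transform them by A_t whose last row stays [0, 0, 1], weight the residual to the mean shape by pi_t, and take the quadratic form with S. The sketch below uses random data and is only illustrative.

import numpy as np

num_keypoints = 68
z = np.random.rand(num_keypoints, 2)               # observed landmarks (x, y)
mu = np.random.rand(2 * num_keypoints, 1)          # mean shape, stacked coordinates
pi = np.random.rand(2 * num_keypoints, 1)          # per-coordinate weights
S = np.eye(2 * num_keypoints)                      # precision-like matrix
A = np.eye(3)                                      # affine transform, last row [0, 0, 1]

pts = np.hstack([z, np.ones((num_keypoints, 1))])  # homogeneous coordinates
r = pts.dot(A.T)[:, :2].reshape(2 * num_keypoints, 1)
diff = pi * (r - mu)
cost = diff.T.dot(S).dot(diff)[0, 0]               # scalar alignment cost
print(cost >= 0.0)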
def __init__(self, input_width, input_height, num_actions, num_frames, discount,
             learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval,
             batch_size, update_rule, batch_accumulator, state_count, input_scale=255.0):
    self.state_count = state_count
    self.input_width = input_width
    self.input_height = input_height
    self.num_actions = num_actions
    self.num_frames = num_frames
    self.batch_size = batch_size
    self.discount = discount
    self.rho = rho
    self.lr = learning_rate
    self.rms_epsilon = rms_epsilon
    self.momentum = momentum
    self.clip_delta = clip_delta
    self.freeze_interval = freeze_interval
    self.update_counter = 0

    self.l_out = self.build_nature_network_dnn(input_width, input_height,
                                               num_actions, num_frames, batch_size)
    if self.freeze_interval > 0:
        self.next_l_out = self.build_nature_network_dnn(input_width, input_height,
                                                        num_actions, num_frames,
                                                        batch_size)
        self.reset_q_hat()

    states = T.matrix('states')
    next_states = T.matrix('next_states')
    rewards = T.col('rewards')
    actions = T.icol('actions')
    terminals = T.icol('terminals')

    # buffer for the inputs of the whole batch
    self.states_shared = theano.shared(
        np.zeros((batch_size, state_count), dtype=theano.config.floatX))
    # buffer for which state each one ends up in
    self.next_states_shared = theano.shared(
        np.zeros((batch_size, state_count), dtype=theano.config.floatX))
    # one reward per episode; what about the individual actions?
    self.rewards_shared = theano.shared(
        np.zeros((batch_size, 1), dtype=theano.config.floatX),
        broadcastable=(False, True))
    # one chosen action per episode
    self.actions_shared = theano.shared(
        np.zeros((batch_size, 1), dtype='int32'),
        broadcastable=(False, True))
    # ?? probably 0 and 1, whether it is the last value or not
    self.terminals_shared = theano.shared(
        np.zeros((batch_size, 1), dtype='int32'),
        broadcastable=(False, True))

    # takes q_vals and next q_vals and returns the differences for the batch,
    # all of it only for the first time
    q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
    if self.freeze_interval > 0:
        next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                next_states / input_scale)
    else:
        next_q_vals = lasagne.layers.get_output(self.l_out,
                                                next_states / input_scale)
        next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

    target = (rewards +
              (T.ones_like(terminals) - terminals) *
              self.discount * T.max(next_q_vals, axis=1, keepdims=True))
    diff = target - q_vals[T.arange(batch_size),
                           actions.reshape((-1,))].reshape((-1, 1))

    # unclear
    if self.clip_delta > 0:
        diff = diff.clip(-self.clip_delta, self.clip_delta)

    if batch_accumulator == 'sum':
        loss = T.sum(diff ** 2)
    elif batch_accumulator == 'mean':
        loss = T.mean(diff ** 2)
    else:
        raise ValueError("Bad accumulator: {}".format(batch_accumulator))

    # parameters of the online network, needed by every update rule below
    params = lasagne.layers.helper.get_all_params(self.l_out)
    givens = {
        states: self.states_shared,
        next_states: self.next_states_shared,
        rewards: self.rewards_shared,
        actions: self.actions_shared,
        terminals: self.terminals_shared
    }
    if update_rule == 'deepmind_rmsprop':
        updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon)
    elif update_rule == 'rmsprop':
        updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon)
    elif update_rule == 'adam':
        updates = lasagne.updates.adam(loss, params, self.lr, self.rho, self.rho, self.rms_epsilon)
    elif update_rule == 'adagrad':
        updates = lasagne.updates.adagrad(loss, params, self.lr, self.rms_epsilon)
    elif update_rule == 'sgd':
        updates = lasagne.updates.sgd(loss, params, self.lr)
    elif update_rule == 'momentum':
        updates = lasagne.updates.momentum(loss, params, self.lr, self.momentum)
    else:
        raise ValueError("Unrecognized update: {}".format(update_rule))

    if self.momentum > 0:
        updates = lasagne.updates.apply_momentum(updates, None, self.momentum)

    self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens)
    self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
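As in the previous network, the target uses a (1 - terminal) mask so that terminal transitions contribute only their immediate reward; the short NumPy sketch below (made-up batch) spells out that computation.

import numpy as np

def q_targets(rewards, next_q_vals, terminals, discount=0.99):
    mask = 1.0 - terminals                       # 0 for terminal transitions
    return rewards + mask * discount * next_q_vals.max(axis=1, keepdims=True)

rewards = np.array([[0.0], [1.0]])
next_q = np.array([[0.5, 2.0], [3.0, 1.0]])
terminals = np.array([[0.0], [1.0]])
print(q_targets(rewards, next_q, terminals))     # [[1.98] [1.  ]]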