def __init__(self, input, n_in, n_out): """ Initialize the parameters of the logistic regression :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the architecture (one minibatch) :type n_in: int :param n_in: number of input units, the dimension of the space in which the datapoints lie :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie """ # initialize with 0 the weights W as a matrix of shape (n_in, n_out) self.W = theano.shared(value=numpy.zeros((n_in, n_out), dtype=theano.config.floatX), name='W', borrow=True) # initialize the baises b as a vector of n_out 0s self.b = theano.shared(value=numpy.zeros((n_out,), dtype=theano.config.floatX), name='b', borrow=True) # compute vector of class-membership probabilities in symbolic form self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) # compute prediction as class whose probability is maximal in # symbolic form self.y_pred = T.argmax(self.p_y_given_x, axis=1) # parameters of the model self.params = [self.W, self.b]
def architecture(self, cons, code_layer):
    """Build up the architecture with Theano."""
    for i in range(len(self.layers) - 1):
        # Initialize shared variables
        init_w = cons * np.random.randn(self.layers[i], self.layers[i+1])
        self.weights.append(th.shared(init_w))
        init_bias = cons * np.random.randn(self.layers[i+1])
        self.biases.append(th.shared(init_bias))

        # Building architecture
        a_before = T.dot(self.a_n[i], self.weights[i]) + \
            self.biases[i].dimshuffle('x', 0)
        a_next = self.activ(a_before)
        self.a_n.append(a_next)

    # auxiliary shared variables used by the optimizer
    for param in (self.weights + self.biases):
        self.auxiliary.append(th.shared(np.zeros(param.get_value().shape)))

    self.encode = th.function([self.x], self.a_n[code_layer])
    self.decode = th.function([self.a_n[code_layer]], self.a_n[-1])

    # Calculate the cost and gradients
    Cost = (T.sum((self.a_n[-1] - self.y_hat) ** 2)) / self.batch
    params = self.weights + self.biases
    grads = T.grad(Cost, params, disconnected_inputs='ignore')

    # Update parameters
    update_query = self.update(params, grads, self.auxiliary)
    self.gradient_2 = th.function(inputs=[self.x, self.y_hat],
                                  updates=update_query, outputs=Cost)
def __init__(self, x, y, in_size, out_size, prefix='lr_'):
    self.W = theano.shared(
        value=np.random.uniform(
            low=-np.sqrt(6. / (in_size + out_size)),
            high=np.sqrt(6. / (in_size + out_size)),
            size=(in_size, out_size)
        ).astype(theano.config.floatX),
        name='W', borrow=True
    )
    self.b = theano.shared(
        value=np.random.uniform(
            low=-np.sqrt(6. / (in_size + out_size)),
            high=np.sqrt(6. / (in_size + out_size)),
            size=(out_size,)
        ).astype(theano.config.floatX),
        name='b', borrow=True
    )
    self.y_given_x = T.nnet.softmax(T.dot(x, self.W) + self.b)
    self.y_d = T.argmax(self.y_given_x, axis=1)
    self.loss = -T.mean(T.log(self.y_given_x)[T.arange(y.shape[0]), y])
    self.error = T.mean(T.neq(self.y_d, y))
    self.params = {prefix + 'W': self.W, prefix + 'b': self.b}
def adam(loss, all_params, learn_rate=0.001, b1=0.9, b2=0.999, e=1e-8, gamma=1-1e-8):
    """ADAM update rules

    Kingma, Diederik, and Jimmy Ba. "Adam: A Method for Stochastic Optimization."
    arXiv preprint arXiv:1412.6980 (2014). http://arxiv.org/pdf/1412.6980v4.pdf
    """
    updates = []
    all_grads = theano.grad(loss, all_params)
    alpha = learn_rate
    t = theano.shared(np.float32(1.))
    b1_t = b1 * gamma ** (t - 1.)  # decay the first-moment running-average coefficient

    for theta_prev, g in zip(all_params, all_grads):
        m_prev = theano.shared(np.zeros(theta_prev.get_value().shape, dtype=theano.config.floatX))
        v_prev = theano.shared(np.zeros(theta_prev.get_value().shape, dtype=theano.config.floatX))

        m = b1_t * m_prev + (1. - b1_t) * g   # update biased first moment estimate
        v = b2 * v_prev + (1. - b2) * g ** 2  # update biased second raw moment estimate
        m_hat = m / (1. - b1 ** t)            # compute bias-corrected first moment estimate
        v_hat = v / (1. - b2 ** t)            # compute bias-corrected second raw moment estimate
        theta = theta_prev - (alpha * m_hat) / (T.sqrt(v_hat) + e)  # update parameters

        updates.append((m_prev, m))
        updates.append((v_prev, v))
        updates.append((theta_prev, theta))
    updates.append((t, t + 1.))
    return updates
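# Hedged usage sketch for adam() above: the update list it returns can be passed
# directly to theano.function via `updates`. The small softmax model (x, y, W, b)
# below is an assumption, not part of the original snippet.
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
y = T.ivector('y')
W = theano.shared(np.zeros((784, 10), dtype=theano.config.floatX), name='W')
b = theano.shared(np.zeros((10,), dtype=theano.config.floatX), name='b')
p_y = T.nnet.softmax(T.dot(x, W) + b)
loss = -T.mean(T.log(p_y)[T.arange(y.shape[0]), y])

# one call per minibatch: computes the loss and applies one Adam step
train = theano.function([x, y], loss, updates=adam(loss, [W, b]))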
def __init__(self, n_in, n_out, W_init=None, b_init=None, activation=T.tanh):
    self.activation = activation
    if W_init is None:
        rng = numpy.random.RandomState(1234)
        W_values = numpy.asarray(
            rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)
            ),
            dtype=theano.config.floatX
        )
        if activation == theano.tensor.nnet.sigmoid:
            W_values *= 4
        W_init = theano.shared(value=W_values, name='W', borrow=True)
    if b_init is None:
        b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
        b_init = theano.shared(value=b_values, name='b', borrow=True)
    self.W = W_init
    self.b = b_init
    # parameters of the model
    self.params = [self.W, self.b]
def __init__(self, adapted_iterator, n_batches, max_mem=8):
    self.adapted_iterator = adapted_iterator
    self.batch_size = adapted_iterator.get_batch_size()
    self.max_mem_bytes = max_mem * 1024 * 1024
    self.n_batches = n_batches
    self.arity = self.adapted_iterator.get_arity()

    # Number of datapoints that can be held in max_mem
    adapted_batch_size = self.max_mem_bytes // adapted_iterator.get_datapoint_sizes().sum()
    # Make it hold an exact integer multiple of self.batch_size
    adapted_batch_size = (adapted_batch_size // self.batch_size) * self.batch_size

    if self.n_batches is None:
        adapted_iterator.n_batches = None
    else:
        # The adapted iterator must loop forever; limiting the number of batches is now done here.
        adapted_iterator.n_batches = np.inf
    adapted_iterator.set_batch_size(adapted_batch_size, True)  # True means get smaller final minibatch

    # Create buffers for each of the elements
    self.buffers = []
    self.minibatch = []
    for i, dimensionality in enumerate(self.adapted_iterator.get_datapoint_dimensionalities()):
        # The big buffer that holds many batches
        self.buffers.append(theano.shared(
            np.zeros((adapted_batch_size, dimensionality),
                     dtype=self.adapted_iterator.dataset.get_type(i)),
            name="buffer_%d" % i))
        self.minibatch.append(theano.shared(
            value=np.zeros((self.batch_size, dimensionality),
                           dtype=self.adapted_iterator.dataset.get_type(i)),
            name="minibatch_%d" % i))
    self.buffer_index = 0
    self.datapoints_in_buffer = 0
def shared(data):
    """Place the data into shared variables.

    This allows Theano to copy the data to the GPU, if one is available.
    """
    shared_x = theano.shared(numpy.asarray(data[:, 0].tolist(), dtype=theano.config.floatX),
                             borrow=True)
    shared_y = theano.shared(numpy.asarray(data[:, 1].tolist(), dtype=theano.config.floatX),
                             borrow=True)
    return shared_x, T.cast(shared_y, "int32")
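# Hedged usage sketch for shared() above: `data` is assumed to be a 2-column
# object array of (features, label) rows, as implied by data[:, 0] / data[:, 1].
import numpy

toy = numpy.array([[[0.0, 1.0], 0],
                   [[1.0, 0.0], 1]], dtype=object)  # hypothetical toy dataset
shared_x, shared_y = shared(toy)
print(shared_x.get_value().shape)  # (2, 2); shared_y is already cast to int32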
def __init__(self, input, n_in, n_out): """ロジスティック回帰モデルの初期化 input: ミニバッチ単位のデータ行列(n_samples, n_in) n_in : 入力の次元数 n_out: 出力の次元数 """ # 重み行列を初期化 self.W = theano.shared(value=np.zeros((n_in, n_out), dtype=theano.config.floatX), name='W', borrow=True) # バイアスベクトルを初期化 self.b = theano.shared(value=np.zeros((n_out,), dtype=theano.config.floatX), name='b', borrow=True) # 各サンプルが各クラスに分類される確率を計算するシンボル # 全データを行列化してまとめて計算している # 出力は(n_samples, n_out)の行列 self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) # 確率が最大のクラスのインデックスを計算 # 出力は(n_samples,)のベクトル self.y_pred = T.argmax(self.p_y_given_x, axis=1) # ロジスティック回帰モデルのパラメータ self.params = [self.W, self.b]
def __init__(self, rng, input, n_in, n_out, W=None, b=None, activation=T.tanh): self.input = input[0] # initialize weights into this layer if W is None: W_values = np.asarray( rng.uniform( size=(n_in, n_out), low=-np.sqrt(6. / (n_in + n_out)), high=np.sqrt(6. / (n_in + n_out)), ), dtype=theano.config.floatX ) if activation == theano.tensor.nnet.sigmoid: W_values *= 4 W = theano.shared(value=W_values, name='W', borrow=True) # initialize bias term weights into this layer if b is None: b_values = np.zeros((n_out,), dtype=theano.config.floatX) b = theano.shared(value=b_values, name='b', borrow=True) self.W = W self.b = b lin_output = T.dot(self.input, self.W) + self.b self.output = ( lin_output if activation is None else activation(lin_output) ) self.params = [self.W, self.b]
def __init__(self, class_dim, word_dim, hidden_dim, sen_len, batch_size, truncate=-1):
    # Assign instance variables
    self.class_dim = class_dim
    self.word_dim = word_dim
    self.hidden_dim = hidden_dim
    self.sen_len = sen_len
    self.batch_size = batch_size
    self.truncate = truncate
    params = {}
    # Initialize the network parameters
    params["E"] = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))           # embedding matrix
    params["W"] = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (4, hidden_dim, hidden_dim * 4))  # W[0-1].dot(x), W[2-3].(i,f,o,c)
    params["B"] = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (2, hidden_dim * 4))              # B[0-1] for W[0-1]
    params["lrW"] = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (2, hidden_dim, class_dim))     # LR W and b
    params["lrb"] = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (class_dim))

    # Assign parameters' names
    self.param_names = {"orign": ["E", "W", "B", "lrW", "lrb"],
                        "cache": ["mE", "mW", "mB", "mlrW", "mlrb"]}

    # Theano: create shared variables
    self.params = {}
    # Model's shared variables
    for _n in self.param_names["orign"]:
        self.params[_n] = theano.shared(value=params[_n].astype(theano.config.floatX), name=_n)
    # Shared variables for RMSProp
    for _n in self.param_names["cache"]:
        self.params[_n] = theano.shared(value=np.zeros(params[_n[1:]].shape).astype(theano.config.floatX), name=_n)

    # Build model graph
    self.__theano_build__()
def __init__(self, filter_shape, image_shape, poolsize=(2, 2), activation_fn=sigmoid): """`filter_shape` is a tuple of length 4, whose entries are the number of filters, the number of input feature maps, the filter height, and the filter width. `image_shape` is a tuple of length 4, whose entries are the mini-batch size, the number of input feature maps, the image height, and the image width. `poolsize` is a tuple of length 2, whose entries are the y and x pooling sizes. """ self.filter_shape = filter_shape self.image_shape = image_shape self.poolsize = poolsize self.activation_fn=activation_fn # initialize weights and biases n_out = (filter_shape[0]*np.prod(filter_shape[2:])/np.prod(poolsize)) self.w = theano.shared( np.asarray( np.random.normal(loc=0, scale=np.sqrt(1.0/n_out), size=filter_shape), dtype=theano.config.floatX), borrow=True) self.b = theano.shared( np.asarray( np.random.normal(loc=0, scale=1.0, size=(filter_shape[0],)), dtype=theano.config.floatX), borrow=True) self.params = [self.w, self.b]
def __init__(self, input, n_in, n_out):
    self.W = theano.shared(
        value=numpy.zeros((n_in, n_out), dtype=theano.config.floatX),
        name='W',
        borrow=True
    )
    self.b = theano.shared(
        value=numpy.zeros((n_out,), dtype=theano.config.floatX),
        name='b',
        borrow=True
    )
    self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
    self.y_pred = T.argmax(self.p_y_given_x, axis=1)
    self.params = [self.W, self.b]
    self.input = input
def init_conv_filters(self, numpy_rng, D, poolsize):
    '''Convolutional filters'''
    # there are "num input feature maps * filter height * filter width"
    # inputs to each hidden unit
    fan_in = np.prod(self.filter_shape[1:])
    # each unit in the lower layer receives a gradient from:
    # "num output feature maps * filter height * filter width" / pooling size
    fan_out = (self.filter_shape[0] * np.prod(self.filter_shape[2:]) / np.prod(poolsize))

    # initialize weights with random weights
    W_bound = np.sqrt(6. / (fan_in + fan_out))
    self.W = theano.shared(
        init_conv_weights(-W_bound, W_bound, self.filter_shape, numpy_rng),
        borrow=True, name='W_conv')

    # b_values = np.zeros((self.filter_shape[0],), dtype=theano.config.floatX)
    # self.b = theano.shared(value=b_values, borrow=True, name='b_conv')

    # visible bias c: one value per input feature map
    c_values = np.zeros((self.filter_shape[1],), dtype=theano.config.floatX)
    self.c = theano.shared(value=c_values, borrow=True, name='c_conv')

    self.params = [self.W, self.c]
def __init__(self, input_variable, rng, n_in=None, n_out=None, weights=None, biases=None, activation=T.tanh): self.input_variable = input_variable if not weights: assert n_in is not None assert n_out is not None W_values = np.asarray(rng.uniform( low=-np.sqrt(6. / (n_in + n_out)), high=np.sqrt(6. / (n_in + n_out)), size=(n_in, n_out)), dtype=theano.config.floatX) if activation == theano.tensor.nnet.sigmoid: W_values *= 4 W = theano.shared(value=W_values, name='W', borrow=True) b_values = np.zeros((n_out,), dtype=theano.config.floatX) b = theano.shared(value=b_values, name='b', borrow=True) else: W = weights b = biases self.W = W self.b = b linear_output = T.dot(self.input_variable, self.W) + self.b self.output = (linear_output if activation is None else activation(linear_output)) self.params = [self.W, self.b]
def stack_and_shared(input):
    """
    This will take a list of input variables, turn them into theano shared
    variables, and return them stacked in a single tensor.

    Parameters
    ----------
    input : list or object
        List of input variables to stack into a single shared tensor.

    Returns
    -------
    tensor
        Symbolic tensor of the input variables stacked, or None if input was None.
    """
    if input is None:
        return None
    elif isinstance(input, list):
        shared_ins = []
        for _in in input:
            try:
                shared_ins.append(theano.shared(_in))
            except TypeError:
                shared_ins.append(_in)
        return T.stack(shared_ins)
    else:
        try:
            _output = [theano.shared(input)]
        except TypeError:
            _output = [input]
        return T.stack(_output)
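# Hedged usage sketch for stack_and_shared(): raw numpy arrays are wrapped in
# shared variables and stacked along a new leading axis.
import numpy as np
import theano

stacked = stack_and_shared([np.ones(3), np.zeros(3)])
f = theano.function([], stacked)
print(f())  # [[1. 1. 1.], [0. 0. 0.]]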
def adam(lr, tparams, grads, inp, cost):
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function(inp, cost, updates=gsup)

    lr0 = 0.0002
    b1 = 0.1
    b2 = 0.001
    e = 1e-8

    updates = []

    i = theano.shared(numpy.float32(0.))
    i_t = i + 1.
    fix1 = 1. - b1**(i_t)
    fix2 = 1. - b2**(i_t)
    lr_t = lr0 * (tensor.sqrt(fix2) / fix1)

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))

    f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore')

    return f_grad_shared, f_update
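# Hedged, self-contained usage sketch for the adam() variant above that returns
# (f_grad_shared, f_update). The tiny softmax model (x, y, W, b) is an assumption,
# and the use of .iteritems() in adam() implies Python 2.
from collections import OrderedDict
import numpy
import theano
import theano.tensor as tensor

x = tensor.matrix('x')
y = tensor.ivector('y')
tparams = OrderedDict()
tparams['W'] = theano.shared(numpy.zeros((5, 3), dtype=theano.config.floatX), name='W')
tparams['b'] = theano.shared(numpy.zeros((3,), dtype=theano.config.floatX), name='b')
p_y = tensor.nnet.softmax(tensor.dot(x, tparams['W']) + tparams['b'])
cost = -tensor.mean(tensor.log(p_y)[tensor.arange(y.shape[0]), y])
grads = tensor.grad(cost, wrt=list(tparams.values()))

lr = tensor.scalar(name='lr')
f_grad_shared, f_update = adam(lr, tparams, grads, [x, y], cost)

# f_grad_shared computes the cost and stores the gradients; f_update applies the step.
batch_cost = f_grad_shared(numpy.random.randn(4, 5).astype(theano.config.floatX),
                           numpy.array([0, 1, 2, 0], dtype='int32'))
f_update(0.0002)  # the learning-rate argument is unused here (lr0 is hard-coded)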
def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2)): """ Allocate a LeNetConvPoolLayer with shared variable internal parameters. :type rng: numpy.random.RandomState :param rng: a random number generator used to initialize weights :type input: theano.tensor.dtensor4 :param input: symbolic image tensor, of shape image_shape :type filter_shape: tuple or list of length 4 :param filter_shape: (number of filters, num input feature maps, filter height,filter width) :type image_shape: tuple or list of length 4 :param image_shape: (batch size, num input feature maps, image height, image width) :type poolsize: tuple or list of length 2 :param poolsize: the downsampling (pooling) factor (#rows,#cols) """ assert image_shape[1] == filter_shape[1] self.input = input # there are "num input feature maps * filter height * filter width" # inputs to each hidden unit fan_in = numpy.prod(filter_shape[1:]) # each unit in the lower layer receives a gradient from: # "num output feature maps * filter height * filter width" / # pooling size fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) / numpy.prod(poolsize)) # initialize weights with random weights W_bound = numpy.sqrt(6. / (fan_in + fan_out)) self.W = theano.shared(numpy.asarray( rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), dtype=theano.config.floatX), borrow=True) # the bias is a 1D tensor -- one bias per output feature map b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) self.b = theano.shared(value=b_values, borrow=True) # convolve input feature maps with filters conv_out = conv.conv2d(input=input, filters=self.W, filter_shape=filter_shape, image_shape=image_shape) # downsample each feature map individually, using maxpooling pooled_out = downsample.max_pool_2d(input=conv_out, ds=poolsize, ignore_border=True) # add the bias term. Since the bias is a vector (1D array), we first # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will # thus be broadcasted across mini-batches and feature map # width & height self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) # store parameters of this layer self.params = [self.W, self.b]
def sgd_updates_adadelta(params,cost,rho=0.95,epsilon=1e-6,norm_lim=9,word_vec_name='Words'): """ adadelta update rule, mostly from https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4 (for Adadelta) """ updates = OrderedDict({}) exp_sqr_grads = OrderedDict({}) exp_sqr_ups = OrderedDict({}) gparams = [] for param in params: empty = numpy.zeros_like(param.get_value()) exp_sqr_grads[param] = theano.shared(value=as_floatX(empty),name="exp_grad_%s" % param.name) gp = T.grad(cost, param) exp_sqr_ups[param] = theano.shared(value=as_floatX(empty), name="exp_grad_%s" % param.name) gparams.append(gp) for param, gp in zip(params, gparams): exp_sg = exp_sqr_grads[param] exp_su = exp_sqr_ups[param] up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp) updates[exp_sg] = up_exp_sg step = -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gp updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step) stepped_param = param + step if (param.get_value(borrow=True).ndim == 2) and (param.name!='Words'): col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0)) desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim)) scale = desired_norms / (1e-7 + col_norms) tmp=stepped_param * scale tmp=T.cast(tmp,'float32') #print param.type,tmp.type updates[param] = tmp else: updates[param] = stepped_param #print param.type,stepped_param.type return updates
def __init__(self, rng, input, n_in, n_out, W=None, b=None):
    self.input = input
    # initialize the weights W; if not given, draw them from a scaled normal distribution
    if W is None:
        W_value = rng.normal(0.0, 1.0 / numpy.sqrt(n_in), size=(n_in, n_out))
        W = theano.shared(value=numpy.asarray(W_value, dtype=theano.config.floatX),
                          name='W', borrow=True)
    if b is None:
        b = theano.shared(value=numpy.zeros((n_out,), dtype=theano.config.floatX),
                          name='b', borrow=True)
    self.W = W
    self.b = b
    self.delta_W = theano.shared(value=numpy.zeros((n_in, n_out), dtype=theano.config.floatX),
                                 name='delta_W')
    self.delta_b = theano.shared(value=numpy.zeros_like(self.b.get_value(borrow=True),
                                                        dtype=theano.config.floatX),
                                 name='delta_b')
    self.output = T.dot(self.input, self.W) + self.b
    self.params = [self.W, self.b]
    self.delta_params = [self.delta_W, self.delta_b]
def adadelta(lr, tparams, grads, x, mask, y, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rup2' % k)
                   for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    # list of parameter updates
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore', name='adadelta_f_update')

    return f_grad_shared, f_update
def sigmoid_layer(input, n_in, n_out, rng):
    w_init = rng.uniform(low=-4 * np.sqrt(6.0 / (n_in + n_out)),
                         high=4 * np.sqrt(6.0 / (n_in + n_out)),
                         size=(n_in, n_out))
    W = theano.shared(np.asarray(w_init, dtype=theano.config.floatX),
                      name="W", borrow=True)
    b = theano.shared(np.zeros((n_out,), dtype=theano.config.floatX),
                      name="b", borrow=True)
    return T.nnet.sigmoid(T.dot(input, W) + b), [W, b]
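# Hedged usage sketch for sigmoid_layer(): stacking two layers and collecting
# their parameters; the layer sizes and the RandomState seed are assumptions.
import numpy as np
import theano
import theano.tensor as T

rng = np.random.RandomState(0)
x = T.matrix('x')
h1, params1 = sigmoid_layer(x, 784, 256, rng)
h2, params2 = sigmoid_layer(h1, 256, 10, rng)
params = params1 + params2          # [W1, b1, W2, b2], e.g. for use with T.grad
forward = theano.function([x], h2)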
def __init__(self, rng, input, n_in, n_out, W=None, b=None, activation=T.tanh):
    if W is None:
        W_values = numpy.asarray(
            rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)
            ),
            dtype=theano.config.floatX
        )
        if activation == theano.tensor.nnet.sigmoid:
            W_values *= 4
        W_branches = theano.shared(value=W_values, name='W_branches', borrow=True)

    if b is None:
        b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
        b_1 = theano.shared(value=b_values, name='b_1', borrow=True)

    self.W_branches = W_branches
    self.b_1 = b_1

    # NOTE: this layer is unfinished in the original source: `sub_branch_dist`
    # is never defined and `self.output` is left commented out.
    sub_branch_type = ""
    z_i = T.concatenate(self.W_branches[sub_branch_type] + self.W_branches[sub_branch_dist])
    # self.output =

    self.params = [self.W_branches, self.b_1]
def setup(self, prev_layer):
    self.input_layer = prev_layer
    self.input = prev_layer.output
    self.W = theano.shared(
        np.random.random((self.input_layer.output_shape, self.output_shape))
        .astype(theano.config.floatX) * .01)
    self.b = theano.shared(np.zeros(self.output_shape, dtype=theano.config.floatX))
    self.params = (self.W, self.b)
    self.output = self.activation(T.dot(self.input, self.W) + self.b.dimshuffle('x', 0))
def adadelta(lr, tparams, grads, inp, cost, extra_ups=[], extra_outs=[], exclude_params=set([])): '''Adadelta''' zipped_grads = [theano.shared(p.get_value() * np.float32(0.), name='%s_grad'%k) for k, p in tparams.iteritems()] running_up2 = [theano.shared(p.get_value() * np.float32(0.), name='%s_rup2'%k) for k, p in tparams.iteritems()] running_grads2 = [theano.shared(p.get_value() * np.float32(0.), name='%s_rgrad2'%k) for k, p in tparams.iteritems()] zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)] f_grad_shared = theano.function( inp, [cost]+extra_outs, updates=zgup+rg2up+extra_ups, profile=profile) updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)] ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) for ru2, ud in zip(running_up2, updir)] param_up = [(p, p + ud) for p, ud in zip(tools.itemlist(tparams), updir) if p.name not in exclude_params] if not isinstance(lr, list): lr = [lr] f_update = theano.function(lr, [], updates=ru2up+param_up, on_unused_input='ignore', profile=profile) return f_grad_shared, f_update
def __init__(self, kernel, max_iter = 10, max_diff = None): """ :param kernel: a function with a signature (expected, observed) -> a similarity measure that accepts symbolic theano expressions and returns them accordingly. See `crayimage.hotornot.em.kernels` for examples. :param max_iter: maximal number of iteration :param max_diff: stop iterations if maximal difference in weights from the previous iteration is smaller than `max_diff`. If None the check is not performed. """ self.original_shape = None self.kernel = kernel self.max_iter = max_iter self.max_diff = max_diff self.X = theano.shared( np.zeros(shape=(0, 0), dtype='float32') ) self.weights = theano.shared( np.ones(shape=(0, ), dtype='float32') ) canonical = T.sum(self.weights[:, None] * self.X, axis=0) / T.sum(self.weights) weights_updates = self.kernel(canonical, self.X) weights_diff = T.max(abs(weights_updates - self.weights)) upd = { self.weights : weights_updates } self.iteration = theano.function([], weights_diff if max_diff is not None else [], updates=upd) self.get_canonical = theano.function([], canonical)
def __init__(self, network, **kwargs): # due to the way that theano handles updates, we cannot update a # parameter twice during the same function call. so, instead of handling # everything in the updates for self.f_learn(...), we split the # parameter updates into two function calls. the first "prepares" the # parameters for the gradient computation by moving the entire model one # step according to the current velocity. then the second computes the # gradient at that new model position and performs the usual velocity # and parameter updates. self.params = network.params(**kwargs) self.momentum = kwargs.get('momentum', 0.5) # set up space for temporary variables used during learning. self._steps = [] self._velocities = [] for param in self.params: v = param.get_value() n = param.name self._steps.append(theano.shared(np.zeros_like(v), name=n + '_step')) self._velocities.append(theano.shared(np.zeros_like(v), name=n + '_vel')) # step 1. move to the position in parameter space where we want to # compute our gradient. prepare = [] for param, step, velocity in zip(self.params, self._steps, self._velocities): prepare.append((step, self.momentum * velocity)) prepare.append((param, param + step)) logging.info('compiling NAG adjustment function') self.f_prepare = theano.function([], [], updates=prepare) super(NAG, self).__init__(network, **kwargs)
def _init_params(self): self.W_hhs = [] self.W_shortp = [] for dx in xrange(self.n_layers): W_hh = self.init_fn[dx](self.n_hids[(dx-1)%self.n_layers], self.n_hids[dx], self.sparsity[dx], self.scale[dx], rng=self.rng) self.W_hhs.append(theano.shared(value=W_hh, name="W%d_%s" % (dx,self.name))) if dx > 0: W_shp = self.init_fn[dx](self.n_hids[self.n_layers-1], self.n_hids[dx], self.sparsity[dx], self.scale[dx], rng=self.rng) self.W_shortp.append(theano.shared(value=W_shp, name='W_s%d_%s'%(dx,self.name))) self.params = [x for x in self.W_hhs] +\ [x for x in self.W_shortp] self.params_grad_scale = [self.grad_scale for x in self.params] self.restricted_params = [x for x in self.params] if self.weight_noise: self.nW_hhs = [theano.shared(x.get_value()*0, name='noise_'+x.name) for x in self.W_hhs] self.nW_shortp = [theano.shared(x.get_value()*0, name='noise_'+x.name) for x in self.W_shortp] self.noise_params = [x for x in self.nW_hhs] + [x for x in self.nW_shortp] self.noise_params_shape_fn = [constant_shape(x.get_value().shape) for x in self.noise_params]
def check_parameter(name, value): parameters = set() constants = set() observeds = set() if isinstance(value, SharedVariable): parameters.add(value) elif isinstance(value, T.TensorConstant): constants.add(value) elif isinstance(value, T.TensorVariable): inputs = graph.inputs([value]) for var in inputs: if isinstance(var, SharedVariable): parameters.add(var) elif isinstance(var, T.TensorConstant): constants.add(var) elif isinstance(var, T.TensorVariable): if not var.name: raise ValueError("Observed variables must be named.") observeds.add(var) else: # XXX allow for lists and convert them to ndarray if isinstance(value, np.ndarray): value = theano.shared(value, name=name) else: value = theano.shared(float(value), name=name) parameters.add(value) return value, parameters, constants, observeds
def optimizer(loss, param):
    updates = OrderedDict()
    # make sure we always iterate over a list of parameters
    if not isinstance(param, list):
        param = list(param)
    # n_win, learning_rate and epsilon are assumed to come from the enclosing scope
    for param_ in param:
        i = theano.shared(np.array(0, dtype=theano.config.floatX))
        i_int = i.astype('int64')
        value = param_.get_value(borrow=True)
        accu = theano.shared(
            np.zeros(value.shape + (n_win,), dtype=value.dtype))
        grad = tt.grad(loss, param_)

        # Append squared gradient vector to accu_new
        accu_new = tt.set_subtensor(accu[:, i_int], grad ** 2)
        i_new = tt.switch((i + 1) < n_win, i + 1, 0)
        updates[accu] = accu_new
        updates[i] = i_new

        accu_sum = accu_new.sum(axis=1)
        updates[param_] = param_ - (learning_rate * grad /
                                    tt.sqrt(accu_sum + epsilon))
    return updates
def generate_beta_arr(self, step1_beta): """ Generate the noise covariances, beta_t, for the forward trajectory. """ # lower bound on beta min_beta_val = 1e-6 min_beta_values = np.ones((self.trajectory_length,))*min_beta_val min_beta_values[0] += step1_beta min_beta = theano.shared(value=min_beta_values.astype(theano.config.floatX), name='min beta') # (potentially learned) function for how beta changes with timestep # TODO add beta_perturb_coefficients to the parameters to be learned beta_perturb_coefficients_values = np.zeros((self.n_temporal_basis,)) beta_perturb_coefficients = theano.shared( value=beta_perturb_coefficients_values.astype(theano.config.floatX), name='beta perturb coefficients') beta_perturb = T.dot(self.temporal_basis.T, beta_perturb_coefficients) # baseline behavior of beta with time -- destroy a constant fraction # of the original data variance each time step # NOTE 2 below means a fraction ~1/T of the variance will be left at the end of the # trajectory beta_baseline = 1./np.linspace(self.trajectory_length, 2., self.trajectory_length) beta_baseline_offset = util.logit_np(beta_baseline).astype(theano.config.floatX) # and the actual beta_t, restricted to be between min_beta and 1-[small value] beta_arr = T.nnet.sigmoid(beta_perturb + beta_baseline_offset) beta_arr = min_beta + beta_arr * (1 - min_beta - 1e-5) beta_arr = beta_arr.reshape((self.trajectory_length, 1)) return beta_arr
def train( dim_word=100, # word vector dimensionality dim=1000, # the number of LSTM units encoder='gru', patience=10, max_epochs=5000, dispFreq=100, decay_c=0., alpha_c=0., diag_c=0., lrate=0.01, n_words=100000, maxlen=100, # maximum length of the description optimizer='rmsprop', batch_size=16, valid_batch_size=16, saveto='model.npz', validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=100, # generate some samples after every sampleFreq updates dataset='/data/lisatmp3/chokyun/wikipedia/extracted/wiki.tok.txt.gz', valid_dataset='../data/dev/newstest2011.en.tok', dictionary='/data/lisatmp3/chokyun/wikipedia/extracted/wiki.tok.txt.gz.pkl', use_dropout=False, reload_=False): # Model options model_options = locals().copy() with open(dictionary, 'rb') as f: worddicts = pkl.load(f) worddicts_r = dict() for kk, vv in worddicts.iteritems(): worddicts_r[vv] = kk # reload options if reload_ and os.path.exists(saveto): with open('%s.pkl' % saveto, 'rb') as f: models_options = pkl.load(f) print 'Loading data' train = TextIterator(dataset, dictionary, n_words_source=n_words, batch_size=batch_size, maxlen=maxlen) valid = TextIterator(valid_dataset, dictionary, n_words_source=n_words, batch_size=valid_batch_size, maxlen=maxlen) print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): params = load_params(saveto, params) tparams = init_tparams(params) trng, use_noise, \ x, x_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) inps = [x, x_mask] print 'Buliding sampler' f_next = build_sampler(tparams, model_options, trng) # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) print 'Done' cost = cost.mean() if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) print 'Done' print 'Computing gradient...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Done' print 'Building f_grad...', f_grad = theano.function(inps, grads, profile=profile) print 'Done' lr = tensor.scalar(name='lr') print 'Building optimizers...', f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Done' print 'Optimization' history_errs = [] # reload history if reload_ and os.path.exists(saveto): history_errs = list(numpy.load(saveto)['history_errs']) best_p = None bad_count = 0 if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size if sampleFreq == -1: sampleFreq = len(train[0]) / batch_size uidx = 0 estop = False for eidx in xrange(max_epochs): n_samples = 0 for x in train: n_samples += len(x) uidx += 1 use_noise.set_value(1.) x, x_mask = prepare_data(x, maxlen=maxlen, n_words=n_words) if x == None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() cost = f_grad_shared(x, x_mask) f_update(lrate) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. 
if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', #import ipdb; ipdb.set_trace() if best_p != None: params = best_p else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) print 'Done' if numpy.mod(uidx, sampleFreq) == 0: # FIXME: random selection? for jj in xrange(5): sample, score = gen_sample(tparams, f_next, model_options, trng=trng, maxlen=30, argmax=False) print 'Sample ', jj, ': ', ss = sample for vv in ss: if vv == 0: break if vv in worddicts_r: print worddicts_r[vv], else: print 'UNK', print if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid) valid_err = valid_errs.mean() history_errs.append(valid_err) if uidx == 0 or valid_err <= numpy.array(history_errs).min(): best_p = unzip(tparams) bad_counter = 0 if len(history_errs) > patience and valid_err >= numpy.array( history_errs)[:-patience].min(): bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break if numpy.isnan(valid_err): import ipdb ipdb.set_trace() print 'Valid ', valid_err print 'Seen %d samples' % n_samples if estop: break if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) valid_err = pred_probs(f_log_probs, prepare_data, model_options, valid).mean() print 'Valid ', valid_err params = copy.copy(best_p) numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs, **params) return valid_err
def create_recursive_unit(self): self.W_i = theano.shared( self.init_matrix([self.hidden_dim, self.emb_dim])) self.U_i = theano.shared( self.init_matrix([self.hidden_dim, self.hidden_dim])) self.b_i = theano.shared(self.init_vector([self.hidden_dim])) self.W_f = theano.shared( self.init_matrix([self.hidden_dim, self.emb_dim])) self.U_f = theano.shared( self.init_matrix([self.hidden_dim, self.hidden_dim])) self.b_f = theano.shared(self.init_vector([self.hidden_dim])) self.W_o = theano.shared( self.init_matrix([self.hidden_dim, self.emb_dim])) self.U_o = theano.shared( self.init_matrix([self.hidden_dim, self.hidden_dim])) self.b_o = theano.shared(self.init_vector([self.hidden_dim])) self.W_u = theano.shared( self.init_matrix([self.hidden_dim, self.emb_dim])) self.U_u = theano.shared( self.init_matrix([self.hidden_dim, self.hidden_dim])) self.b_u = theano.shared(self.init_vector([self.hidden_dim])) self.params.extend([ self.W_i, self.U_i, self.b_i, self.W_f, self.U_f, self.b_f, self.W_o, self.U_o, self.b_o, self.W_u, self.U_u, self.b_u ]) def unit(parent_x, child_h, child_c, child_exists): h_tilde = T.sum(child_h, axis=0) i = T.nnet.sigmoid( T.dot(self.W_i, parent_x) + T.dot(self.U_i, h_tilde) + self.b_i) o = T.nnet.sigmoid( T.dot(self.W_o, parent_x) + T.dot(self.U_o, h_tilde) + self.b_o) u = T.tanh( T.dot(self.W_u, parent_x) + T.dot(self.U_u, h_tilde) + self.b_u) f = (T.nnet.sigmoid( T.dot(self.W_f, parent_x).dimshuffle('x', 0) + T.dot(child_h, self.U_f.T) + self.b_f.dimshuffle('x', 0)) * child_exists.dimshuffle(0, 'x')) c = i * u + T.sum(f * child_c, axis=0) h = o * T.tanh(c) return h, c return unit
def __init__(self, num_emb, tag_size, emb_dim, hidden_dim, output_dim, degree=2, learning_rate=0.01, momentum=0.9, trainable_embeddings=True, labels_on_nonroot_nodes=False, irregular_tree=False, pairwise=True): assert emb_dim > 1 and hidden_dim > 1 self.num_emb = num_emb self.tag_size = tag_size self.emb_dim = emb_dim self.hidden_dim = hidden_dim self.output_dim = output_dim self.degree = degree self.learning_rate = learning_rate self.L2_ratio = L2_RATIO self.Pairwise = pairwise self.params = [] np.random.seed(SEED) #self.embeddings = theano.shared(self.init_matrix([self.num_emb, self.emb_dim])) self.embeddings = theano.shared( self.init_matrix([self.num_emb, self.emb_dim])) self.params.append(self.embeddings) self.recursive_unit = self.create_recursive_unit() self.leaf_unit = self.create_leaf_unit() #self.output_fn = self.create_output_fn() self.score_fn = self.create_score_fn() self.x1 = T.ivector(name='x1') # word indices self.x2 = T.ivector(name='x2') # word indices self.tag_1 = T.ivector(name='tag1') # word indices self.tag_2 = T.ivector(name='tag2') # word indices self.x1_2 = T.ivector(name='x1_2') # word indices self.x2_1 = T.ivector(name='x2_1') # word indices self.num_words = self.x1.shape[0] self.emb_x1 = self.embeddings[self.x1] self.emb_x1 = self.emb_x1 * T.neq(self.x1, -1).dimshuffle( 0, 'x') # zero-out non-existent embeddings self.emb_x2 = self.embeddings[self.x2] self.emb_x2 = self.emb_x2 * T.neq(self.x2, -1).dimshuffle( 0, 'x') # zero-out non-existent embeddings self.tree_1 = T.imatrix(name='tree1') # shape [None, self.degree] self.tree_2 = T.imatrix(name='tree2') # shape [None, self.degree] self.tree_3 = T.imatrix(name='tree3') # shape [None, self.degree] self.tree_4 = T.imatrix(name='tree4') # shape [None, self.degree] self.tree_states_1, self.score1 = self.compute_tree( self.emb_x1, self.tree_1[:, :-1]) self.tree_states_2, self.score2 = self.compute_tree( self.emb_x2, self.tree_2[:, :-1]) #self._compute_emb = theano.function([self.x1,self.tree_1],self.tree_states_1) if self.Pairwise: self.forget_unit = self.create_forget_gate_fun() self._train_pairwise, self._predict_pair = self.create_pairwise_rank( ) else: self._predict, self._train_pointwise = self.create_pointwise_rank()
import theano
import theano.tensor as T

# symbolic inputs and the expression they feed (implied by the function below)
A = T.matrix('A')
v = T.vector('v')
w = A.dot(v)

matrix_times_vector = theano.function(inputs=[A, v], outputs=[w])

# let's import numpy so we can create real arrays
import numpy as np
A_val = np.array([[1, 2], [3, 4]])
v_val = np.array([5, 6])

w_val = matrix_times_vector(A_val, v_val)
print(w_val)

# let's create a shared variable so we can do gradient descent
# this adds another layer of complexity to the theano function
# the first argument is its initial value, the second is its name
x = theano.shared(20.0, 'x')

# a cost function that has a minimum value
cost = x * x + x + 1

# in theano, you don't have to compute gradients yourself!
x_update = x - 0.3 * T.grad(cost, x)

# x is not an "input", it's a thing you update
# in later examples, data and labels would go into the inputs
# and model params would go in the updates
# updates takes in a list of tuples, each tuple has 2 things in it:
# 1) the shared variable to update, 2) the update expression
train = theano.function(inputs=[], outputs=cost, updates=[(x, x_update)])

# write your own loop to call the training function.
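# A minimal example of such a loop, as a sketch: repeatedly call train() and
# watch the cost of x*x + x + 1 move toward its minimum at x = -0.5.
for i in range(25):
    cost_val = train()
    print(i, cost_val, x.get_value())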
def __init__( self, X_data: np.ndarray, Y_data: np.ndarray, data_type: str = 'float32', n_iter = 200000, learning_rate = 0.001, total_grad_norm_constraint = 200, verbose = True, var_names=None, var_names_read=None, obs_names=None, fact_names=None, sample_id=None, n_factors = 7, cutoff_poisson = 1000, h_alpha = 1 ): ############# Initialise parameters ################ super().__init__(X_data, 0, data_type, n_iter, learning_rate, total_grad_norm_constraint, verbose, var_names, var_names_read, obs_names, fact_names, sample_id) self.Y_data = Y_data self.y_data = theano.shared(Y_data.astype(self.data_type)) self.n_rois = Y_data.shape[0] self.l_r = np.array([np.sum(X_data[i,:]) for i in range(self.n_rois)]).reshape(self.n_rois,1)/self.n_genes self.n_factors = n_factors self.n_npro = Y_data.shape[1] self.cutoff_poisson = cutoff_poisson self.poisson_residual = self.X_data < self.cutoff_poisson self.gamma_residual = self.X_data > self.cutoff_poisson self.X_data1 = self.X_data[self.poisson_residual] self.X_data2 = self.X_data[self.gamma_residual] self.genes = var_names self.sample_names = obs_names self.h_alpha = h_alpha ############# Define the model ################ self.model = pm.Model() with self.model: ### Negative Probe Counts ### # Prior for distribution of negative probe count levels: self.b_n_hyper = pm.Gamma('b_n_hyper', alpha = np.array((3,1)), beta = np.array((1,1)), shape = 2) self.b_n = pm.Gamma('b_n', mu = self.b_n_hyper[0], sigma = self.b_n_hyper[1], shape = (1,self.n_npro)) self.y_rn = self.b_n*self.l_r ### Gene Counts ### # Background for gene probes, drawn from the same distribution as negative probes: self.b_g = pm.Gamma('b_g', mu = self.b_n_hyper[0], sigma = self.b_n_hyper[1], shape = (1,self.n_genes)) # Gene expression modeled as combination of non-negative factors: self.h_hyp = pm.Gamma('h_hyp', 1, 1, shape = 1) self.h = pm.Gamma('h', alpha = 1, beta = self.h_hyp, shape=(self.n_genes, self.n_factors)) self.w_hyp = pm.Gamma('w_hyp', np.array((1,1)), np.array((1,1)), shape=(self.n_factors,2)) self.w = pm.Gamma('w', mu=self.w_hyp[:,0], sigma=self.w_hyp[:,1], shape=(self.n_rois, self.n_factors)) self.a_gr = pm.Deterministic('a_gr', pm.math.dot(self.w, self.h.T)) # Expected gene counts are sum of gene expression and background counts, scaled by library size: self.x_rg = (self.a_gr + self.b_g)*self.l_r self.data_target = pm.DensityDist('data_target', self.get_logDensity, observed=tt.concatenate([self.y_data, self.x_data], axis = 1))
def init_weights(shape):
    return theano.shared(floatX(np.random.randn(*shape) * 0.01))
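# Hedged usage sketch for init_weights(); floatX() is assumed to be a helper that
# casts to theano.config.floatX, as elsewhere in this code.
W1 = init_weights((784, 256))
W2 = init_weights((256, 10))
print(W1.get_value().shape)  # (784, 256)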
def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2)): """ Allocate a LeNetConvPoolLayer with shared variable internal parameters. :type rng: numpy.random.RandomState :param rng: a random number generator used to initialize weights :type input: theano.tensor.dtensor4 :param input: symbolic image tensor, of shape image_shape :type filter_shape: tuple or list of length 4 :param filter_shape: (number of filters, num input feature maps, filter height, filter width) :type image_shape: tuple or list of length 4 :param image_shape: (batch size, num input feature maps, image height, image width) :type poolsize: tuple or list of length 2 :param poolsize: the downsampling (pooling) factor (#rows, #cols) """ assert image_shape[1] == filter_shape[1] self.input = input # there are "num input feature maps * filter height * filter width" # inputs to each hidden unit fan_in = numpy.prod(filter_shape[1:]) # each unit in the lower layer receives a gradient from: # "num output feature maps * filter height * filter width" / # pooling size fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) / numpy.prod(poolsize)) # initialize weights with random weights W_bound = numpy.sqrt(6. / (fan_in + fan_out)) self.W = theano.shared(numpy.asarray(rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), dtype=theano.config.floatX), borrow=True) # the bias is a 1D tensor -- one bias per output feature map b_values = numpy.zeros((filter_shape[0], ), dtype=theano.config.floatX) self.b = theano.shared(value=b_values, borrow=True) # convolve input feature maps with filters conv_out = conv.conv2d(input=input, filters=self.W, filter_shape=filter_shape, image_shape=image_shape) # downsample each feature map individually, using maxpooling pooled_out = downsample.max_pool_2d(input=conv_out, ds=poolsize, ignore_border=True) # add the bias term. Since the bias is a vector (1D array), we first # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias will # thus be broadcasted across mini-batches and feature map # width & height self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) # store parameters of this layer self.params = [self.W, self.b] # keep track of model input self.input = input
def init_tparams(params):
    tparams = OrderedDict()
    for k, v in params.iteritems():
        tparams[k] = theano.shared(v, name=k)
    return tparams
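# Hedged usage sketch for init_tparams(): wrapping a dict of numpy arrays into
# named shared variables (.iteritems() implies Python 2). The parameter names
# and shapes below are assumptions.
from collections import OrderedDict
import numpy as np

params = OrderedDict()
params['W_emb'] = 0.01 * np.random.randn(1000, 128).astype('float32')
params['b'] = np.zeros((128,), dtype='float32')
tparams = init_tparams(params)
print(tparams['W_emb'].name)  # 'W_emb'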
def __init__(self, We_initial, params): initial_We = theano.shared(np.asarray(We_initial, dtype=config.floatX)) We = theano.shared(np.asarray(We_initial, dtype=config.floatX)) if params.npc > 0: pc = theano.shared(np.asarray(params.pc, dtype=config.floatX)) g1batchindices = T.imatrix() g1mask = T.matrix() scores = T.matrix() l_in = lasagne.layers.InputLayer((None, None)) l_mask = lasagne.layers.InputLayer(shape=(None, None)) l_emb = lasagne.layers.EmbeddingLayer( l_in, input_size=We.get_value().shape[0], output_size=We.get_value().shape[1], W=We) l_average = lasagne_average_layer([l_emb, l_mask]) l_out = lasagne.layers.DenseLayer(l_average, params.layersize, nonlinearity=params.nonlinearity) embg = lasagne.layers.get_output(l_out, { l_in: g1batchindices, l_mask: g1mask }) if params.npc <= 0: print "#pc <=0, do not remove pc" elif params.npc == 1: print "#pc == 1" proj = embg.dot(pc.transpose()) embg = embg - theano.tensor.outer(proj, pc) else: print "#pc > 1" proj = embg.dot(pc.transpose()) embg = embg - theano.tensor.dot(proj, pc) l_in2 = lasagne.layers.InputLayer((None, params.layersize)) l_sigmoid = lasagne.layers.DenseLayer( l_in2, params.memsize, nonlinearity=lasagne.nonlinearities.sigmoid) l_softmax = lasagne.layers.DenseLayer(l_sigmoid, 2, nonlinearity=T.nnet.softmax) X = lasagne.layers.get_output(l_softmax, {l_in2: embg}) cost = T.nnet.categorical_crossentropy(X, scores) prediction = T.argmax(X, axis=1) self.network_params = lasagne.layers.get_all_params( l_out, trainable=True) + lasagne.layers.get_all_params( l_softmax, trainable=True) self.network_params.pop( 0) # do not include the word embedding as network parameters self.all_params = lasagne.layers.get_all_params( l_out, trainable=True) + lasagne.layers.get_all_params( l_softmax, trainable=True) reg = self.getRegTerm(params, We, initial_We) self.trainable = self.getTrainableParams(params) cost = T.mean(cost) + reg self.feedforward_function = theano.function([g1batchindices, g1mask], embg) self.scoring_function = theano.function([g1batchindices, g1mask], prediction) self.cost_function = theano.function([scores, g1batchindices, g1mask], cost) grads = theano.gradient.grad(cost, self.trainable) if params.clip: grads = [ lasagne.updates.norm_constraint(grad, params.clip, range(grad.ndim)) for grad in grads ] updates = params.learner(grads, self.trainable, params.eta) self.train_function = theano.function([scores, g1batchindices, g1mask], cost, updates=updates)
def init_tparams(params):
    tparams = OrderedDict()
    for kk, pp in params.iteritems():
        tparams[kk] = theano.shared(params[kk], name=kk)
    return tparams
def __init__(self, Nlayers = 1, # number of layers Ndirs = 1, # unidirectional or bidirectional Nx = 100, # input size Nh = 100, # hidden layer size Ny = 100, # output size Ah = "relu", # hidden unit activation (e.g. relu, tanh, lstm) Ay = "linear", # output unit activation (e.g. linear, sigmoid, softmax) predictPer = "frame", # frame or sequence loss = None, # loss function (e.g. mse, ce, ce_group, hinge, squared_hinge) L1reg = 0.0, # L1 regularization L2reg = 0.0, # L2 regularization momentum = 0.0, # SGD momentum seed = 15213, # random seed for initializing the weights frontEnd = None, # a lambda function for transforming the input filename = None, # initialize from file initParams = None, # initialize from given dict ): if filename is not None: # load parameters from file with smart_open(filename, "rb") as f: initParams = dill.load(f) if initParams is not None: # load parameters from given dict self.paramNames = [] self.params = [] for k, v in initParams.iteritems(): if type(v) is numpy.ndarray: self.addParam(k, v) else: setattr(self, k, v) self.paramNames.append(k) # F*ck, locals()[k] = v doesn't work; I have to do this statically Nlayers, Ndirs, Nx, Nh, Ny, Ah, Ay, predictPer, loss, L1reg, L2reg, momentum, frontEnd \ = self.Nlayers, self.Ndirs, self.Nx, self.Nh, self.Ny, self.Ah, self.Ay, self.predictPer, self.loss, self.L1reg, self.L2reg, self.momentum, self.frontEnd else: # Initialize parameters randomly # Names of parameters to save to file self.paramNames = ["Nlayers", "Ndirs", "Nx", "Nh", "Ny", "Ah", "Ay", "predictPer", "loss", "L1reg", "L2reg", "momentum", "frontEnd"] for name in self.paramNames: value = locals()[name] setattr(self, name, value) # Values of parameters for building the computational graph self.params = [] # Initialize random number generators global rng rng = numpy.random.RandomState(seed) # Construct parameter matrices Nlstm = 4 if Ah == 'lstm' else 1 self.addParam("Win", rand_init((Nx, Nh * Ndirs * Nlstm), Ah)) self.addParam("Wrec", rand_init((Nlayers, Ndirs, Nh, Nh * Nlstm), Ah)) self.addParam("Wup", rand_init((Nlayers - 1, Nh * Ndirs, Nh * Ndirs * Nlstm), Ah)) self.addParam("Wout", rand_init((Nh * Ndirs, Ny), Ay)) if Ah != "lstm": self.addParam("Bhid", zeros((Nlayers, Nh * Ndirs))) else: self.addParam("Bhid", numpy.tile(numpy.hstack([full((Nlayers, Nh), 1.0), zeros((Nlayers, Nh * 3))]), (1, Ndirs))) self.addParam("Bout", zeros(Ny)) self.addParam("h0", zeros((Nlayers, Ndirs, Nh))) if Ah == "lstm": self.addParam("c0", zeros((Nlayers, Ndirs, Nh))) # Compute total number of parameters self.nParams = sum(x.get_value().size for x in self.params) # Initialize gradient tensors when using momentum if momentum > 0: self.dparams = [theano.shared(zeros(x.get_value().shape)) for x in self.params] # Build computation graph input = T.ftensor3() mask = T.imatrix() mask_int = [(mask % 2).nonzero(), (mask >= 2).nonzero()] mask_float = [T.cast((mask % 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX), T.cast((mask >= 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX)] # mask_int = [(mask & 1).nonzero(), (mask & 2).nonzero()] # mask_float = [T.cast((mask & 1).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX), # T.cast(((mask & 2) / 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX)] def step_rnn(x_t, mask, h_tm1, W, h0): h_tm1 = T.switch(mask, h0, h_tm1) return [ACTIVATION[Ah](x_t + h_tm1.dot(W))] def step_lstm(x_t, mask, c_tm1, 
h_tm1, W, c0, h0): c_tm1 = T.switch(mask, c0, c_tm1) h_tm1 = T.switch(mask, h0, h_tm1) a = x_t + h_tm1.dot(W) f_t = T.nnet.sigmoid(a[:, :Nh]) i_t = T.nnet.sigmoid(a[:, Nh : Nh * 2]) o_t = T.nnet.sigmoid(a[:, Nh * 2 : Nh * 3]) c_t = T.tanh(a[:, Nh * 3:]) * i_t + c_tm1 * f_t h_t = T.tanh(c_t) * o_t return [c_t, h_t] x = input if frontEnd is None else frontEnd(input) for i in range(Nlayers): h = (x.dimshuffle((1, 0, 2)).dot(self.Win) if i == 0 else h.dot(self.Wup[i-1])) + self.Bhid[i] rep = lambda x: T.extra_ops.repeat(x.reshape((1, -1)), h.shape[1], axis = 0) if Ah != "lstm": h = T.concatenate([theano.scan( fn = step_rnn, sequences = [h[:, :, Nh * d : Nh * (d+1)], mask_float[d]], outputs_info = [rep(self.h0[i, d])], non_sequences = [self.Wrec[i, d], rep(self.h0[i, d])], go_backwards = (d == 1), )[0][::(1 if d == 0 else -1)] for d in range(Ndirs)], axis = 2) else: h = T.concatenate([theano.scan( fn = step_lstm, sequences = [h[:, :, Nh * 4 * d : Nh * 4 * (d+1)], mask_float[d]], outputs_info = [rep(self.c0[i, d]), rep(self.h0[i, d])], non_sequences = [self.Wrec[i, d], rep(self.c0[i, d]), rep(self.h0[i, d])], go_backwards = (d == 1), )[0][1][::(1 if d == 0 else -1)] for d in range(Ndirs)], axis = 2) h = h.dimshuffle((1, 0, 2)) if predictPer == "sequence": h = T.concatenate([h[mask_int[1 - d]][:, Nh * d : Nh * (d+1)] for d in range(Ndirs)], axis = 1) output = ACTIVATION[Ay](h.dot(self.Wout) + self.Bout) # Compute loss function if loss is None: loss = {"linear": "mse", "sigmoid": "ce", "softmax": "ce_group"}[self.Ay] if loss == "ctc": label = T.imatrix() cost = ctc_cost(output, mask, label) else: if predictPer == "sequence": label = T.fmatrix() y = output t = label elif predictPer == "frame": label = T.ftensor3() indices = (mask >= 0).nonzero() y = output[indices] t = label[indices] cost = T.mean({ "ce": -T.mean(T.log(y) * t + T.log(1 - y) * (1 - t), axis = 1), "ce_group": -T.log((y * t).sum(axis = 1)), "mse": T.mean((y - t) ** 2, axis = 1), "hinge": T.mean(relu(1 - y * (t * 2 - 1)), axis = 1), "squared_hinge": T.mean(relu(1 - y * (t * 2 - 1)) ** 2, axis = 1), }[loss]) # Add regularization cost += sum(abs(x).sum() for x in self.params) / self.nParams * L1reg cost += sum(T.sqr(x).sum() for x in self.params) / self.nParams * L2reg # Compute updates for network parameters updates = [] lrate = T.fscalar() clip = T.fscalar() grad = T.grad(cost, self.params) grad_clipped = [T.maximum(T.minimum(g, clip), -clip) for g in grad] if momentum > 0: for w, d, g in zip(self.params, self.dparams, grad_clipped): updates.append((w, w + momentum * momentum * d - (1 + momentum) * lrate * g)) updates.append((d, momentum * d - lrate * g)) else: for w, g in zip(self.params, grad_clipped): updates.append((w, w - lrate * g)) # Create functions to be called from outside self.train = theano.function( inputs = [input, mask, label, lrate, clip], outputs = cost, updates = updates, ) self.predict = theano.function(inputs = [input, mask], outputs = output)
def adadelta(tparams, grads, x, mask, iVector, jVector, cost, options, d=None, y=None): zipped_grads = [ theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad' % k) for k, p in tparams.iteritems() ] running_up2 = [ theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rup2' % k) for k, p in tparams.iteritems() ] running_grads2 = [ theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad2' % k) for k, p in tparams.iteritems() ] zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g**2)) for rg2, g in zip(running_grads2, grads)] if options['demoSize'] > 0 and options['numYcodes'] > 0: f_grad_shared = theano.function([x, d, y, mask, iVector, jVector], cost, updates=zgup + rg2up, name='adadelta_f_grad_shared') elif options['demoSize'] == 0 and options['numYcodes'] > 0: f_grad_shared = theano.function([x, y, mask, iVector, jVector], cost, updates=zgup + rg2up, name='adadelta_f_grad_shared') elif options['demoSize'] > 0 and options['numYcodes'] == 0: f_grad_shared = theano.function([x, d, mask, iVector, jVector], cost, updates=zgup + rg2up, name='adadelta_f_grad_shared') else: f_grad_shared = theano.function([x, mask, iVector, jVector], cost, updates=zgup + rg2up, name='adadelta_f_grad_shared') updir = [ -T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2) ] ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud**2)) for ru2, ud in zip(running_up2, updir)] param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] f_update = theano.function([], [], updates=ru2up + param_up, on_unused_input='ignore', name='adadelta_f_update') return f_grad_shared, f_update
def __init__(self, Mi, Mo, activation=T.nnet.relu): self.Mi = Mi self.Mo = Mo self.f = activation # Input gate weights Wxi = init_weight(Mi, Mo) Whi = init_weight(Mo, Mo) Wci = init_weight(Mo, Mo) bi = np.zeros(Mo) # Forget gate weights Wxf = init_weight(Mi, Mo) Whf = init_weight(Mo, Mo) Wcf = init_weight(Mo, Mo) bf = np.zeros(Mo) # Cell gate Wxc = init_weight(Mi, Mo) Whc = init_weight(Mo, Mo) bc = np.zeros(Mo) # Output gate Wxo = init_weight(Mi, Mo) Who = init_weight(Mo, Mo) Wco = init_weight(Mo, Mo) bo = np.zeros(Mo) c0 = np.zeros(Mo) h0 = np.zeros(Mo) #theano variables self.Wxi = theano.shared(Wxi) self.Whi = theano.shared(Whi) self.Wci = theano.shared(Wci) self.bi = theano.shared(bi) self.Wxf = theano.shared(Wxf) self.Whf = theano.shared(Whf) self.Wcf = theano.shared(Wcf) self.bf = theano.shared(bf) self.Wxc = theano.shared(Wxc) self.Whc = theano.shared(Whc) self.bc = theano.shared(bc) self.Wxo = theano.shared(Wxo) self.Who = theano.shared(Who) self.Wco = theano.shared(Wco) self.bo = theano.shared(bo) self.h0 = theano.shared(h0) self.c0 = theano.shared(c0) self.params = [self.Wxi, self.Whi, self.Wci, self.bi, self.Wxf, self.Whf, self.Wcf, self.bf, self.Wxc, self.Whc, self.bc, self.Wxo, self.Who, self.Wco, self.bo , self.h0, self.c0 ]
def main(): # step 1: get the data and define all the usual variables Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() max_iter = 20 print_period = 10 lr = 0.0004 reg = 0.01 Xtrain = Xtrain.astype(np.float32) Ytrain = Ytrain.astype(np.float32) Xtest = Xtest.astype(np.float32) Ytest = Ytest.astype(np.float32) Ytrain_ind = y2indicator(Ytrain).astype(np.float32) Ytest_ind = y2indicator(Ytest).astype(np.float32) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1_init = np.random.randn(D, M) / np.sqrt(D) b1_init = np.zeros(M) W2_init = np.random.randn(M, K) / np.sqrt(M) b2_init = np.zeros(K) # step 2: define theano variables and expressions thX = T.matrix('X') thT = T.matrix('T') W1 = theano.shared(W1_init, 'W1') b1 = theano.shared(b1_init, 'b1') W2 = theano.shared(W2_init, 'W2') b2 = theano.shared(b2_init, 'b2') # we can use the built-in theano functions to do relu and softmax thZ = relu( thX.dot(W1) + b1) # relu is new in version 0.7.1 but just in case you don't have it thY = T.nnet.softmax(thZ.dot(W2) + b2) # define the cost function and prediction cost = -(thT * T.log(thY)).sum() + reg * ((W1 * W1).sum() + (b1 * b1).sum() + (W2 * W2).sum() + (b2 * b2).sum()) prediction = T.argmax(thY, axis=1) # step 3: training expressions and functions # we can just include regularization as part of the cost because it is also automatically differentiated! update_W1 = W1 - lr * T.grad(cost, W1) update_b1 = b1 - lr * T.grad(cost, b1) update_W2 = W2 - lr * T.grad(cost, W2) update_b2 = b2 - lr * T.grad(cost, b2) train = theano.function( inputs=[thX, thT], updates=[(W1, update_W1), (b1, update_b1), (W2, update_W2), (b2, update_b2)], ) # create another function for this because we want it over the whole dataset get_prediction = theano.function( inputs=[thX, thT], outputs=[cost, prediction], ) costs = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] train(Xbatch, Ybatch) if j % print_period == 0: cost_val, prediction_val = get_prediction(Xtest, Ytest_ind) err = error_rate(prediction_val, Ytest) print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, err)) costs.append(cost_val) plt.plot(costs) plt.show()
def __init__(self, n_in, n_out): """ In order to get this to work we need to be careful not to update the actor parameters when updating the critic. This can be an issue when concatenating networks together: the first network becomes a part of the second. However, you can still access the first network by itself, but any update on the second network will affect the first network. Care needs to be taken to make sure only the parameters of the second network are updated. """ batch_size = 32 state_length = n_in action_length = n_out # data types for model State = T.dmatrix("State") State.tag.test_value = np.random.rand(batch_size, state_length) ResultState = T.dmatrix("ResultState") ResultState.tag.test_value = np.random.rand(batch_size, state_length) Reward = T.col("Reward") Reward.tag.test_value = np.random.rand(batch_size, 1) Action = T.dmatrix("Action") Action.tag.test_value = np.random.rand(batch_size, action_length) # create a small fully-connected (dense) network inputLayerActA = lasagne.layers.InputLayer((None, state_length), State) l_hid2ActA = lasagne.layers.DenseLayer( inputLayerActA, num_units=64, nonlinearity=lasagne.nonlinearities.leaky_rectify) l_hid3ActA = lasagne.layers.DenseLayer( l_hid2ActA, num_units=32, nonlinearity=lasagne.nonlinearities.leaky_rectify) self._l_outActA = lasagne.layers.DenseLayer( l_hid3ActA, num_units=n_out, nonlinearity=lasagne.nonlinearities.linear) inputLayerA = lasagne.layers.InputLayer((None, state_length), State) concatLayer = lasagne.layers.ConcatLayer( [inputLayerA, self._l_outActA]) l_hid2A = lasagne.layers.DenseLayer( concatLayer, num_units=64, nonlinearity=lasagne.nonlinearities.leaky_rectify) l_hid3A = lasagne.layers.DenseLayer( l_hid2A, num_units=32, nonlinearity=lasagne.nonlinearities.leaky_rectify) self._l_outA = lasagne.layers.DenseLayer( l_hid3A, num_units=1, nonlinearity=lasagne.nonlinearities.linear) # self._b_o = init_b_weights((n_out,)) # self.updateTargetModel() inputLayerActB = lasagne.layers.InputLayer((None, state_length), State) l_hid2ActB = lasagne.layers.DenseLayer( inputLayerActB, num_units=64, nonlinearity=lasagne.nonlinearities.leaky_rectify) l_hid3ActB = lasagne.layers.DenseLayer( l_hid2ActB, num_units=32, nonlinearity=lasagne.nonlinearities.leaky_rectify) self._l_outActB = lasagne.layers.DenseLayer( l_hid3ActB, num_units=n_out, nonlinearity=lasagne.nonlinearities.linear) inputLayerB = lasagne.layers.InputLayer((None, state_length), State) concatLayerB = lasagne.layers.ConcatLayer( [inputLayerB, self._l_outActB]) l_hid2B = lasagne.layers.DenseLayer( concatLayerB, num_units=64, nonlinearity=lasagne.nonlinearities.leaky_rectify) l_hid3B = lasagne.layers.DenseLayer( l_hid2B, num_units=32, nonlinearity=lasagne.nonlinearities.leaky_rectify) self._l_outB = lasagne.layers.DenseLayer( l_hid3B, num_units=1, nonlinearity=lasagne.nonlinearities.linear) # print "Initial W " + str(self._w_o.get_value()) self._learning_rate = 0.001 self._discount_factor = 0.8 self._rho = 0.95 self._rms_epsilon = 0.001 self._weight_update_steps = 5 self._updates = 0 self._states_shared = theano.shared( np.zeros((batch_size, state_length), dtype=theano.config.floatX)) self._next_states_shared = theano.shared( np.zeros((batch_size, state_length), dtype=theano.config.floatX)) self._rewards_shared = theano.shared(np.zeros( (batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self._actions_shared = theano.shared( np.zeros((batch_size, n_out), dtype=theano.config.floatX), ) self._q_valsActA = 
lasagne.layers.get_output(self._l_outActA, State) self._q_valsActB = lasagne.layers.get_output(self._l_outActB, ResultState) self._q_valsActB2 = lasagne.layers.get_output(self._l_outActB, State) inputs_ = { State: self._states_shared, Action: self._q_valsActA, } self._q_valsA = lasagne.layers.get_output(self._l_outA, inputs_) inputs_ = { ResultState: self._next_states_shared, Action: self._q_valsActB, } self._q_valsB = lasagne.layers.get_output(self._l_outB, inputs_) self._q_func = self._q_valsA self._q_funcAct = self._q_valsActA self._q_funcB = self._q_valsB self._q_funcActB = self._q_valsActB # self._q_funcAct = theano.function(inputs=[State], outputs=self._q_valsActA, allow_input_downcast=True) self._target = (Reward + self._discount_factor * self._q_valsB) self._diff = self._target - self._q_valsA self._loss = 0.5 * self._diff**2 + ( 1e-4 * lasagne.regularization.regularize_network_params( self._l_outA, lasagne.regularization.l2)) self._loss = T.mean(self._loss) self._params = lasagne.layers.helper.get_all_params(self._l_outA)[-6:] self._actionParams = lasagne.layers.helper.get_all_params( self._l_outActA) self._givens_ = { State: self._states_shared, # ResultState: self._next_states_shared, Reward: self._rewards_shared, # Action: self._actions_shared, } self._actGivens = { State: self._states_shared, # ResultState: self._next_states_shared, # Reward: self._rewards_shared, # Action: self._actions_shared, } # SGD update #updates_ = lasagne.updates.rmsprop(loss, params, self._learning_rate, self._rho, # self._rms_epsilon) # TD update # minimize Value function error self._updates_ = lasagne.updates.rmsprop( T.mean(self._q_func) + (1e-4 * lasagne.regularization.regularize_network_params( self._l_outA, lasagne.regularization.l2)), self._params, self._learning_rate * -T.mean(self._diff), self._rho, self._rms_epsilon) # actDiff1 = (Action - self._q_valsActB) #TODO is this correct? # actDiff = (actDiff1 - (Action - self._q_valsActA)) # actDiff = ((Action - self._q_valsActB2)) # Target network does not work well here? #self._actDiff = ((Action - self._q_valsActA)) # Target network does not work well here? #self._actLoss = 0.5 * self._actDiff ** 2 + (1e-4 * lasagne.regularization.regularize_network_params( self._l_outActA, lasagne.regularization.l2)) #self._actLoss = T.mean(self._actLoss) # actionUpdates = lasagne.updates.rmsprop(actLoss + # (1e-4 * lasagne.regularization.regularize_network_params( # self._l_outActA, lasagne.regularization.l2)), actionParams, # self._learning_rate * 0.01 * (-actLoss), self._rho, self._rms_epsilon) # Maximize wrt q function # theano.gradient.grad_clip(x, lower_bound, upper_bound) # // TODO actionUpdates = lasagne.updates.rmsprop( T.mean(self._q_func) + (1e-4 * lasagne.regularization.regularize_network_params( self._l_outActA, lasagne.regularization.l2)), self._actionParams, self._learning_rate * 0.1, self._rho, self._rms_epsilon) self._train = theano.function([], [self._loss, self._q_func], updates=self._updates_, givens=self._givens_) # self._trainActor = theano.function([], [actLoss, self._q_valsActA], updates=actionUpdates, givens=actGivens) self._trainActor = theano.function([], [self._q_func], updates=actionUpdates, givens=self._actGivens) self._q_val = theano.function([], self._q_valsA, givens={State: self._states_shared}) self._q_action = theano.function([], self._q_valsActA, givens={State: self._states_shared}) inputs_ = [ State, Reward, # ResultState ] self._bellman_error = theano.function(inputs=inputs_, outputs=self._diff, allow_input_downcast=True)
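# A hedged usage sketch, not from the original source: how one mini-batch update
# might be run with the compiled functions above, assuming `model` is an instance
# of this class and the numpy arrays have the shapes declared in __init__.
import theano

def train_on_batch(model, states, rewards, next_states):
    # load the mini-batch into the shared buffers referenced by the givens and graph
    model._states_shared.set_value(states.astype(theano.config.floatX))
    model._rewards_shared.set_value(rewards.astype(theano.config.floatX))
    model._next_states_shared.set_value(next_states.astype(theano.config.floatX))
    # one critic step, then one actor step
    loss, q_vals = model._train()
    model._trainActor()
    return loss, q_vals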
def addParam(self, name, value): value = theano.shared(value) setattr(self, name, value) self.params.append(value) self.paramNames.append(name)
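# A small usage sketch, assuming addParam above is available as a plain function
# and that the owning object keeps empty `params` / `paramNames` lists.
import numpy as np

class ParamStore(object):
    def __init__(self):
        self.params = []
        self.paramNames = []

store = ParamStore()
addParam(store, 'W_hidden', 0.01 * np.random.randn(128, 64))
addParam(store, 'b_hidden', np.zeros(64))
# store.W_hidden and store.b_hidden are now theano shared variables;
# store.params / store.paramNames record them in insertion order.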
def sharedsfGpu(x,name=None): return T.cast( theano.shared(x,name=name), dtype=floatX )
def ini_net(self, outputShape, testData, modelSaver): self.net = NeuralNet( layers=[ ('input', layers.InputLayer), ('conv1', layers.Conv2DLayer), ('pool1', layers.MaxPool2DLayer), ('dropout1', layers.DropoutLayer), ('conv2', layers.Conv2DLayer), ('pool2', layers.MaxPool2DLayer), ('dropout2', layers.DropoutLayer), ('conv3', layers.Conv2DLayer), ('pool3', layers.MaxPool2DLayer), ('dropout3', layers.DropoutLayer), ('hidden4', layers.DenseLayer), ('dropout4', layers.DropoutLayer), ('hidden5', layers.DenseLayer), ('output', layers.DenseLayer), ], input_shape=(None, Settings.NN_CHANNELS, Settings.NN_INPUT_SHAPE[0], Settings.NN_INPUT_SHAPE[1] ), # variable batch size, 3 color shape row shape conv1_num_filters=32, conv1_filter_size=(3, 3), pool1_pool_size=(2, 2), dropout1_p=0.1, conv2_num_filters=64, conv2_filter_size=(2, 2), pool2_pool_size=(2, 2), dropout2_p=0.2, conv3_num_filters=128, conv3_filter_size=(2, 2), pool3_pool_size=(2, 2), dropout3_p=0.3, hidden4_num_units=500, dropout4_p=0.5, hidden5_num_units=500, output_num_units=outputShape, output_nonlinearity=lasagne.nonlinearities.softmax, # optimization method: update=nesterov_momentum, update_learning_rate=theano.shared( utils.to_float32(Settings.NN_START_LEARNING_RATE)), update_momentum=theano.shared( utils.to_float32(Settings.NN_START_MOMENTUM)), batch_iterator_train=AugmentingLazyBatchIterator( Settings.NN_BATCH_SIZE, testData, "train", False, newSegmentation=False, loadingSize=(120, 120)), batch_iterator_test=LazyBatchIterator( Settings.NN_BATCH_SIZE, testData, "valid", False, newSegmentation=False, loadingInputShape=Settings.NN_INPUT_SHAPE), train_split=TrainSplit( eval_size=0.0), # we cross validate on our own regression=False, # classification problem on_epoch_finished=[ AdjustVariable('update_learning_rate', start=Settings.NN_START_LEARNING_RATE, stop=0.0001), AdjustVariable('update_momentum', start=Settings.NN_START_MOMENTUM, stop=0.999), TrainingHistory("?", str(self), [1], modelSaver), EarlyStopping(150), modelSaver, ], max_epochs=Settings.NN_EPOCHS, verbose=1, )
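# AdjustVariable, EarlyStopping and the other on_epoch_finished hooks above come
# from the project's own utilities and are not shown here. A rough sketch of what
# such an AdjustVariable callback typically looks like in nolearn-style code,
# assuming it linearly anneals a shared hyperparameter from `start` to `stop`
# over max_epochs.
import numpy as np

class AdjustVariable(object):
    def __init__(self, name, start=0.03, stop=0.001):
        self.name = name
        self.start, self.stop = start, stop
        self.ls = None

    def __call__(self, nn, train_history):
        if self.ls is None:
            # precompute one value per epoch
            self.ls = np.linspace(self.start, self.stop, nn.max_epochs)
        epoch = train_history[-1]['epoch']
        new_value = np.float32(self.ls[epoch - 1])
        getattr(nn, self.name).set_value(new_value)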
import theano_multi import numpy as np import theano import theano.tensor as T import time BATCH_SIZE = 512 DIM = 4096 x = T.matrix('x') y = T.matrix('y') W = theano.shared( np.random.normal(scale=0.01, size=(DIM, DIM)).astype(theano.config.floatX)) y_hat = x for i in xrange(50): y_hat = T.dot(y_hat, W) cost = T.mean((y_hat - y)**2) params = [W] grads = T.grad(cost, params) # grads = theano_multi.multi(grads, params=params, other_contexts=['dev2', 'dev1']) updates = [(p, p - 0.1 * g) for p, g in zip(params, grads)]
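# The snippet above stops after building `updates`. A hedged sketch, continuing the
# same snippet, of how such a benchmark is usually finished: compile the training
# function and time a few calls on random data of the declared shapes.
f = theano.function([x, y], cost, updates=updates)

x_val = np.random.randn(BATCH_SIZE, DIM).astype(theano.config.floatX)
y_val = np.random.randn(BATCH_SIZE, DIM).astype(theano.config.floatX)

t0 = time.time()
for _ in range(10):
    f(x_val, y_val)
print("10 updates took %.3fs" % (time.time() - t0))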
def sharedf(x, target=None, name=None, borrow=False): if target is None: return theano.shared(np.asarray(x, dtype=floatX), name=name, borrow=borrow) else: return theano.shared(np.asarray(x, dtype=floatX), target=target, name=name, borrow=borrow)
import numpy from theano import function, shared from theano import tensor as TT import theano sharedX = (lambda X, name: shared(numpy.asarray(X, dtype=theano.config.floatX), name=name)) def kinetic_energy(vel): """Returns the kinetic energy associated with the given velocity and mass of 1. Parameters ---------- vel: theano matrix Symbolic matrix whose rows are velocity vectors. Returns ------- return: theano vector Vector whose i-th entry is the kinetic energy associated with vel[i]. """ return 0.5 * (vel**2).sum(axis=1) def hamiltonian(pos, vel, energy_fn): """ Returns the Hamiltonian (sum of potential and kinetic energy) for the given velocity and position, assuming a mass of 1. Parameters ---------- pos: theano matrix Symbolic matrix whose rows are position vectors. vel: theano matrix Symbolic matrix whose rows are velocity vectors. energy_fn: python function Python function, operating on symbolic theano variables, used to compute the potential energy at a given position. Returns ------- return: theano vector Vector whose i-th entry is the Hamiltonian associated with pos[i] and vel[i]. """ return energy_fn(pos) + kinetic_energy(vel)
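# A short hedged sketch of how `hamiltonian` is typically used in HMC: a
# Metropolis-Hastings test decides whether to accept a simulated trajectory.
# `srng` is assumed to be a theano RandomStreams instance, and the position and
# velocity names in the commented call are placeholders, not part of the snippet above.
def metropolis_hastings_accept(energy_prev, energy_next, srng):
    # accept with probability min(1, exp(H_prev - H_next))
    ediff = energy_prev - energy_next
    return (TT.exp(ediff) - srng.uniform(size=energy_prev.shape)) >= 0

# accept = metropolis_hastings_accept(
#     energy_prev=hamiltonian(positions, velocities, energy_fn),
#     energy_next=hamiltonian(final_pos, final_vel, energy_fn),
#     srng=srng)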
def sharedScalar(x,name=None): return theano.shared(x,name=name)
def main(reps, pretrained_w_path, do_module1, init_seed=0, load_t=0, num_epochs=200, batchsize=96, fine_tune=0, patience=500, lr_init=1e-3, optim='adagrad', toy=0, num_classes=23): res_root = '/home/hoa/Desktop/projects/resources' X_path = osp.join(res_root, 'datasets/msrcv2/Xaug_b01c.npy') Y_path = osp.join(res_root, 'datasets/msrcv2/Y.npy') MEAN_IMG_PATH = osp.join(res_root, 'models/ilsvrc_2012_mean.npy') snapshot = 50 # save model after every `snapshot` epochs drop_p = 0.5 # drop out prob. lambda2 = 0.0005 / 2 # l2-regularizer constant # step=patience/4 # decay learning after every `step` epochs lr_patience = 60 # for learning rate schedule, if optim=='momentum' if toy: # unit testing num_epochs = 10 data_multi = 3 reps = 2 #drop_p=0 #lambda2=0 # Create name tag for the experiment if fine_tune: full_or_tune = 'tune' # description tag for storing associated files else: full_or_tune = 'full' time_stamp = time.strftime("%y%m%d%H%M%S", time.localtime()) snapshot_root = '../snapshot_models/' snapshot_name = str(num_classes) + 'alex' + time_stamp + full_or_tune # LOADING DATA print 'LOADING DATA ...' X = np.load(X_path) Y = np.load(Y_path) if X.shape[1] != 3: X = b01c_to_bc01(X) N = len(Y) print 'Raw X,Y shape', X.shape, Y.shape if len(X) != len(Y): print 'Inconsistent number of input images and labels. X is possibly augmented.' MEAN_IMG = np.load(MEAN_IMG_PATH) MEAN_IMG_227 = skimage.transform.resize(np.swapaxes( np.swapaxes(MEAN_IMG, 0, 1), 1, 2), (227, 227), mode='nearest', preserve_range=True) MEAN_IMG = np.swapaxes(np.swapaxes(MEAN_IMG_227, 1, 2), 0, 1).reshape( (1, 3, 227, 227)) all_metrics = [] # store metrics in each run time_profiles = { 'train_module1': [], 'train_module1_eff': [], 'train_module2': [], 'test': [] } # record training and testing time # PREPARE THEANO EXPRESSION FOR BOTH MODULES print 'COMPILING THEANO EXPRESSION ...' input_var = T.tensor4('inputs') target_var = T.imatrix('targets') network = build_model(num_classes=num_classes, input_var=input_var) # Create a loss expression for training prediction = lasagne.layers.get_output(network) loss = lasagne.objectives.binary_crossentropy(prediction, target_var) weights = lasagne.layers.get_all_params(network, regularizable=True) l2reg = theano.shared(floatX(lambda2)) * T.sum( [T.sum(w**2) for w in weights]) loss = loss.mean() + l2reg lr = theano.shared(np.array(lr_init, dtype=theano.config.floatX)) lr_decay = np.array(1. 
/ 3, dtype=theano.config.floatX) # Create update expressions for training params = lasagne.layers.get_all_params(network, trainable=True) # last-layer case is actually very simple: # `params` above is a list of all (W,b)-pairs # Therefore last layer's (W,b) is params[-2:] if fine_tune == 7: # tuning params from fc7 to fc8 params = params[-2:] # elif fine_tune == 6: # tuning params from fc6 to fc8 # params = params[-4:] # TODO adjust for per-layer training with local_lr if optim == 'momentum': updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=lr, momentum=0.9) elif optim == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr, rho=0.9, epsilon=1e-06) elif optim == 'adam': updates = lasagne.updates.adam(loss, params, learning_rate=lr, beta1=0.9, beta2=0.999, epsilon=1e-08) elif optim == 'adagrad': updates = lasagne.updates.adagrad(loss, params, learning_rate=lr, epsilon=1e-06) # Create a loss expression for validation/testing test_prediction = lasagne.layers.get_output(network, deterministic=True) test_loss = lasagne.objectives.binary_crossentropy(test_prediction, target_var) test_loss = test_loss.mean() + l2reg # zero-one loss with threshold t = 0.5 for reference # zero_one_loss = T.abs_((test_prediction > theano.shared(floatX(0.5))) - target_var).sum(axis=1) #zero_one_loss /= target_var.shape[1].astype(theano.config.floatX) #zero_one_loss = zero_one_loss.mean() # Compile a function performing a backward pass (training step) on a mini-batch (by giving # the updates dictionary) and returning the corresponding training loss: bwd_fn = theano.function( [input_var, target_var], loss, updates=updates, ) # Compile a second function performing a forward pass, # returns validation loss, 0/1 Error, score i.e. Xout: fwd_fn = theano.function([input_var, target_var], test_loss) # Create a theano function for computing score score = lasagne.layers.get_output(network, deterministic=True) score_fn = theano.function([input_var], score) def compute_score(X, Y, batchsize=batchsize, shuffle=False): out = np.zeros(Y.shape) batch_id = 0 for batch in iterate_minibatches(X, Y, batchsize, shuffle=False): inputs, _ = batch # Flip random half of the batch flip_idx = np.random.choice(len(inputs), size=len(inputs) / 2, replace=False) if len(flip_idx) > 1: inputs[flip_idx] = inputs[flip_idx, :, :, ::-1] # Substract mean image inputs = (inputs - MEAN_IMG).astype(theano.config.floatX) # MEAN_IMG is broadcasted numpy-way, take note if want theano expression instead if len(inputs) == batchsize: out[batch_id * batchsize:(batch_id + 1) * batchsize] = score_fn(inputs) batch_id += 1 else: out[batch_id * batchsize:] = score_fn(inputs) return out try: # MAIN LOOP FOR EACH RUN for seed in np.arange(reps) + init_seed: # reset learning rate lr.set_value(lr_init) print '\nRUN', seed, '...' 
# Split train/val/test set indicies = np.arange(len(Y)) Y_train_val, Y_test, idx_train_val, idx_test = train_test_split( Y, indicies, random_state=seed, train_size=float(2) / 3) Y_train, Y_val, idx_train, idx_val = train_test_split( Y_train_val, idx_train_val, random_state=seed) print "Train/val/test set size:", len(idx_train), len( idx_val), len(idx_test) idx_aug_train = data_aug(idx_train, mode='aug', isMat='idx', N=N) Xaug_train = X[idx_aug_train] Yaug_train = data_aug(Y_train, mode='aug', isMat='Y', N=N) idx_aug_val = data_aug(idx_val, mode='aug', isMat='idx', N=N) Xaug_val = X[idx_aug_val] Yaug_val = data_aug(Y_val, mode='aug', isMat='Y', N=N) # Module 2 training set is composed of module 1 training and validation set idx_aug_train_val = data_aug(idx_train_val, mode='aug', isMat='idx', N=N) Xaug_train_val = X[idx_aug_train_val] Yaug_train_val = data_aug(Y_train_val, mode='aug', isMat='Y', N=N) # Test set X_test = X[idx_test] # Y_test is already returned in the first train_test_split print "Augmented train/val/test set size:", len(Xaug_train), len( Yaug_val), len(X_test) print "Augmented (X,Y) dtype:", Xaug_train.dtype, Yaug_val.dtype print "Processed Mean image:", MEAN_IMG.dtype, MEAN_IMG.shape if toy: # try to overfit a tiny subset of the data Xaug_train = Xaug_train[:batchsize * data_multi + batchsize / 2] Yaug_train = Yaug_train[:batchsize * data_multi + batchsize / 2] Xaug_val = Xaug_val[:batchsize + batchsize / 2] Yaug_val = Yaug_val[:batchsize + batchsize / 2] # Init by pre-trained weights, if any if len(pretrained_w_path) > 0: layer_list = lasagne.layers.get_all_layers( network) # 22 layers if pretrained_w_path.endswith('pkl'): # load reference_net # use case: weights initialized from pre-trained reference nets f = open(pretrained_w_path, 'r') w_list = pickle.load(f) # list of 11 (W,b)-pairs f.close() lasagne.layers.set_all_param_values( layer_list[-3], w_list[:-2]) # exclude (W,b) of fc8 # BIG NOTE: don't be confused, it's pure coincident that layer_list # and w_list have the same index here. The last element of layer_list are # [.., fc6, drop6, fc7, drop7, fc8], while w_list are # [..., W, b, W, b, W, b] which, eg w_list[-4] and w_list[-3] correspond to # params that are associated with fc7 i.e. 
params that connect drop6 to fc7 elif pretrained_w_path.endswith('npz'): # load self-trained net # use case: continue training from a snapshot model with np.load( pretrained_w_path ) as f: # NOTE: only load snapshot of the same `seed` # w_list = [f['arr_%d' % i] for i in range(len(f.files))] w_list = [ f.items()['arr_%d' % i] for i in range(len(f.files)) ] # load from bkviz, one-time use lasagne.layers.set_all_param_values(network, w_list) elif pretrained_w_path.endswith( '/'): # init from 1 of the 30 snapshots from os import listdir import re files = [ f for f in listdir(pretrained_w_path) if osp.isfile(osp.join(pretrained_w_path, f)) ] for file_name in files: regex_seed = 'full%d_' % seed match_seed = re.search(regex_seed, file_name) if match_seed: regex = r"\d+[a-zA-Z]+\d+[a-zA-Z]+\d+\_\d+" match = re.search(regex, file_name) snapshot_name = match.group(0) print snapshot_name with np.load( osp.join(pretrained_w_path, snapshot_name) + '.npz') as f: w_list = [ f['arr_%d' % i] for i in range(len(f.files)) ] lasagne.layers.set_all_param_values( network, w_list) # START MODULE 1 module1_time = 0 if do_module1: print 'MODULE 1' training_history = {} training_history['iter_training_loss'] = [] training_history['iter_validation_loss'] = [] training_history['training_loss'] = [] training_history['validation_loss'] = [] training_history['learning_rate'] = [] # http://deeplearning.net/tutorial/gettingstarted.html#early-stopping # early-stopping parameters n_train_batches = Xaug_train.shape[0] / batchsize if Xaug_train.shape[0] % batchsize != 0: n_train_batches += 1 patience = patience # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is found lr_patience_increase = 1.01 improvement_threshold = 0.995 # a relative improvement of this much is # considered significant; a significant test # MIGHT be better validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatches before checking the network # on the validation set; in this case we # check every epoch best_params = None epoch_validation_loss = 0 # indicates that valid_loss has not been computed yet best_validation_loss = np.inf best_iter = -1 lr_iter = -1 test_score = 0. start_time = time.time() done_looping = False epoch = 0 # Finally, launch the training loop. 
print("Starting training...") # We iterate over epochs: print( "\nEpoch\tTrain Loss\tValid Loss\tBest-ValLoss-and-Iter\tTime\tL.Rate" ) sys.setrecursionlimit(10000) try: # Early-stopping implementation while (not done_looping) and (epoch < num_epochs): # In each epoch, we do a full pass over the training data: train_err = 0 train_batches = 0 start_time = time.time() for batch in iterate_minibatches(Xaug_train, Yaug_train, batchsize, shuffle=True): inputs, targets = batch # Horizontal flip half of the images bs = inputs.shape[0] indices = np.random.choice(bs, bs / 2, replace=False) inputs[indices] = inputs[indices, :, :, ::-1] # Substract mean image inputs = (inputs - MEAN_IMG).astype( theano.config.floatX) # MEAN_IMG is broadcasted numpy-way, take note if want theano expression instead train_err_batch = bwd_fn(inputs, targets) train_err += train_err_batch train_batches += 1 iter_now = epoch * n_train_batches + train_batches training_history['iter_training_loss'].append( train_err_batch) training_history['iter_validation_loss'].append( epoch_validation_loss) if (iter_now + 1) % validation_frequency == 0: # a full pass over the validation data: val_err = 0 #zero_one_err = 0 val_batches = 0 for batch in iterate_minibatches( Xaug_val, Yaug_val, batchsize, shuffle=False): inputs, targets = batch # Substract mean image inputs = (inputs - MEAN_IMG).astype( theano.config.floatX) # MEAN_IMG is broadcasted numpy-way, take note if want theano expression instead val_err_batch = fwd_fn(inputs, targets) val_err += val_err_batch val_batches += 1 epoch_validation_loss = val_err / val_batches if epoch_validation_loss < best_validation_loss: if epoch_validation_loss < best_validation_loss * improvement_threshold: patience = max( patience, iter_now * patience_increase) # lr_patience *= lr_patience_increase best_params = lasagne.layers.get_all_param_values( network) best_validation_loss = epoch_validation_loss best_iter = iter_now lr_iter = best_iter else: # decay learning rate if optim=='momentum' if optim == 'momentum' and ( iter_now - lr_iter) > lr_patience: lr.set_value(lr.get_value() * lr_decay) lr_iter = iter_now if patience <= iter_now: done_looping = True break # Record training history training_history['training_loss'].append(train_err / train_batches) training_history['validation_loss'].append( epoch_validation_loss) training_history['learning_rate'].append( lr.get_value()) epoch_time = time.time() - start_time module1_time += epoch_time # Then we print the results for this epoch: print("{}\t{:.6f}\t{:.6f}\t{:.6f}\t{}\t{:.3f}\t{}". 
format(epoch + 1, training_history['training_loss'][-1], training_history['validation_loss'][-1], best_validation_loss, best_iter + 1, epoch_time, training_history['learning_rate'][-1])) if ( epoch + 1 ) % snapshot == 0: # TODO try to save weights at best_iter snapshot_path_string = snapshot_root + snapshot_name + str( seed) + '_' + str(iter_now + 1) try: # use case: terminate experiment before reaching `reps` np.savez(snapshot_path_string + '.npz', *best_params) np.savez(snapshot_path_string + '_history.npz', training_history) plot_loss(training_history, snapshot_path_string + '_loss.png') # plot_conv_weights(lasagne.layers.get_all_layers(network)[1], # snapshot_path_string+'_conv1weights_') except KeyboardInterrupt, TypeError: print 'Did not save', snapshot_name + str( seed) + '_' + str(iter_now + 1) pass epoch += 1 except KeyboardInterrupt, MemoryError: # Sadly this can only catch KeyboardInterrupt pass print 'Training finished or KeyboardInterrupt (Training is never finished, only abandoned)' module1_time_eff = module1_time / iter_now * best_iter print('Total and Effective training time are {:.0f} and {:.0f}' ).format(module1_time, module1_time_eff) time_profiles['train_module1'].append(module1_time) time_profiles['train_module1_eff'].append(module1_time_eff) # Save model after num_epochs or KeyboardInterrupt if (epoch + 1) % snapshot != 0: # to avoid duplicate save snapshot_path_string = snapshot_root + snapshot_name + str( seed) + '_' + str(iter_now + 1) if not toy: try: # use case: terminate experiment before reaching `reps` print 'Saving model...' np.savez(snapshot_path_string + '.npz', *best_params) np.savez(snapshot_path_string + '_history.npz', training_history) plot_loss(training_history, snapshot_path_string + '_loss.png') # plot_conv_weights(lasagne.layers.get_all_layers(network)[1], # snapshot_path_string+'_conv1weights_') except KeyboardInterrupt, TypeError: print 'Did not save', snapshot_name + str( seed) + '_' + str(iter_now + 1) pass # And load them again later on like this: #with np.load('../snapshot_models/23alex16042023213910.npz') as f: # param_values = [f['arr_%d' % i] for i in range(len(f.files))] # or # training_history = f['arr_0'].items() # lasagne.layers.set_all_param_values(network, param_values) # END OF MODULE 1 # START MODULE 2 print '\nMODULE 2' if not do_module1: if pretrained_w_path.endswith('pkl'): snapshot_name = str( num_classes ) + 'alexOTS' # short for "off-the-shelf init" elif pretrained_w_path.endswith( 'npz'): # Resume from a SINGLE snapshot # extract name pattern, e.g. 
'23alex16042023213910full10' # from string '../snapshot_models/23alex16042023213910full10_100.npz' import re regex = r"\d+[a-zA-Z]+\d+[a-zA-Z]+\d+" match = re.search(regex, pretrained_w_path) snapshot_name = match.group(0) elif pretrained_w_path.endswith( '/'): # RESUMED FROM TRAINED MODULE 1 (ONE-TIME USE) from os import listdir import re files = [ f for f in listdir(pretrained_w_path) if osp.isfile(osp.join(pretrained_w_path, f)) ] for file_name in files: regex_seed = 'full%d_' % seed match_seed = re.search(regex_seed, file_name) if match_seed: regex = r"\d+[a-zA-Z]+\d+[a-zA-Z]+\d+\_\d+" match = re.search(regex, file_name) snapshot_name = match.group(0) print snapshot_name with np.load( osp.join(pretrained_w_path, snapshot_name) + '.npz') as f: w_list = [ f['arr_%d' % i] for i in range(len(f.files)) ] lasagne.layers.set_all_param_values( network, w_list) else: # MAIN BRANCH - assume do_module1 is True AND have run `snapshot` epochs if (epoch + 1) > snapshot: with np.load(snapshot_path_string + '.npz' ) as f: # reload the best params for module 1 w_list = [f['arr_%d' % i] for i in range(len(f.files))] lasagne.layers.set_all_param_values(network, w_list) score_train = compute_score(Xaug_train_val, Yaug_train_val) start_time = time.time() if load_t: # Server failed at the wrong time. We only have t backed-up if pretrained_w_path.endswith('/'): from os import listdir import re files = [ f for f in listdir(pretrained_w_path) if osp.isfile(osp.join(pretrained_w_path, f)) ] for file_name in files: regex_seed = 'full%d_' % seed match_seed = re.search(regex_seed, file_name) if match_seed: regex = r"\d+[a-zA-Z]+\d+[a-zA-Z]+\d+\_\d+" match = re.search(regex, file_name) snapshot_name = match.group(0) t_train = np.load( osp.join('t', '{0}.npy'.format(snapshot_name))) else: # MAIN BRANCH thresholds = Threshold(score_train, Yaug_train_val) thresholds.find_t_for( ) # determine t_train for each score_train. It will take a while t_train = np.asarray(thresholds.t) print 't_train is in ', t_train.min(), '..', t_train.max() # `thresholds` holds t_train vector in .t attribute print('t_train produced in {:.3f}s').format(time.time() - start_time) np.save('t/' + snapshot_name + str(seed) + '.npy', t_train) # Predictive model for t regr = linear_model.RidgeCV(cv=5) # Ridge() is LinearClassifier() with L2-reg regr.fit(score_train, t_train) time_profiles['train_module2'].append(time.time() - start_time) # END OF MODULE 2 # TESTING PHASE start_time = time.time() score_test = compute_score(X_test, Y_test) t_test = regr.predict(score_test) print 'original t_test is in ', min(t_test), '..', max(t_test) t_test[t_test > 1] = max(t_test[t_test < 1]) t_test[t_test < 0] = min( t_test[t_test > 0]) # ! Keep t_test in [0,1] print 'corrected t_test is in ', min(t_test), '..', max(t_test) # Predict label metrics = predict_label(score_test, Y_test, t_test, seed, num_classes, verbose=1) time_profiles['test'].append(time.time() - start_time) all_metrics.append(metrics)
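# The training and scoring loops above rely on an `iterate_minibatches` helper that
# is not shown in this excerpt. A minimal sketch in the style of the standard
# Lasagne examples, assuming X and Y are indexable along the first axis; the last
# (possibly smaller) batch is yielded as well, matching how compute_score handles it.
import numpy as np

def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    assert len(inputs) == len(targets)
    indices = np.arange(len(inputs))
    if shuffle:
        np.random.shuffle(indices)
    for start in range(0, len(inputs), batchsize):
        excerpt = indices[start:start + batchsize]
        yield inputs[excerpt], targets[excerpt]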
l4a = layers.DenseLayer(j3, n_outputs=4096, weights_std=0.001, init_bias_value=0.01, dropout=0.5, nonlinearity=layers.identity) l4b = layers.FeatureMaxPoolingLayer(l4a, pool_size=2, feature_dim=1, implementation='reshape') l4c = layers.DenseLayer(l4b, n_outputs=4096, weights_std=0.001, init_bias_value=0.01, dropout=0.5, nonlinearity=layers.identity) l4 = layers.FeatureMaxPoolingLayer(l4c, pool_size=2, feature_dim=1, implementation='reshape') # l5 = layers.DenseLayer(l4, n_outputs=37, weights_std=0.01, init_bias_value=0.0, dropout=0.5, nonlinearity=custom.clip_01) # nonlinearity=layers.identity) l5 = layers.DenseLayer(l4, n_outputs=37, weights_std=0.01, init_bias_value=0.1, dropout=0.5, nonlinearity=layers.identity) # l6 = layers.OutputLayer(l5, error_measure='mse') l6 = custom.OptimisedDivGalaxyOutputLayer(l5) # this incorporates the constraints on the output (probabilities sum to one, weighting, etc.) xs_shared = [theano.shared(np.zeros((1,1,1,1), dtype=theano.config.floatX)) for _ in xrange(num_input_representations)] idx = T.lscalar('idx') givens = { l0.input_var: xs_shared[0][idx*BATCH_SIZE:(idx+1)*BATCH_SIZE], l0_45.input_var: xs_shared[1][idx*BATCH_SIZE:(idx+1)*BATCH_SIZE], } compute_output = theano.function([idx], l6.predictions(dropout_active=False), givens=givens) print "Load model parameters" layers.set_param_values(l6, analysis['param_values']) print "Create generators"
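# A hedged sketch of how predictions are usually produced with this setup: load one
# chunk of data into the shared input buffers, then loop over batch indices with
# `compute_output`. `chunk_data` (one numpy array per input representation) is a
# placeholder name, not defined in the snippet above.
import numpy as np

def predict_chunk(chunk_data):
    for x_shared, x_chunk in zip(xs_shared, chunk_data):
        x_shared.set_value(x_chunk.astype(theano.config.floatX))
    num_batches = chunk_data[0].shape[0] // BATCH_SIZE
    return np.vstack([compute_output(b) for b in xrange(num_batches)])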
def init_params_c2w2s(n_chars=N_CHAR): ''' Initialize all params for hierarchical GRU ''' params = OrderedDict() np.random.seed(0) prefix = 'c2w_' # lookup table params[prefix+'Wc'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(n_chars,CHAR_DIM)).astype('float32'), name=prefix+'Wc') # f-GRU params[prefix+'W_f_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name=prefix+'W_f_r') params[prefix+'W_f_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name=prefix+'W_f_z') params[prefix+'W_f_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name=prefix+'W_f_h') params[prefix+'b_f_r'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name=prefix+'b_f_r') params[prefix+'b_f_z'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name=prefix+'b_f_z') params[prefix+'b_f_h'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name=prefix+'b_f_h') params[prefix+'U_f_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name=prefix+'U_f_r') params[prefix+'U_f_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name=prefix+'U_f_z') params[prefix+'U_f_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name=prefix+'U_f_h') # b-GRU params[prefix+'W_b_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name=prefix+'W_b_r') params[prefix+'W_b_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name=prefix+'W_b_z') params[prefix+'W_b_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name=prefix+'W_b_h') params[prefix+'b_b_r'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name=prefix+'b_b_r') params[prefix+'b_b_z'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name=prefix+'b_b_z') params[prefix+'b_b_h'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name=prefix+'b_b_h') params[prefix+'U_b_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name=prefix+'U_b_r') params[prefix+'U_b_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name=prefix+'U_b_z') params[prefix+'U_b_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name=prefix+'U_b_h') # dense params[prefix+'W_df'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,WDIM)).astype('float32'), name=prefix+'W_df') params[prefix+'W_db'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,WDIM)).astype('float32'), name=prefix+'W_db') #params[prefix+'b_df'] = theano.shared(np.zeros((WDIM)).astype('float32'), name=prefix+'b_df') #params[prefix+'b_db'] = theano.shared(np.zeros((WDIM)).astype('float32'), name=prefix+'b_db') prefix = 'w2s_' # f-GRU params[prefix+'W_f_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(WDIM,W2S_HDIM)).astype('float32'), name=prefix+'W_f_r') params[prefix+'W_f_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(WDIM,W2S_HDIM)).astype('float32'), name=prefix+'W_f_z') params[prefix+'W_f_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(WDIM,W2S_HDIM)).astype('float32'), name=prefix+'W_f_h') 
params[prefix+'b_f_r'] = theano.shared(np.zeros((W2S_HDIM)).astype('float32'), name=prefix+'b_f_r') params[prefix+'b_f_z'] = theano.shared(np.zeros((W2S_HDIM)).astype('float32'), name=prefix+'b_f_z') params[prefix+'b_f_h'] = theano.shared(np.zeros((W2S_HDIM)).astype('float32'), name=prefix+'b_f_h') params[prefix+'U_f_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(W2S_HDIM,W2S_HDIM)).astype('float32'), name=prefix+'U_f_r') params[prefix+'U_f_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(W2S_HDIM,W2S_HDIM)).astype('float32'), name=prefix+'U_f_z') params[prefix+'U_f_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(W2S_HDIM,W2S_HDIM)).astype('float32'), name=prefix+'U_f_h') # b-GRU params[prefix+'W_b_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(WDIM,W2S_HDIM)).astype('float32'), name=prefix+'W_b_r') params[prefix+'W_b_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(WDIM,W2S_HDIM)).astype('float32'), name=prefix+'W_b_z') params[prefix+'W_b_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(WDIM,W2S_HDIM)).astype('float32'), name=prefix+'W_b_h') params[prefix+'b_b_r'] = theano.shared(np.zeros((W2S_HDIM)).astype('float32'), name=prefix+'b_b_r') params[prefix+'b_b_z'] = theano.shared(np.zeros((W2S_HDIM)).astype('float32'), name=prefix+'b_b_z') params[prefix+'b_b_h'] = theano.shared(np.zeros((W2S_HDIM)).astype('float32'), name=prefix+'b_b_h') params[prefix+'U_b_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(W2S_HDIM,W2S_HDIM)).astype('float32'), name=prefix+'U_b_r') params[prefix+'U_b_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(W2S_HDIM,W2S_HDIM)).astype('float32'), name=prefix+'U_b_z') params[prefix+'U_b_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(W2S_HDIM,W2S_HDIM)).astype('float32'), name=prefix+'U_b_h') # dense params[prefix+'W_df'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(W2S_HDIM,SDIM)).astype('float32'), name=prefix+'W_df') params[prefix+'W_db'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(W2S_HDIM,SDIM)).astype('float32'), name=prefix+'W_db') #params[prefix+'b_df'] = theano.shared(np.zeros((SDIM)).astype('float32'), name=prefix+'b_df') #params[prefix+'b_db'] = theano.shared(np.zeros((SDIM)).astype('float32'), name=prefix+'b_db') return params
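# None of the recurrence code is shown in this excerpt. A brief hedged sketch of a
# single forward-GRU step using the 'c2w_' parameters above (standard GRU equations,
# written for one timestep of a batch); the backward GRU and the 'w2s_' level would
# follow the same pattern.
import theano.tensor as T

def c2w_f_gru_step(params, x_t, h_tm1, prefix='c2w_'):
    # x_t: (batch, CHAR_DIM) embedded characters, h_tm1: (batch, C2W_HDIM)
    r = T.nnet.sigmoid(T.dot(x_t, params[prefix+'W_f_r']) + T.dot(h_tm1, params[prefix+'U_f_r']) + params[prefix+'b_f_r'])
    z = T.nnet.sigmoid(T.dot(x_t, params[prefix+'W_f_z']) + T.dot(h_tm1, params[prefix+'U_f_z']) + params[prefix+'b_f_z'])
    h_tilde = T.tanh(T.dot(x_t, params[prefix+'W_f_h']) + T.dot(r * h_tm1, params[prefix+'U_f_h']) + params[prefix+'b_f_h'])
    return (1. - z) * h_tm1 + z * h_tilde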
FREQ_DICT['wrestle'] = 7*112.5 FREQ_DICT['resonate'] = 5*112.5 FREQ_DICT['seated'] = 3*112.5 FREQ_DICT['habitually'] = 1*112.5 ORDERED_FREQ = sorted(list(FREQ_DICT), key=lambda x:FREQ_DICT[x], reverse=True) def time_freq(freq): rehearsals = np.zeros((np.int(np.max(freq) * 113), len(freq))) for i in np.arange(len(freq)): temp = np.arange(np.int((freq[i]*112.5))) temp = temp * np.int(SEC_IN_TIME/(freq[i]*112.5)) rehearsals[:len(temp),i] = temp return(rehearsals.T) time = theano.shared(time_freq(FREQ), 'time') LEMMA_CHUNKS = [(actr.makechunk("", typename="word", form=word)) for word in ORDERED_FREQ] lex_decision.set_decmem({x: np.array([]) for x in LEMMA_CHUNKS}) lex_decision.goals = {} lex_decision.set_goal("g") lex_decision.productionstring(name="attend word", string=""" =g> isa goal state 'attend' =visual_location> isa _visuallocation ?visual>
def __init__(self, input=None, n_visible=784, n_hidden=500, W=None, hbias=None, vbias=None, numpy_rng=None, theano_rng=None): """ RBM initialization function Defines the parameters of the model along with basic operations for inferring hidden from visible (and vice-versa), as well as for performing Contrastive Divergence updates. :param input: None for standalone RBMs or symbolic variable if RBM is part of a larger graph. :param n_visible: number of visible units :param n_hidden: number of hidden units :param W: None for standalone RBMs or symbolic variable pointing to a shared weight matrix in case RBM is part of a DBN network; in a DBN, the weights are shared between RBMs and layers of a MLP :param hbias: None for standalone RBMs or symbolic variable pointing to a shared hidden units bias vector in case RBM is part of a different network :param vbias: None for standalone RBMs or a symbolic variable pointing to a shared visible units bias """ self.n_visible = n_visible self.n_hidden = n_hidden if numpy_rng is None: numpy_rng = numpy.random.RandomState(1234) if theano_rng is None: theano_rng = RandomStreams(numpy_rng.randint(2**30)) if W is None: initial_W = numpy.asarray(numpy_rng.uniform( low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)), high=4 * numpy.sqrt(6. / (n_hidden + n_visible)), size=(n_visible, n_hidden)), dtype=theano.config.floatX) W = theano.shared(value=initial_W, name='W', borrow=True) if hbias is None: hbias = theano.shared(value=numpy.zeros( n_hidden, dtype=theano.config.floatX), name='hbias', borrow=True) if vbias is None: vbias = theano.shared(value=numpy.zeros( n_visible, dtype=theano.config.floatX), name='vbias', borrow=True) self.input = input if not input: self.input = T.matrix('input') self.W = W self.hbias = hbias self.vbias = vbias self.theano_rng = theano_rng self.params = [self.W, self.hbias, self.vbias]
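# The docstring above mentions inferring hidden units from visible ones, but those
# operations are not part of this excerpt. A minimal sketch in the usual RBM
# formulation (sigmoid activations, binary sampling through the model's theano_rng),
# written as plain functions taking the RBM instance as first argument.
import theano
import theano.tensor as T

def propup(rbm, vis):
    # mean activation of the hidden units given the visible units
    pre_sigmoid = T.dot(vis, rbm.W) + rbm.hbias
    return [pre_sigmoid, T.nnet.sigmoid(pre_sigmoid)]

def sample_h_given_v(rbm, v0_sample):
    # sample binary hidden states from their conditional Bernoulli distribution
    pre_sigmoid_h1, h1_mean = propup(rbm, v0_sample)
    h1_sample = rbm.theano_rng.binomial(size=h1_mean.shape, n=1, p=h1_mean,
                                        dtype=theano.config.floatX)
    return [pre_sigmoid_h1, h1_mean, h1_sample]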
def init_params(n_chars=N_CHAR): ''' Initialize all params ''' params = OrderedDict() np.random.seed(0) # lookup table params['Wc'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(n_chars,CHAR_DIM)).astype('float32'), name='Wc') # f-GRU params['W_c2w_f_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name='W_c2w_f_r') params['W_c2w_f_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name='W_c2w_f_z') params['W_c2w_f_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name='W_c2w_f_h') params['b_c2w_f_r'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name='b_c2w_f_r') params['b_c2w_f_z'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name='b_c2w_f_z') params['b_c2w_f_h'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name='b_c2w_f_h') params['U_c2w_f_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name='U_c2w_f_r') params['U_c2w_f_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name='U_c2w_f_z') params['U_c2w_f_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name='U_c2w_f_h') # b-GRU params['W_c2w_b_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name='W_c2w_b_r') params['W_c2w_b_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name='W_c2w_b_z') params['W_c2w_b_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name='W_c2w_b_h') params['b_c2w_b_r'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name='b_c2w_b_r') params['b_c2w_b_z'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name='b_c2w_b_z') params['b_c2w_b_h'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name='b_c2w_b_h') params['U_c2w_b_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name='U_c2w_b_r') params['U_c2w_b_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name='U_c2w_b_z') params['U_c2w_b_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name='U_c2w_b_h') # dense params['W_c2w_df'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,WDIM)).astype('float32'), name='W_c2w_df') params['W_c2w_db'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,WDIM)).astype('float32'), name='W_c2w_db') params['b_c2w_df'] = theano.shared(np.zeros((WDIM)).astype('float32'), name='b_c2w_df') params['b_c2w_db'] = theano.shared(np.zeros((WDIM)).astype('float32'), name='b_c2w_db') return params
feats = 7 # number of input variables # generate a dataset: D = (input_values, target_class) D = (rng.randn(N, feats), 1 + rng.randn(N)) training_steps = 100 # Declare Theano symbolic variables x = T.dmatrix("x") y = T.dvector("y") # initialize the weight vector w randomly # # this and the following bias variable b # are shared so they keep their values # between training iterations (updates) w = theano.shared(rng.randn(feats), name="w") # initialize the bias term b = theano.shared(0., name="b") print("Initial model:") print(w.get_value()) print(b.get_value()) # Construct Theano expression graph y_pred = T.dot(x,w) - b prediction = y_pred cost = T.mean(T.sqr(y_pred - y)) gw, gb = T.grad(cost, [w, b]) # Compute the gradient of the cost # w.r.t weight vector w and # bias term b
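# The snippet ends after taking gradients. A hedged sketch, continuing the same
# snippet, of the usual next steps in Theano's introductory examples: compile a
# training function with gradient-descent updates, then iterate over the dataset.
train = theano.function(
    inputs=[x, y],
    outputs=[prediction, cost],
    updates=[(w, w - 0.1 * gw), (b, b - 0.1 * gb)])
predict = theano.function(inputs=[x], outputs=prediction)

for i in range(training_steps):
    pred, err = train(D[0], D[1])

print("Final model:")
print(w.get_value())
print(b.get_value())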
def __init__(self, input, n_in, n_hidden, n_out, activation=T.tanh, output_type='real'): self.input = input self.activation = activation self.output_type = output_type self.batch_size = T.iscalar() # theta is a vector of all trainable parameters # it represents the value of W, W_in, W_out, h0, bh, by theta_shape = n_hidden ** 2 + n_in * n_hidden + n_hidden * n_out + \ n_hidden + n_hidden + n_out self.theta = theano.shared( value=np.zeros(theta_shape, dtype=theano.config.floatX)) # Parameters are reshaped views of theta param_idx = 0 # pointer to somewhere along parameter vector # recurrent weights as a shared variable self.W = self.theta[param_idx:(param_idx + n_hidden**2)].reshape( (n_hidden, n_hidden)) self.W.name = 'W' W_init = np.asarray(np.random.uniform(size=(n_hidden, n_hidden), low=-0.01, high=0.01), dtype=theano.config.floatX) param_idx += n_hidden**2 # input to hidden layer weights self.W_in = self.theta[param_idx:(param_idx + n_in * \ n_hidden)].reshape((n_in, n_hidden)) self.W_in.name = 'W_in' W_in_init = np.asarray(np.random.uniform(size=(n_in, n_hidden), low=-0.01, high=0.01), dtype=theano.config.floatX) param_idx += n_in * n_hidden # hidden to output layer weights self.W_out = self.theta[param_idx:(param_idx + n_hidden * \ n_out)].reshape((n_hidden, n_out)) self.W_out.name = 'W_out' W_out_init = np.asarray(np.random.uniform(size=(n_hidden, n_out), low=-0.01, high=0.01), dtype=theano.config.floatX) param_idx += n_hidden * n_out self.h0 = self.theta[param_idx:(param_idx + n_hidden)] self.h0.name = 'h0' h0_init = np.zeros((n_hidden, ), dtype=theano.config.floatX) param_idx += n_hidden self.bh = self.theta[param_idx:(param_idx + n_hidden)] self.bh.name = 'bh' bh_init = np.zeros((n_hidden, ), dtype=theano.config.floatX) param_idx += n_hidden self.by = self.theta[param_idx:(param_idx + n_out)] self.by.name = 'by' by_init = np.zeros((n_out, ), dtype=theano.config.floatX) param_idx += n_out assert (param_idx == theta_shape) # for convenience self.params = [ self.W, self.W_in, self.W_out, self.h0, self.bh, self.by ] # shortcut to norms (for monitoring) self.l2_norms = {} for param in self.params: self.l2_norms[param] = T.sqrt(T.sum(param**2)) # initialize parameters # DEBUG_MODE gives division by zero error when we leave parameters # as zeros self.theta.set_value( np.concatenate([ x.ravel() for x in (W_init, W_in_init, W_out_init, h0_init, bh_init, by_init) ])) self.theta_update = theano.shared( value=np.zeros(theta_shape, dtype=theano.config.floatX)) # recurrent function (using tanh activation function) and arbitrary output # activation function def step(x_t, h_tm1): h_t = self.activation(T.dot(x_t, self.W_in) + \ T.dot(h_tm1, self.W) + self.bh) y_t = T.dot(h_t, self.W_out) + self.by return h_t, y_t # the hidden state `h` for the entire sequence, and the output for the # entire sequence `y` (first dimension is always time) # Note the implementation of weight-sharing h0 across variable-size # batches using T.ones multiplying h0 # Alternatively, T.alloc approach is more robust [self.h, self.y_pred], _ = theano.scan(step, sequences=self.input, outputs_info=[ T.alloc(self.h0, self.input.shape[1], n_hidden), None ]) # outputs_info=[T.ones(shape=(self.input.shape[1], # self.h0.shape[0])) * self.h0, None]) # L1 norm ; one regularization option is to enforce L1 norm to # be small self.L1 = 0 self.L1 += abs(self.W.sum()) self.L1 += abs(self.W_in.sum()) self.L1 += abs(self.W_out.sum()) # square of L2 norm ; one regularization option is to enforce # square of L2 norm to be small self.L2_sqr = 
0 self.L2_sqr += (self.W**2).sum() self.L2_sqr += (self.W_in**2).sum() self.L2_sqr += (self.W_out**2).sum() if self.output_type == 'real': self.loss = lambda y: self.mse(y) elif self.output_type == 'binary': # push through sigmoid self.p_y_given_x = T.nnet.sigmoid(self.y_pred[-1]) # apply sigmoid self.y_out = T.round(self.p_y_given_x) # round to {0,1} self.loss = lambda y: self.nll_binary(y) elif self.output_type == 'softmax': # push through softmax, computing vector of class-membership # probabilities in symbolic form # # T.nnet.softmax will not operate on T.tensor3 types, only matrices # We take our n_steps x n_seq x n_classes output from the net # and reshape it into a (n_steps * n_seq) x n_classes matrix # apply softmax, then reshape back y_p = self.y_pred y_p_m = T.reshape(y_p, (y_p.shape[0] * y_p.shape[1], -1)) y_p_s = T.nnet.softmax(y_p_m) self.p_y_given_x = T.reshape(y_p_s, y_p.shape) # compute prediction as class whose probability is maximal self.y_out = T.argmax(self.p_y_given_x, axis=-1) self.loss = lambda y: self.nll_multiclass(y) else: raise NotImplementedError
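# The loss lambdas above call mse / nll_binary / nll_multiclass, which are not shown
# in this excerpt. A brief sketch of how such methods are conventionally defined for
# this kind of RNN class, written here as plain functions taking the instance as
# first argument; `y` is the target tensor matching self.y_pred / self.y_out.
import theano.tensor as T

def mse(self, y):
    # mean squared error between predictions and real-valued targets
    return T.mean((self.y_pred - y) ** 2)

def nll_binary(self, y):
    # negative log-likelihood of Bernoulli targets under the sigmoid output
    return T.mean(T.nnet.binary_crossentropy(self.p_y_given_x, y))

def nll_multiclass(self, y):
    # mean negative log-probability of the correct class at each (step, sequence)
    p_y = self.p_y_given_x
    p_y_m = T.reshape(p_y, (p_y.shape[0] * p_y.shape[1], -1))
    y_f = y.flatten()
    return -T.mean(T.log(p_y_m)[T.arange(p_y_m.shape[0]), y_f])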