def __init__(self, nb_filters, stack_size, filter_height, filter_width,
             wide, name):
    """
    Construct a convolutional layer.

    `wide`:
        False: only apply filter to complete patches of the image.
            Generates output of shape: image_shape - filter_shape + 1
        True: zero-pads image to multiple of filter shape to generate
            output of shape: image_shape + filter_shape - 1
    """
    self.nb_filters = nb_filters
    self.stack_size = stack_size
    self.filter_height = filter_height
    self.filter_width = filter_width
    self.wide = wide
    self.name = name
    self.filter_shape = (nb_filters, stack_size, filter_height, filter_width)

    # number of inputs to each hidden unit
    fan_in = stack_size * filter_height * filter_width
    # each unit in the lower layer receives a gradient from
    # nb_filters * filter_height * filter_width units above
    fan_out = nb_filters * filter_height * filter_width
    drange = np.sqrt(6. / (fan_in + fan_out))

    # initialize filters with random values
    self.filters = create_shared(drange * random_weights(self.filter_shape),
                                 name + '__filters')
    self.bias = create_shared(np.zeros((nb_filters,)), name + '__bias')

    # parameters in the layer
    self.params = [self.filters, self.bias]
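# A quick check of the shape arithmetic described in the docstring above.
# `conv_output_shape` is a hypothetical helper, not part of the layer; it
# only illustrates the narrow ("valid") vs. wide ("full") output sizes.
def conv_output_shape(image_shape, filter_shape, wide):
    if wide:
        return tuple(i + f - 1 for i, f in zip(image_shape, filter_shape))
    return tuple(i - f + 1 for i, f in zip(image_shape, filter_shape))

print conv_output_shape((28, 28), (5, 5), wide=False)  # (24, 24)
print conv_output_shape((28, 28), (5, 5), wide=True)   # (32, 32)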
def __init__(self, nb_filters, stack_size, filter_height, wide, emb_dim,
             name):
    """
    1D convolutional layer: 1D row-wise convolution.
    Requires knowing the dimension of the embeddings.
    """
    self.nb_filters = nb_filters
    self.stack_size = stack_size
    self.filter_height = filter_height
    self.wide = wide
    self.emb_dim = emb_dim
    self.filter_shape = (emb_dim, nb_filters, stack_size, filter_height, 1)

    # _TODO_ check initialization
    # fan_in = in_fmaps * 1 * width
    # fan_out = out_fmaps * 1 * width
    # W_bound = numpy.sqrt(6. / (fan_in + fan_out))
    filters_values = np.asarray(
        np.random.normal(0, 0.05, size=self.filter_shape),
        dtype=theano.config.floatX
    )
    self.filters = create_shared(filters_values, name + '__filters')
    self.bias = create_shared(np.zeros((nb_filters, emb_dim)), name + '__bias')

    # parameters in the layer
    self.params = [self.filters, self.bias]
def __init__(self, nb_filters, stack_size, filter_height, filter_width,
             border_mode, stride, name):
    """
    Construct a convolutional layer.
    """
    self.nb_filters = nb_filters
    self.stack_size = stack_size
    self.filter_height = filter_height
    self.filter_width = filter_width
    self.border_mode = border_mode
    self.filter_shape = (nb_filters, stack_size, filter_height, filter_width)
    self.stride = stride
    self.name = name

    # number of inputs to each hidden unit
    fan_in = stack_size * filter_height * filter_width
    # each unit in the lower layer receives a gradient from
    # nb_filters * filter_height * filter_width units above
    fan_out = nb_filters * filter_height * filter_width
    drange = np.sqrt(6. / (fan_in + fan_out))

    # initialize filters with random values
    self.filters = create_shared(drange * random_weights(self.filter_shape),
                                 name + '__filters')
    self.bias = create_shared(np.ones((nb_filters,)) * 0.1, name + '__bias')

    # parameters in the layer
    self.params = [self.filters, self.bias]
def __init__(self, input_dim, output_dim, bias=True, activation='sigmoid',
             name='hidden_layer'):
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.has_bias = bias  # keep the flag separate from the shared bias variable
    self.name = name
    if activation is None:
        self.activation = None
    elif activation == 'tanh':
        self.activation = T.tanh
    elif activation == 'sigmoid':
        self.activation = T.nnet.sigmoid
    elif activation == 'softmax':
        self.activation = T.nnet.softmax
    elif activation == 'relu':
        self.activation = T.nnet.relu
    else:
        raise Exception("Unknown activation function: %s" % activation)

    # Initialize weights and bias (a small positive bias for ReLU units,
    # so they start out in the active regime)
    self.weights = create_shared(random_weights((input_dim, output_dim)),
                                 name + '__weights')
    if activation == 'relu':
        self.bias = create_shared(np.ones((output_dim,)) * 0.1, name + '__bias')
    else:
        self.bias = create_shared(np.zeros((output_dim,)), name + '__bias')

    # Define parameters
    if self.has_bias:
        self.params = [self.weights, self.bias]
    else:
        self.params = [self.weights]
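# Usage sketch. The class name `HiddenLayer` and applying the layer by
# hand are assumptions of this example, since no `link` method appears in
# this excerpt.
x = T.matrix('x')  # (batch, input_dim)
layer = HiddenLayer(100, 50, activation='relu', name='h1')
y = layer.activation(T.dot(x, layer.weights) + layer.bias)
forward = theano.function(inputs=[x], outputs=y)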
def trainer(X, Y, alpha, lr, predictions, updates, data, labels):
    data = U.create_shared(data, dtype=np.int8)
    labels = U.create_shared(labels, dtype=np.int8)
    index_start = T.lscalar('start')
    index_end = T.lscalar('end')
    print "Compiling function..."
    train_model = theano.function(
        inputs=[index_start, index_end, alpha, lr],
        outputs=T.mean(T.neq(T.argmax(predictions, axis=1), Y)),
        updates=updates,
        givens={
            X: data[index_start:index_end],
            Y: labels[index_start:index_end]
        }
    )
    test_model = theano.function(
        inputs=[index_start, index_end],
        outputs=T.mean(T.neq(T.argmax(predictions, axis=1), Y)),
        givens={
            X: data[index_start:index_end],
            Y: labels[index_start:index_end]
        }
    )
    print "Done."
    return train_model, test_model
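# Hypothetical driver loop for the compiled functions above; the batch
# size, epoch count, momentum (alpha) and learning rate values here are
# illustrative, not taken from the source.
train_model, test_model = trainer(X, Y, alpha, lr, predictions, updates,
                                  data, labels)
batch_size, n_examples = 64, len(labels)
for epoch in xrange(10):
    for start in xrange(0, n_examples, batch_size):
        err = train_model(start, min(start + batch_size, n_examples),
                          0.9, 0.01)
    print "epoch %d, last-batch error rate: %f" % (epoch, err)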
def __init__(self, input_dim, output_dim, bias=True, activation='sigmoid',
             name='hidden_layer'):
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.has_bias = bias  # keep the flag separate from the shared bias variable
    self.name = name
    if activation is None:
        self.activation = None
    elif activation == 'tanh':
        self.activation = T.tanh
    elif activation == 'sigmoid':
        self.activation = T.nnet.sigmoid
    elif activation == 'softmax':
        self.activation = T.nnet.softmax
    elif activation == 'relu':
        self.activation = T.nnet.relu
    else:
        raise Exception("Unknown activation function: %s" % activation)

    # Initialize weights and bias
    self.weights = create_shared(random_weights((input_dim, output_dim)),
                                 name + '__weights')
    self.bias = create_shared(np.zeros((output_dim,)), name + '__bias')

    # Define parameters
    if self.has_bias:
        self.params = [self.weights, self.bias]
    else:
        self.params = [self.weights]
def __init__(self, input_dim, hidden_dim, activation=T.nnet.sigmoid,
             with_batch=True, name='RNN'):
    """
    Initialize neural network.
    """
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.activation = activation
    self.with_batch = with_batch
    self.name = name

    # Randomly generate weights
    self.w_x = create_shared(random_weights((input_dim, hidden_dim)), name + '__w_x')
    self.w_h = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_h')

    # Initialize the bias vector and h_0 to zero vectors
    self.b_h = create_shared(np.zeros((hidden_dim,)), name + '__b_h')
    self.h_0 = create_shared(np.zeros((hidden_dim,)), name + '__h_0')

    # Define parameters
    self.params = [self.w_x, self.w_h, self.b_h, self.h_0]
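# The class above only allocates parameters. A sketch of the recurrence
# those parameters imply (the actual scan/link code is not in this
# excerpt, so the step below is an assumption); `rnn` is an instance of
# the class above:
#   h_t = activation(x_t . w_x + h_{t-1} . w_h + b_h)
def step(x_t, h_tm1):
    return rnn.activation(T.dot(x_t, rnn.w_x) + T.dot(h_tm1, rnn.w_h) + rnn.b_h)

x_seq = T.matrix('x_seq')  # (seq_len, input_dim)
h, _ = theano.scan(step, sequences=x_seq, outputs_info=rnn.h_0)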
def __init__(self, input_dim, hidden_dim, with_batch=True, name='LSTM'):
    """
    Initialize neural network.
    """
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.with_batch = with_batch
    self.name = name

    # Update gate weights and bias
    self.w_z = create_shared(random_weights((input_dim, hidden_dim)), name + '__w_z')
    self.u_z = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__u_z')
    self.b_z = create_shared(np.zeros((hidden_dim,)), name + '__b_z')

    # Reset gate weights and bias
    self.w_r = create_shared(random_weights((input_dim, hidden_dim)), name + '__w_r')
    self.u_r = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__u_r')
    self.b_r = create_shared(np.zeros((hidden_dim,)), name + '__b_r')

    # New memory content weights and bias
    self.w_c = create_shared(random_weights((input_dim, hidden_dim)), name + '__w_c')
    self.u_c = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__u_c')
    self.b_c = create_shared(np.zeros((hidden_dim,)), name + '__b_c')

    # Initialize the initial hidden state h_0 to the zero vector
    self.h_0 = create_shared(np.zeros((hidden_dim,)), name + '__h_0')

    # Define parameters
    self.params = [self.w_z, self.u_z, self.b_z,
                   self.w_r, self.u_r, self.b_r,
                   self.w_c, self.u_c, self.b_c,
                   self.h_0]
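# Despite the default name 'LSTM', the z/r/c parameters above are those of
# a GRU. A hedged sketch of the step they imply (the step function is not
# in this excerpt); `net` is an instance of the class above:
def step(x_t, h_tm1):
    z = T.nnet.sigmoid(T.dot(x_t, net.w_z) + T.dot(h_tm1, net.u_z) + net.b_z)
    r = T.nnet.sigmoid(T.dot(x_t, net.w_r) + T.dot(h_tm1, net.u_r) + net.b_r)
    c = T.tanh(T.dot(x_t, net.w_c) + T.dot(r * h_tm1, net.u_c) + net.b_c)
    return (1 - z) * h_tm1 + z * c  # interpolate old state and candidate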
def adadelta(parameters, gradients, rho=np.float32(0.95), eps=np.float32(1e-6)):
    gradients_sq = [U.create_shared(np.zeros(p.get_value().shape, dtype=np.float32))
                    for p in parameters]
    deltas_sq = [U.create_shared(np.zeros(p.get_value().shape, dtype=np.float32))
                 for p in parameters]
    gradients_sq_new = [rho * g_sq + (np.float32(1) - rho) * (g ** 2)
                        for g_sq, g in izip(gradients_sq, gradients)]
    deltas = [(T.sqrt(d_sq + eps) / T.sqrt(g_sq + eps)) * grad
              for d_sq, g_sq, grad in izip(deltas_sq, gradients_sq_new, gradients)]
    deltas_sq_new = [rho * d_sq + (np.float32(1) - rho) * (d ** 2)
                     for d_sq, d in izip(deltas_sq, deltas)]
    gradient_sq_updates = zip(gradients_sq, gradients_sq_new)
    deltas_sq_updates = zip(deltas_sq, deltas_sq_new)
    parameters_updates = [(p, p - d) for p, d in izip(parameters, deltas)]
    return gradient_sq_updates + deltas_sq_updates + parameters_updates
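# Hypothetical wiring of the update rule into a compiled training step;
# the tiny softmax-regression model below is illustrative only, not from
# the source.
x = T.matrix('x')
y = T.ivector('y')
W = U.create_shared(np.zeros((20, 5), dtype=np.float32))
b = U.create_shared(np.zeros(5, dtype=np.float32))
p = T.nnet.softmax(T.dot(x, W) + b)
cost = -T.mean(T.log(p)[T.arange(y.shape[0]), y])
params = [W, b]
train_step = theano.function([x, y], cost,
                             updates=adadelta(params, T.grad(cost, params)))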
def __init__(self, layers_in, layer_out):
    self.ins = layers_in
    self.out = layer_out
    self.Ws = [
        U.create_shared(U.initial_weights(inp.size, self.out.size))
        for inp in self.ins.layers
    ]
    self.bias = U.create_shared(np.zeros(self.out.size))
    self.updates = self.Ws + [self.bias]
def momentum(parameters, gradients, mu, eps):
    t = U.create_shared(1)
    # effective momentum follows the common ramp-up schedule
    # min(1 - 3/(t + 5), mu)
    m = (1 - 3.0 / (t + 5) < mu)
    mu = m * (1 - 3.0 / (t + 5)) + (1 - m) * mu
    deltas = [U.create_shared(np.zeros(p.get_value().shape)) for p in parameters]
    delta_nexts = [mu * delta + eps * grad
                   for delta, grad in zip(deltas, gradients)]
    delta_updates = [(delta, delta_next)
                     for delta, delta_next in zip(deltas, delta_nexts)]
    param_updates = [(param, param - delta_next)
                     for param, delta_next in zip(parameters, delta_nexts)]
    return delta_updates + param_updates + [(t, t + 1)]
def build_network(input_size, hidden_size):
    X = T.imatrix('X')
    W_input_to_hidden = U.create_shared(U.initial_weights(input_size, hidden_size))
    W_hidden_to_output = U.create_shared(U.initial_weights(hidden_size, input_size))  # unused: weights are tied below
    b_output = U.create_shared(U.initial_weights(input_size))
    hidden = T.nnet.sigmoid(T.dot(X, W_input_to_hidden))
    # decoder weights are tied to the encoder: W_out = W_in^T
    output = T.nnet.softmax(T.dot(hidden, W_input_to_hidden.T) + b_output)
    parameters = [W_input_to_hidden, b_output]
    return X, output, parameters
def construct_network(context, characters, hidden, mult_hidden):
    print "Setting up memory..."
    X = T.bvector('X')
    Y = T.bvector('Y')
    alpha = T.cast(T.fscalar('alpha'), dtype=theano.config.floatX)
    lr = T.cast(T.fscalar('lr'), dtype=theano.config.floatX)

    print "Initialising weights..."
    W_char_hidden = U.create_shared(U.initial_weights(characters, hidden))
    f_char_hidden = U.create_shared(U.initial_weights(characters, mult_hidden))
    b_hidden = U.create_shared(U.initial_weights(hidden))
    Wf_hidden = U.create_shared(U.initial_weights(hidden, mult_hidden))
    fW_hidden = U.create_shared(U.initial_weights(mult_hidden, hidden))
    W_hidden_predict = U.create_shared(U.initial_weights(hidden, characters))
    b_predict = U.create_shared(U.initial_weights(characters))

    print "Constructing graph..."
    hidden = make_hidden(hidden, W_char_hidden[X], f_char_hidden[X],
                         Wf_hidden, fW_hidden, b_hidden)
    predictions = T.nnet.softmax(T.dot(hidden, W_hidden_predict) + b_predict)
    weights = [W_char_hidden, f_char_hidden, b_hidden,
               Wf_hidden, fW_hidden, W_hidden_predict, b_predict]

    # mean cross-entropy of the correct next character
    cost = -T.mean(T.log(predictions)[T.arange(Y.shape[0]), Y])
    gparams = T.grad(cost, weights)

    # classical momentum: delta <- alpha * delta + lr * grad; param <- param - delta
    deltas = [U.create_shared(np.zeros(w.get_value().shape)) for w in weights]
    updates = [(param, param - (alpha * delta + gparam * lr))
               for param, delta, gparam in zip(weights, deltas, gparams)] + \
              [(delta, alpha * delta + gparam * lr)
               for delta, gparam in zip(deltas, gparams)]
    return X, Y, alpha, lr, updates, predictions, weights
def __init__(self, visible, hidden, **kwargs):
    kwargs['lambda_2'] = 0.0
    self.v = visible
    self.h = hidden
    inputs = self.v.size
    outputs = self.h.size
    super(RBM, self).__init__(inputs, outputs, **kwargs)
    self.h_bias = self.bias
    self.h_bias_delta = self.bias_delta
    self.v_bias = U.create_shared(np.zeros(self.v.size))
    self.v_bias_delta = U.create_shared(np.zeros(self.v.size))
    self.tunables += [self.v_bias]
    self.deltas += [self.v_bias_delta]
def __init__(self, input_dim, hidden_dim, name='LSTM'):
    """
    Initialize neural network.
    """
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.name = name

    # fused weights for the four gates, stored side by side and sliced
    # apart in the step function
    self.W = create_shared(random_weights((input_dim, hidden_dim * 4)),
                           name + '__W')
    self.U = create_shared(random_weights((hidden_dim, hidden_dim * 4)),
                           name + '__U')
    self.b = create_shared(random_weights((hidden_dim * 4,)), name + '__b')
    self.c_0 = create_shared(np.zeros((hidden_dim,)), name + '__c_0')
    self.h_0 = create_shared(np.zeros((hidden_dim,)), name + '__h_0')

    self.params = [self.W, self.U, self.b]
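# How the fused (hidden_dim * 4)-wide matrices are typically consumed; the
# gate ordering below is an assumption, since the step function is not in
# this excerpt. `lstm` is an instance of the class above.
def _slice(x, n, dim):
    return x[n * dim:(n + 1) * dim]  # n-th block of width `dim`

def step(x_t, h_tm1, c_tm1):
    preact = T.dot(x_t, lstm.W) + T.dot(h_tm1, lstm.U) + lstm.b
    i = T.nnet.sigmoid(_slice(preact, 0, lstm.hidden_dim))   # input gate
    f = T.nnet.sigmoid(_slice(preact, 1, lstm.hidden_dim))   # forget gate
    o = T.nnet.sigmoid(_slice(preact, 2, lstm.hidden_dim))   # output gate
    c_t = f * c_tm1 + i * T.tanh(_slice(preact, 3, lstm.hidden_dim))
    return o * T.tanh(c_t), c_t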
def rmsprop(parameters, gradients, discount=0.95, momentum=0.9,
            learning_rate=1e-4, epsilon=1e-4):
    #gradients = [ (g < -clip)*(-clip) + (g > clip)*(clip) + (abs(g) <= clip) * g for g in gradients ]
    sq_acc = [U.create_shared(np.zeros(p.get_value().shape, dtype=theano.config.floatX))
              for p in parameters]
    acc = [U.create_shared(np.zeros(p.get_value().shape, dtype=theano.config.floatX))
           for p in parameters]
    delta_acc = [U.create_shared(np.zeros(p.get_value().shape, dtype=theano.config.floatX))
                 for p in parameters]

    # running averages of the squared gradient and the gradient
    sq_avg = [discount * sq_a + (1 - discount) * g ** 2
              for sq_a, g in izip(sq_acc, gradients)]
    avg = [discount * a + (1 - discount) * g for a, g in izip(acc, gradients)]

    # normalize by an estimate of the gradient's standard deviation,
    # sqrt(E[g^2] - E[g]^2 + eps), as in Graves-style RMSProp
    scaled_grads = [g / T.sqrt(sq_a - a ** 2 + epsilon)
                    for g, a, sq_a in izip(gradients, acc, sq_acc)]
    deltas = [momentum * d_a + learning_rate * s_g
              for d_a, s_g in izip(delta_acc, scaled_grads)]

    sq_acc_updates = [(sq_a, sq_aa) for sq_a, sq_aa in izip(sq_acc, sq_avg)]
    acc_updates = [(a, aa) for a, aa in izip(acc, avg)]
    delta_updates = [(d_a, d) for d_a, d in izip(delta_acc, deltas)]
    parameters_updates = [(p, p - d) for p, d in izip(parameters, deltas)]
    return parameters_updates + acc_updates + sq_acc_updates + delta_updates
def make_hidden_outputs(inputs, W):
    h0 = U.create_shared(np.zeros((HIDDEN,)))

    def step(score_t, self_tm1, W):
        return T.nnet.sigmoid(score_t + T.dot(self_tm1, W))

    activation_probs, _ = theano.scan(
        step,
        sequences=inputs,
        outputs_info=h0,
        non_sequences=W
    )
    return activation_probs
def __init__(self, inputs, outputs,
             lr=0.1,
             batch_size=10,
             max_epochs=100000,
             momentum=0.5,
             validation=0.1,
             lambda_2=0.001,
             lr_min=0.1):
    self.momentum = momentum
    self.lr = lr
    self.lr_min = lr_min
    self.batch_size = batch_size
    self.validation = validation
    self.max_epochs = max_epochs
    self.lambda_2 = lambda_2
    self.W = U.create_shared(U.initial_weights(inputs, outputs))
    self.W_delta = U.create_shared(np.zeros((inputs, outputs)))
    self.bias = U.create_shared(np.zeros(outputs))
    self.bias_delta = U.create_shared(np.zeros(outputs))
    self.tunables = [self.W, self.bias]
    self.deltas = [self.W_delta, self.bias_delta]
def build_network(input_size, hidden_size):
    X = T.dmatrix('X')
    W_input_to_hidden = U.create_shared(U.initial_weights(input_size, hidden_size))
    W_hidden_to_hidden = U.create_shared(U.initial_weights(hidden_size, hidden_size))
    b_hidden = U.create_shared(U.initial_weights(hidden_size))
    initial_hidden = U.create_shared(U.initial_weights(hidden_size))
    # reproduction weights are tied to the forward weights:
    # W_hidden_to_hidden_reproduction = W_hidden_to_hidden.T
    b_hidden_reproduction = U.create_shared(U.initial_weights(hidden_size))
    W_hidden_to_input_reproduction = W_input_to_hidden.T  # tied weights
    b_input_reproduction = U.create_shared(U.initial_weights(input_size))
    parameters = [
        W_input_to_hidden,
        W_hidden_to_hidden,
        b_hidden,
        initial_hidden,
        b_hidden_reproduction,
        b_input_reproduction,
    ]
    hidden, hidden1_reproduction, input_reproduction = make_rae(
        X, W_input_to_hidden, W_hidden_to_hidden, b_hidden,
        initial_hidden, b_hidden_reproduction, b_input_reproduction)
    unrolled = unroll(hidden[-1], W_input_to_hidden, W_hidden_to_hidden,
                      b_hidden_reproduction, b_input_reproduction,
                      hidden.shape[0])
    return X, parameters, hidden, hidden1_reproduction, input_reproduction, unrolled
def fit(self, X, Y=None):
    print "Splitting validation and training set..."
    training_count = int(X.shape[0] * (1 - self.validation))
    validate_count = X.shape[0] - training_count
    n_train_batches = int(math.ceil(training_count / float(self.batch_size)))

    print "Setting up shared training memory..."
    train_x = U.create_shared(X[:training_count])
    valid_x = U.create_shared(X[training_count:])
    if Y is not None:
        train_y = T.cast(U.create_shared(Y[:training_count]), 'int32')
        valid_y = T.cast(U.create_shared(Y[training_count:]), 'int32')
    else:
        train_y = valid_y = None

    print "Total examples:", X.shape[0]
    print "train examples:", training_count
    print "valid examples:", validate_count
    print "batches:       ", n_train_batches
    print "batch size:    ", self.batch_size
    self.train(*self.prepare_functions(n_train_batches, train_x, valid_x,
                                       train_y, valid_y))
def make_hidden_predict_outputs(hidden_size, characters_size, inputs, gen_mask,
                                W_i, b_i, W_o, b_o, W_pred, b_pred, W_back):
    h0 = U.create_shared(np.zeros(hidden_size))
    p0 = U.create_shared(np.zeros(characters_size))

    def step(score_t, gm, hidden_1, predict_1,
             W_i, b_i, W_o, b_o, W_pred, b_pred, W_back):
        hidden = T.nnet.sigmoid(
            # (T.dot(hidden_1, W_i) + b_i) +
            (1 - gm) * (T.dot(hidden_1, W_i) + b_i) +
            gm * (T.dot(hidden_1, W_o) + b_o) +
            T.dot(predict_1, W_back) +
            score_t
        )
        predict = T.nnet.softmax(T.dot(hidden, W_pred) + b_pred)[0]
        return hidden, predict

    [hidden_, predict_], _ = theano.scan(
        step,
        sequences=[inputs, gen_mask],
        outputs_info=[h0, p0],
        non_sequences=[W_i, b_i, W_o, b_o, W_pred, b_pred, W_back]
    )
    return hidden_, predict_
def make_hidden(hidden_size, add_ins, mult_ins, Wf, fW, b):
    h0 = U.create_shared(np.zeros(hidden_size))

    def step(add_in, mult_in, hidden_1, Wf, fW, b):
        # factored multiplicative interaction: the effective
        # hidden-to-hidden matrix (Wf * mult_in) . fW is modulated by the
        # current input
        mult_W = T.dot(Wf * mult_in, fW)
        hidden_score = add_in + T.dot(hidden_1, mult_W) + b
        return T.nnet.sigmoid(hidden_score)

    hidden, _ = theano.scan(
        step,
        sequences=[add_ins, mult_ins],
        outputs_info=[h0],
        non_sequences=[Wf, fW, b]
    )
    return hidden
def build_network(input_size, hidden_size):
    srng = RandomStreams(seed=12345)
    X = T.fmatrix('X')
    W_input_to_hidden1 = U.create_shared(U.initial_weights(input_size, hidden_size))
    b_hidden1 = U.create_shared(U.initial_weights(hidden_size))
    W_hidden1_to_output = U.create_shared(U.initial_weights(hidden_size))
    b_output = U.create_shared(U.initial_weights(1)[0])

    def network(training):
        hidden1 = T.dot(X, W_input_to_hidden1) + b_hidden1
        hidden1 = hidden1 * (hidden1 > 0)  # ReLU
        if training:
            # dropout: keep each unit with probability 0.5
            hidden1 = hidden1 * srng.binomial(size=(hidden_size,), p=0.5)
        else:
            # at test time, scale activations by the keep probability
            hidden1 = 0.5 * hidden1
        output = T.nnet.sigmoid(T.dot(hidden1, W_hidden1_to_output) + b_output)
        return output

    parameters = [W_input_to_hidden1, b_hidden1, W_hidden1_to_output, b_output]
    return X, network(True), network(False), parameters
def __init__(self, input_dim, output_dim, name='embedding_layer'):
    """
    Typically, input_dim is the vocabulary size, and output_dim the
    embedding dimension.
    """
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.name = name

    # Randomly generate weights
    self.embeddings = create_shared(
        random_weights((input_dim, output_dim)),
        self.name + '__embeddings'
    )

    # Define parameters
    self.params = [self.embeddings]
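# Lookup sketch: embeddings are retrieved by integer indexing into the
# shared matrix. The class name `EmbeddingLayer` and direct indexing are
# assumptions of this example, since no `link` method appears here.
word_ids = T.ivector('word_ids')
emb = EmbeddingLayer(10000, 128, name='emb')
vectors = emb.embeddings[word_ids]    # shape: (n_words, 128)
lookup = theano.function([word_ids], vectors)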
        row += 1
        prev += i
    return dense


def sparse_dot(l, prev, values, W):
    row_data = values[T.arange(prev, prev + l)]
    row_weights = W[row_data[:, 0]]
    sum_weights = T.sum(row_weights * row_data[:, 1].reshape((l, 1)), axis=0)
    return sum_weights, prev + l


if __name__ == "__main__":
    M = [[(1, 2), (5, 3), (10, 1)],
         [(0, 2), (3, 1)],
         [(2, 2), (8, 4)]]
    index = T.ivector('index')
    values = T.imatrix('values')
    prev = T.iscalar('prev')
    initial_weights = U.initial_weights(11, 3)
    W = U.create_shared(initial_weights)
    [output, _], updates = theano.scan(
        sparse_dot,
        sequences=index,
        outputs_info=[None, prev],
        non_sequences=[values, W]
    )
    f = theano.function(inputs=[index, values, prev], outputs=output)
    ind, val = to_sparse_array(M)
    print ind, val
    print f(ind, val, 0)
def __init__(self, input_dim, hidden_dim, output_emb_dim, output_dim,
             with_batch=True, name='LSTM'):
    """
    Initialize neural network.
      - input_dim: dimension of input vectors
      - hidden_dim: dimension of hidden vectors
      - output_emb_dim: dimension of output embeddings
      - output_dim: number of possible outputs
    """
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.output_emb_dim = output_emb_dim
    self.output_dim = output_dim
    self.with_batch = with_batch
    self.name = name

    # Input gate weights
    self.w_xi = create_shared(random_weights((input_dim, hidden_dim)), name + '__w_xi')
    self.w_hi = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_hi')
    self.w_yi = create_shared(random_weights((output_emb_dim, hidden_dim)), name + '__w_yi')
    self.w_ci = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_ci')

    # Forget gate weights
    self.w_xf = create_shared(random_weights((input_dim, hidden_dim)), name + '__w_xf')
    self.w_hf = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_hf')
    self.w_yf = create_shared(random_weights((output_emb_dim, hidden_dim)), name + '__w_yf')
    self.w_cf = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_cf')

    # Output gate weights
    self.w_xo = create_shared(random_weights((input_dim, hidden_dim)), name + '__w_xo')
    self.w_ho = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_ho')
    self.w_yo = create_shared(random_weights((output_emb_dim, hidden_dim)), name + '__w_yo')
    self.w_co = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_co')

    # Cell weights
    self.w_xc = create_shared(random_weights((input_dim, hidden_dim)), name + '__w_xc')
    self.w_hc = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_hc')
    self.w_yc = create_shared(random_weights((output_emb_dim, hidden_dim)), name + '__w_yc')

    # Initialize the bias vectors, c_0 and h_0 to zero vectors
    self.b_i = create_shared(np.zeros((hidden_dim,)), name + '__b_i')
    self.b_f = create_shared(np.zeros((hidden_dim,)), name + '__b_f')
    self.b_c = create_shared(np.zeros((hidden_dim,)), name + '__b_c')
    self.b_o = create_shared(np.zeros((hidden_dim,)), name + '__b_o')
    self.c_0 = create_shared(np.zeros((hidden_dim,)), name + '__c_0')
    self.h_0 = create_shared(np.zeros((hidden_dim,)), name + '__h_0')
    # self.y_0 = create_shared(np.zeros((output_emb_dim,)), name + '__y_0')

    # Weights for projection to final output, and output embeddings
    self.embeddings = create_shared(random_weights((output_dim + 1, output_emb_dim)),
                                    name + '__embeddings')
    self.weights = create_shared(random_weights((hidden_dim, output_dim)),
                                 name + '__weights')
    self.bias = create_shared(random_weights((output_dim,)), name + '__bias')

    # Define parameters
    self.params = [self.w_xi, self.w_hi, self.w_yi, self.w_ci,
                   self.w_xf, self.w_hf, self.w_yf, self.w_cf,
                   self.w_xo, self.w_ho, self.w_yo, self.w_co,
                   self.w_xc, self.w_hc, self.w_yc,
                   self.b_i, self.b_c, self.b_o, self.b_f,
                   self.c_0, self.h_0,  # self.y_0,
                   self.embeddings, self.weights, self.bias]
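# Design note (my reading of the parameters, not stated in the source): in
# addition to the usual x_t / h_{t-1} inputs, every gate above also
# receives the embedding of the previous output (the w_y* matrices), which
# makes this a decoder-style LSTM that conditions on its own predictions.
# The output embedding table has output_dim + 1 rows; the extra row
# presumably encodes an initial/start symbol (cf. the commented-out y_0).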
def link(self, input):
    """
    Propagate the input through the network and return the last hidden
    vector. The whole sequence is also accessible through self.h
    """
    def recurrence_strength(j, s_tm1_i, current_sum, _, d_t, u_t):
        s_t_i = T.maximum(0, s_tm1_i - T.maximum(0, u_t - current_sum))
        return current_sum + s_tm1_i, T.switch(T.eq(j, 0), d_t, s_t_i)

    def recurrence_read(s_t_i, v_t_i, current_sum, current_read):
        new_read = T.minimum(s_t_i, T.maximum(0, 1 - current_sum)) * v_t_i
        return current_sum + s_t_i, current_read + new_read

    def recurrence(i, x_t, r_tm1, h_tm1, strengths, values):
        updates = {}

        # Controller - compute O'_t
        controller_input = T.concatenate([x_t, r_tm1, h_tm1])
        controller_output = T.tanh(
            T.dot(controller_input, self.w_xrh_hop) + self.b_xrh_hop
        )  # _TODO_ tanh?
        h_t = controller_output[:self.rnn_hidden_dim]
        op_t = controller_output[self.rnn_hidden_dim:]

        # Compute d_t (push signal), u_t (pop signal), v_t (value vector)
        # and o_t (network output)
        d_t = T.nnet.sigmoid(T.dot(op_t, self.w_op_d) + self.b_op_d)[0]
        u_t = T.nnet.sigmoid(T.dot(op_t, self.w_op_u) + self.b_op_u)[0]
        v_t = T.tanh(T.dot(op_t, self.w_op_v) + self.b_op_v)
        o_t = T.tanh(T.dot(op_t, self.w_op_o) + self.b_op_o)

        # Add new value to the stack
        updates[values] = T.set_subtensor(values[i], v_t)

        # Compute new strength
        previous_strength = T.switch(T.eq(i, 0), [np.float32(0)],
                                     strengths[i - 1][:i])
        [_, new_strength], _ = theano.scan(
            fn=recurrence_strength,
            outputs_info=[np.float32(0), np.float32(0)],
            sequences=[T.arange(i + 1),
                       T.concatenate([[np.float32(0)], previous_strength[::-1]])],
            non_sequences=[d_t, u_t]
        )
        new_strength = new_strength[::-1]
        updates[strengths] = T.set_subtensor(strengths[i, :i + 1], new_strength)

        # Compute new read vector
        [_, r_t], _ = theano.scan(
            fn=recurrence_read,
            outputs_info=[np.float32(0),
                          np.zeros(self.values_dim).astype(np.float32)],
            sequences=[new_strength[:i + 1][::-1],
                       T.concatenate([values[:i + 1],
                                      v_t.reshape((1, self.values_dim))],
                                     axis=0)[::-1]]
        )
        r_t = r_t[-1]
        return [r_t, h_t, o_t], updates

    # _TODO_ change the maxsize
    strengths = create_shared(np.zeros((100, 100)), 'strengths')
    values = create_shared(np.zeros((100, self.values_dim)), 'values')
    [r, h, o], updates = theano.scan(
        fn=recurrence,
        sequences=[T.arange(input.shape[0]), input],
        outputs_info=[self.r_0, self.h_0, None],
        non_sequences=[strengths, values]
    )
    return [r, h, o], updates
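# Reading of the two inner scans (my interpretation, not stated in the
# source): they implement the continuous stack of Grefenstette et al.
# (2015), "Learning to Transduce with Unbounded Memory":
#   s_t[i] = max(0, s_{t-1}[i] - max(0, u_t - sum_{j>i} s_{t-1}[j]))  for i < t
#   s_t[t] = d_t                                                      (new push)
#   r_t    = sum_i min(s_t[i], max(0, 1 - sum_{j>i} s_t[j])) * v[i]
# where d_t is the push strength, u_t the pop strength, and v the stored
# value vectors; both scans therefore traverse the stack top-down.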
import theano
import math
import pickle
import theano.tensor as T
import numpy as np
import utils as U
from theano import sparse
from scipy.sparse import csr_matrix


def shared_sparse(arr):
    data = arr.data
    indices = arr.indices
    indptr = arr.indptr
    shape = np.array(arr.shape)
    return sparse.CSR(data, indices, indptr, shape)


if __name__ == "__main__":
    training_data = shared_sparse(csr_matrix(np.eye(100)))
    # training_labels = pickle.load(open('tags.train.data', 'r'))
    # NOTE: W's first dimension must match the input width; the toy
    # identity input above is 100-wide, while the original data was
    # presumably (71165, 26920):
    # W = U.create_shared(U.initial_weights(71165, 26920))
    W = U.create_shared(U.initial_weights(100, 26920))
    out = theano.dot(training_data, W)
    f = theano.function(inputs=[], outputs=out)
    print f()
import theano
import theano.tensor as T
import numpy as np
import utils as U
from numpy_hinton import print_arr
from theano.printing import Print

W1 = U.create_shared(U.initial_weights(10, 10))
W2 = U.create_shared(U.initial_weights(10, 10))
b = U.create_shared(U.initial_weights(10))
X = T.dmatrix('X')


def pair_combine(X):
    def step(i, inputs):
        length = inputs.shape[0]
        next_level = T.dot(inputs[T.arange(0, length - i - 1)], W1) + \
                     T.dot(inputs[T.arange(1, length - i)], W2) + b
        next_level = next_level * (next_level > 0)
        #next_level = inputs[T.arange(0,length-i-1)] + inputs[T.arange(1,length-i)]
        #next_level = theano.printing.Print('inputs')(next_level)
        return T.concatenate([next_level,
                              T.zeros_like(inputs[:length - next_level.shape[0]])])

    combined, _ = theano.scan(
        step,
        sequences=[T.arange(X.shape[0])],
        outputs_info=[X],
        n_steps=X.shape[0] - 1
    )
    return combined[-1, 0], combined[0][:-1]


combined, pairwise = pair_combine(X)
f = theano.function(
    inputs=[X],
    outputs=[combined, pairwise]
)
def __init__(self, size):
    super(Recurrent, self).__init__(size)
    self.W = U.create_shared(U.initial_weights(size, size))
    self.h0 = U.create_shared(np.zeros((size,)))
    self.updates = [self.W]
def __init__(self, input_dim, rnn_hidden_dim, rnn_output_dim, values_dim,
             output_dim, name='stack'):
    """
    Initialize neural network.
    """
    self.input_dim = input_dim
    self.rnn_hidden_dim = rnn_hidden_dim
    self.rnn_output_dim = rnn_output_dim
    self.values_dim = values_dim
    self.output_dim = output_dim
    self.name = name

    # Generate weights and bias to compute the push scalar (d_t), the pop
    # scalar (u_t), the value vector (v_t), and the network output (o_t)
    # Weights
    self.w_op_d = create_shared(random_weights((rnn_output_dim, 1)), name + '__w_op_d')
    self.w_op_u = create_shared(random_weights((rnn_output_dim, 1)), name + '__w_op_u')
    self.w_op_v = create_shared(random_weights((rnn_output_dim, values_dim)), name + '__w_op_v')
    self.w_op_o = create_shared(random_weights((rnn_output_dim, output_dim)), name + '__w_op_o')
    # Bias
    self.b_op_d = create_shared(np.zeros((1,)), name + '__b_op_d')
    self.b_op_u = create_shared(np.zeros((1,)), name + '__b_op_u')
    self.b_op_v = create_shared(np.zeros((values_dim,)), name + '__b_op_v')
    self.b_op_o = create_shared(np.zeros((output_dim,)), name + '__b_op_o')

    # RNN controller weights
    self.w_xrh_hop = create_shared(
        random_weights((input_dim + values_dim + rnn_hidden_dim,
                        rnn_hidden_dim + rnn_output_dim)),
        name + '__w_xrh_hop')
    self.b_xrh_hop = create_shared(np.zeros((rnn_hidden_dim + rnn_output_dim,)),
                                   name + '__b_xrh_hop')

    # Initial hidden states H_0 - H_t = (h_t, r_t, (v_t, s_t))
    self.h_0 = create_shared(np.zeros((rnn_hidden_dim,)), name + '__h_0')
    self.r_0 = create_shared(np.zeros((values_dim,)), name + '__r_0')
    # self.v_0 = create_shared(np.zeros((values_dim,)), name + '__v_0')
    # self.s_0 = create_shared(np.zeros((1,)), name + '__s_0')

    # Define parameters
    self.params = [self.w_op_d, self.w_op_u, self.w_op_v, self.w_op_o,
                   self.b_op_d, self.b_op_u, self.b_op_v, self.b_op_o,
                   self.w_xrh_hop, self.b_xrh_hop,
                   self.h_0]  # _TODO_ check this (why not put r_0, s_0, v_0)
        outputs=T.mean(T.neq(T.argmax(predictions, axis=1), Y)),
        updates=updates,
        givens={
            X: data,
            Y: labels,
        }
    )
    return train_model


if __name__ == '__main__':
    print "Setting up memory..."
    X = T.bmatrix('X')
    Y = T.bvector('Y')
    Ws_char_to_hidden = [
        U.create_shared(U.initial_weights(CHARACTERS, HIDDEN), name='yeah%d' % i)
        for i in xrange(CONTEXT)
    ]
    b_hidden = U.create_shared(U.initial_weights(HIDDEN))
    W_hidden_to_hidden = U.create_shared(U.initial_weights(HIDDEN, HIDDEN))
    W_hidden_to_predict = U.create_shared(U.initial_weights(HIDDEN, CHARACTERS))
    b_predict = U.create_shared(U.initial_weights(CHARACTERS))
    tunables = Ws_char_to_hidden + [
        b_hidden,
        W_hidden_to_hidden,
        W_hidden_to_predict,
        b_predict
    ]

    print "Constructing graph..."
    hidden_inputs = make_hidden_inputs(X, Ws_char_to_hidden, b_hidden)
    hidden_outputs = make_hidden_outputs(hidden_inputs, W_hidden_to_hidden)
    predictions = make_predictions(hidden_outputs, W_hidden_to_predict, b_predict)
                    type=int, action="store", help="number of parallel jobs")
args = parser.parse_args()

student_name = args.user
rg_name = utils.get_student_resource_group(student_name)
storage_account = utils.get_student_storage_account(student_name)
region = utils.get_student_region(student_name)

vm_size = "Standard_E4_v3"
RESIZE_OS_DISK = False
OS_DISK_SIZE = 511

if args.create_shared:
    utils.create_shared(rg_name, region)


def create_cluster_node(idx, user_pass):
    IP_NAME = "ip_cluster{0}".format(idx)
    NIC_NAME = "nic_cluster{0}".format(idx)
    INT_DNS_NAME = "cluster{0}".format(idx)
    OS_DISK_NAME = "cluster{0}_os_disk".format(idx)
    VM_NAME = INT_DNS_NAME
    IP = "10.0.1.2{0}".format(idx)
    if idx != 1:
        IP_NAME = None
    if args.create_aux:
        # create public IP
def __init__(self, input_dim, hidden_dim, with_batch=True, name='LSTM'):
    """
    Initialize neural network.
    """
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.with_batch = with_batch
    self.name = name

    # Input gate weights
    self.w_xi = create_shared(random_weights((input_dim, hidden_dim)), name + '__w_xi')
    self.w_hi = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_hi')
    self.w_ci = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_ci')

    # Forget gate weights
    self.w_xf = create_shared(random_weights((input_dim, hidden_dim)), name + '__w_xf')
    self.w_hf = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_hf')
    self.w_cf = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_cf')

    # Output gate weights
    self.w_xo = create_shared(random_weights((input_dim, hidden_dim)), name + '__w_xo')
    self.w_ho = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_ho')
    self.w_co = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_co')

    # Cell weights
    self.w_xc = create_shared(random_weights((input_dim, hidden_dim)), name + '__w_xc')
    self.w_hc = create_shared(random_weights((hidden_dim, hidden_dim)), name + '__w_hc')

    # Initialize the bias vectors, c_0 and h_0 to zero vectors
    self.b_i = create_shared(np.zeros((hidden_dim,)), name + '__b_i')
    self.b_f = create_shared(np.zeros((hidden_dim,)), name + '__b_f')
    self.b_c = create_shared(np.zeros((hidden_dim,)), name + '__b_c')
    self.b_o = create_shared(np.zeros((hidden_dim,)), name + '__b_o')
    self.c_0 = create_shared(np.zeros((hidden_dim,)), name + '__c_0')
    self.h_0 = create_shared(np.zeros((hidden_dim,)), name + '__h_0')

    # Define parameters. The peephole weights (w_ci, w_cf, w_co) are
    # allocated above but left out of the trained parameter list (see the
    # commented entries):
    self.params = [self.w_xi, self.w_hi,  # self.w_ci,
                   self.w_xf, self.w_hf,  # self.w_cf,
                   self.w_xo, self.w_ho,  # self.w_co,
                   self.w_xc, self.w_hc,
                   self.b_i, self.b_c, self.b_o, self.b_f,
                   self.c_0, self.h_0]
parser = argparse.ArgumentParser()
parser.add_argument("--create_shared", action="store_true",
                    help="create shared resources")
parser.add_argument("--create_aux", action="store_true",
                    help="create aux resources, only once per script run")
args = parser.parse_args()

VM_SIZE = "Standard_NC6"
RESIZE_OS_DISK = False
OS_DISK_SIZE = 1023

if args.create_shared:
    utils.create_shared(RG_NAME, REGION, VNET_NAME, NSG_NAME, SUBNET_NAME)

IP_NAME = "ip_ubuntugpu"
NIC_NAME = "nic_ubuntugpu"
INT_DNS_NAME = UBUNTUGPU_VM
OS_DISK_NAME = "ubuntugpu_os_disk"
IP = "10.0.1.10"

if args.create_aux:
    # create public IP
    utils.create_public_ip(IP_NAME, RG_NAME)
    # Create network card with fixed private IP
    utils.create_nic_with_private_ip(NIC_NAME, RG_NAME, VNET_NAME, SUBNET_NAME,
                                     NSG_NAME, IP_NAME, INT_DNS_NAME, IP)