def main3():
    rng = numpy.random.RandomState(23455)

    # a list of documents, each a list of sentences, each a 2-D float matrix
    docList = ttList.TypedListType(
        ttList.TypedListType(
            TensorType(theano.config.floatX, (False, False))))("docList")
    docLabel = T.ivector('docLabel')

    layer0 = DocEmbeddingNN(docList, rng, 4)
    layer1 = HiddenLayer(
        rng,
        input=layer0.output,
        n_in=layer0.outputDimension,
        n_out=10,
        activation=T.tanh
    )
    layer2 = LogisticRegression(input=layer1.output, n_in=10, n_out=10)

    cost = layer2.negative_log_likelihood(docLabel)
    params = layer2.params + layer1.params + layer0.params
    grads = T.grad(cost, params)

    f = theano.function([docList], layer2.y_pred)

    a = [
        [
            [[2, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
            [[1, 2, 4, 4], [1, 2, 3, 4]]
        ],
        [
            [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
            [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]
        ]
    ]
    print f(a)
    print "All finished!"
def sgd_predict(dataset=DataHome, batch_size=28): """ Demonstrate stochastic gradient descent optimization of a log-linear model This is demonstrated on MNIST. :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: the path of the MNIST dataset file from http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz """ logistic_regression_model_pkl = open(train_model_route, "r") logistic_regression_model_state = cPickle.load(logistic_regression_model_pkl) W, b = logistic_regression_model_state datasets = load_data.load_data(dataset) test_set_x, test_set_y = datasets[2] n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size ###################### # BUILD ACTUAL MODEL # ###################### # print '... building the model' # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix("x") # the data is presented as rasterized images y = T.ivector("y") # the labels are presented as 1D vector of # [int] labels # construct the logistic regression class # Each MNIST image has size 28*28 classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10, W=W, b=b) # the cost we minimize during training is the negative log likelihood of # the model in symbolic format cost = classifier.negative_log_likelihood(y) # compiling a Theano function that computes the mistakes that are made by # the model on a minibatch test_results = theano.function( inputs=[index], outputs=classifier.y_pred, givens={x: test_set_x[index * batch_size : (index + 1) * batch_size]} ) test_res = [test_results(i) for i in xrange(n_test_batches)] print test_res
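# A minimal sketch (an assumption, not the original training script) of how the
# (W, b) state read back by sgd_predict could be produced: pickle the trained
# classifier's parameters to train_model_route so they can be unpacked as
# `W, b = logistic_regression_model_state` above. Whether the shared variables
# themselves or their numpy values should be stored depends on what the local
# LogisticRegression accepts for its W/b arguments.
def save_logreg_state(classifier, train_model_route):
    import cPickle
    with open(train_model_route, 'wb') as f:
        cPickle.dump((classifier.W, classifier.b), f,
                     protocol=cPickle.HIGHEST_PROTOCOL)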
class MLP(object): def __init__(self, rng, input, n_in, n_hidden, n_out): # hidden layer, defined in HiddenLayer.py self.hiddenLayer = HiddenLayer(rng = rng, input = input, n_in = n_in, n_out = n_hidden, activation = T.tanh) # output layer, logistic regression self.logRegressionLayer = LogisticRegression(input = self.hiddenLayer.output, n_in = n_hidden, n_out = n_out) # Regularization of params # option 1: L1 regularization of params self.L1 = abs(self.hiddenLayer.W).sum() \ + abs(self.logRegressionLayer.W).sum() self.L2_sqr = (self.hiddenLayer.W **2).sum() \ + (self.logRegressionLayer.W **2).sum() # Define the log likelihood, errors based on component models self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood self.errors = self.logRegressionLayer.errors self.params = self.hiddenLayer.params + \ self.logRegressionLayer.params def __getstate__(self): """ Return the hidden layer and logistic regression layer that make up the MLP. """ return (self.hiddenLayer,self.logRegressionLayer) def __setstate__(self, state): """ Re-establish the hidden layer and logistic regression layer objects. """ (hiddenLayer,logRegressionLayer) = state self.hiddenLayer = hiddenLayer self.logRegressionLayer = logRegressionLayer def reconstruct_state(self,input, activation): """ Re-establish the inputs for each layer of the MLP. """ self.hiddenLayer.reconstruct_state(input,activation) self.logRegressionLayer.reconstruct_state(self.hiddenLayer.output) self.L1 = abs(self.hiddenLayer.W).sum() \ + abs(self.logRegressionLayer.W).sum() self.L2_sqr = (self.hiddenLayer.W **2).sum() \ + (self.logRegressionLayer.W **2).sum() # Define the log likelihood, errors based on component models self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood self.errors = self.logRegressionLayer.errors self.params = self.hiddenLayer.params + \ self.logRegressionLayer.params
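# Hedged usage sketch for the __getstate__/__setstate__/reconstruct_state hooks
# above: pickle the whole MLP, then rebuild its symbolic graph after loading.
# The path argument and the fresh T.matrix input are illustrative assumptions,
# not part of the original code.
def save_mlp(mlp, path):
    import cPickle
    with open(path, 'wb') as f:
        cPickle.dump(mlp, f, protocol=cPickle.HIGHEST_PROTOCOL)

def load_mlp(path, activation=T.tanh):
    import cPickle
    with open(path, 'rb') as f:
        mlp = cPickle.load(f)   # restores hiddenLayer / logRegressionLayer
    x = T.matrix('x')           # new symbolic input for the rebuilt graph
    mlp.reconstruct_state(x, activation)
    return mlp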
def build_network(self,nkerns=[20, 50], batch_size=10): rng = numpy.random.RandomState(23455) layer0_input = self.x.reshape((batch_size, 1, 128, 128)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) layer0 = LeNetConvPoolLayer( rng, input=layer0_input, image_shape=(batch_size, 1, 128, 128), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2) ) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) layer1 = LeNetConvPoolLayer( rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 62, 62), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2) ) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), # or (500, 50 * 4 * 4) = (500, 800) with the default values. layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer( rng, input=layer2_input, n_in=nkerns[1] * 29 * 29, n_out=500, activation=T.tanh ) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=23) self.cost = layer3.negative_log_likelihood(self.y) self.out_layer=layer3 '''batches = numpy.zeros((10, 128*128),dtype='float32') label=numpy.zeros((10)) print layer2.output.shape.eval({self.x:batches})''' self.feature_layer=layer2 self.params_except_3=layer2.params + layer1.params + layer0.params self.params = layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters self.grads = T.grad(self.cost, self.params)
def classify_lenet5(learning_rate=0.005, n_epochs=8000, image_path='D:/dev/datasets/isbi/train-input/train-input_0000.tif', paramfile='lenet0_membrane_epoch_25100.pkl.gz', nkerns=[20, 50], batch_size=1): rng = numpy.random.RandomState(23455) # allocate symbolic variables for the data index_x = T.lscalar() # index to a [mini]batch index_y = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ishape = (28, 28) # this is the size of MNIST images ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input = x.reshape((batch_size, 1, 28, 28)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) # the TanhLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (20,32*4*4) = (20,512) layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=2) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y)
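# Small helper spelling out the shape arithmetic quoted in the comments above
# (valid convolution followed by non-overlapping max-pooling): 28 -> 24 -> 12
# after layer0 and 12 -> 8 -> 4 after layer1 for the MNIST configuration.
def conv_pool_output_size(input_size, filter_size, pool_size):
    conv_size = input_size - filter_size + 1  # valid convolution
    return conv_size // pool_size             # non-overlapping pooling

assert conv_pool_output_size(28, 5, 2) == 12
assert conv_pool_output_size(12, 5, 2) == 4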
def build_model_example(batch_size, learning_rate, rng, x, y): nkerns = [64] print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input = _dropout_from_layer(rng, x.reshape((batch_size, 7, 4, 7, 7)), p=0.2) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) layer0 = LeNetConvPool3dLayer(rng, input=layer0_input, image_shape=(batch_size, 7, 4, 7, 7), filter_shape=(nkerns[0], 5, 4, 5, 5), ) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) # layer1 = LeNetConvPoolLayer(rng, input=layer0.output, # image_shape=(batch_size, nkerns[0], 12, 12), # filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (20,32*4*4) = (20,512) layer1_input = layer0.output.flatten(2) # layer1 = HiddenLayer(rng, input=x, n_in=4*7*7*7, layer1 = HiddenLayer(rng, input=layer1_input, n_in=nkerns[-1] * 3 ** 3, n_out=1000, activation=relu, ) # p=0.5) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer1.output, n_in=1000, n_out=1000, activation=relu) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=1000, n_out=2) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params # params = layer3.params + layer0.params + layer1.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i],grads[i]) pairs. updates = [] for param_i, grad_i in zip(params, grads): updates.append((param_i, param_i - learning_rate * grad_i)) return cost, layer3, updates
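# Hedged sketch of how the (cost, layer3, updates) triple returned by
# build_model_example might be compiled into a minibatch training step, in the
# same style as the other training scripts in this file. train_set_x and
# train_set_y are assumed to be Theano shared variables holding the data.
def compile_train_step(cost, updates, x, y, train_set_x, train_set_y, batch_size):
    index = T.lscalar()  # minibatch index
    return theano.function(
        [index], cost, updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]})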
def compileFun(model_name, dataset_name, pooling_mode): print "model_name: ", model_name print "dataset_name: ", dataset_name print "pooling_mode: ", pooling_mode print "Started!" rng = numpy.random.RandomState(23455) sentenceWordCount = T.ivector("sentenceWordCount") corpus = T.matrix("corpus") # docLabel = T.ivector('docLabel') # for list-type data layer0 = DocEmbeddingNNOneDoc(corpus, sentenceWordCount, rng, wordEmbeddingDim=200, \ sentenceLayerNodesNum=100, \ sentenceLayerNodesSize=[5, 200], \ docLayerNodesNum=100, \ docLayerNodesSize=[3, 100], pooling_mode=pooling_mode) layer1_output_num = 100 layer1 = HiddenLayer( rng, input=layer0.output, n_in=layer0.outputDimension, n_out=layer1_output_num, activation=T.tanh ) layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2) cost = layer2.negative_log_likelihood(1 - layer2.y_pred) # calculate sentence sentence_score sentence_grads = T.grad(cost, layer0.sentenceResults) sentence_score = T.diag(T.dot(sentence_grads, T.transpose(layer0.sentenceResults))) # construct the parameter array. params = layer2.params + layer1.params + layer0.params # Load the parameters last time, optionally. model_path = "data/" + dataset_name + "/" + model_name + "/" + pooling_mode + ".model" loadParamsVal(model_path, params) print "Compiling computing graph." output_model = theano.function( [corpus, sentenceWordCount], [layer2.y_pred, sentence_score] ) print "Compiled." return output_model
def depickle(self,n_in = 48* 48,hidden_layers_sizes=[1000,1000,1000]): numpy_rng=numpy.random.RandomState(123) self.sigmoid_layers = [] self.rbm_layers = [] last_later = self.x cur_index= 0 theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 30)) for param in self.params: print param.get_value().shape for index in range(self.n_layers): if index == 0: input_size= n_in layer_input = self.x else: input_size = hidden_layers_sizes[index- 1] layer_input =self.sigmoid_layers[-1].output simple_sigmoid=HiddenLayer(1, input = layer_input,n_in = input_size,n_out=1000,W = self.params[cur_index], b = self.params[cur_index+1],activation=T.nnet.sigmoid) print self.params[cur_index].get_value().shape print self.params[cur_index+1].get_value().shape cur_index+= 2 self.sigmoid_layers.append(simple_sigmoid) rbm_layer = RBM(numpy_rng=numpy_rng,input=layer_input, W=simple_sigmoid.W, n_visible=input_size, n_hidden=hidden_layers_sizes[index], hbias=simple_sigmoid.b) self.rbm_layers.append(rbm_layer) self.logLayer = LogisticRegression(input = self.sigmoid_layers[-1].output,n_in=hidden_layers_sizes[-1], n_out=7) self.logLayer.W = self.params[cur_index] self.logLayer.b = self.params[cur_index+1] self.y_pred = self.logLayer.p_y_given_x
def __init__(self, rng, input, n_in, n_hidden, n_out): # hidden layer, defined in HiddenLayer.py self.hiddenLayer = HiddenLayer(rng = rng, input = input, n_in = n_in, n_out = n_hidden, activation = T.tanh) # output layer, logistic regression self.logRegressionLayer = LogisticRegression(input = self.hiddenLayer.output, n_in = n_hidden, n_out = n_out) # Regularization of params # option 1: L1 regularization of params self.L1 = abs(self.hiddenLayer.W).sum() \ + abs(self.logRegressionLayer.W).sum() self.L2_sqr = (self.hiddenLayer.W **2).sum() \ + (self.logRegressionLayer.W **2).sum() # Define the log likelihood, errors based on component models self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood self.errors = self.logRegressionLayer.errors self.params = self.hiddenLayer.params + \ self.logRegressionLayer.params
def __init__(self, rng, input, n_in, n_hidden, n_out,discriminant_threshold): self.hiddenLayer = HiddenLayer( rng=rng, input=input, n_in=n_in, n_out=n_hidden, activation=T.tanh ) self.logRegressionLayer = LogisticRegression( input=self.hiddenLayer.output, n_in=n_hidden, n_out=n_out, discriminant_threshold = discriminant_threshold ) self.L1 = ( abs(self.hiddenLayer.W).sum() + abs(self.logRegressionLayer.W).sum() ) self.L2_sqr = ( (self.hiddenLayer.W ** 2).sum() + (self.logRegressionLayer.W ** 2).sum() ) self.negative_log_likelihood = ( self.logRegressionLayer.negative_log_likelihood ) self.errors = self.logRegressionLayer.errors self.params = self.hiddenLayer.params + self.logRegressionLayer.params self.layers = [self.hiddenLayer, self.logRegressionLayer] self.ypred = self.logRegressionLayer.y_pred self.py_given_x = self.logRegressionLayer.p_y_given_x
def __init__(self, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10, numpy_rng=None, theano_rng=None): self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) if numpy_rng is None: numpy_rng = numpy.random.RandomState(1234) if theano_rng is None: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) self.x = T.matrix('x') self.y = T.ivector('y') for i in xrange(self.n_layers): if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) self.sigmoid_layers.append(sigmoid_layer) self.params.extend(sigmoid_layer.params) rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) self.errors = self.logLayer.errors(self.y)
def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10, corruption_levels=[0.1, 0.1]): """ numpy_rng: numpy.random.RandomState: numpy random number generator used to draw initial weights theano_rng: theano.tensor.shared_randomstreams.RandomStreams: Theano random generator; if None is given one is generated based on a seed drawn from `rng` n_ins: int: dimension of the input to the sDAE n_layers_sizes: list of ints: intermediate layers size, must contain at least one value n_outs: int: dimension of the output of the network corruption_levels: list of float: amount of corruption to use for each layer """ self.sigmoid_layers = [] self.DAE_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) # Symbolic variables for data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector of [int] labels # The SDAE is an MLP, for which all weights of intermediate layers are shared with a different denoising autoencoders. We will first construct the # SDAE as a deep multilayer perceptron, and when constructing each sigmoidal layer we also construct a denoising autoencoder that shares weights with that layer # - During pretraining we will train these autoencoders (which will lead to changing the weights of the MLP as well) # - During finetunining we will finish training the SDAE by doing stochastic gradient descent on the MLP # Build an MLP and a DAE in parallel to each other with sharing weights. layer by layer for i in xrange(self.n_layers): if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output_data # hidden layer (logistic) sigmoid_layer = HiddenLayer(rng=numpy_rng, input_data=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) self.sigmoid_layers.append(sigmoid_layer) # add the layer to our list of layers self.params.extend(sigmoid_layer.params) # We are going to only declare that the parameters of the sigmoid_layers are parameters of the StackedDAA # the visible biases in the DAE are parameters of those DAE, but not the SDAE # DAE layer with shared weights with MLP layer DAE_layer = DAE(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, bhid=sigmoid_layer.b) # shared weights and biases self.DAE_layers.append(DAE_layer) # Finally, Add a logistic layer at the end of the MLP self.logisticLayer = LogisticRegression(input_data=self.sigmoid_layers[-1].output_data, n_in=hidden_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logisticLayer.params) self.finetune_cost = self.logisticLayer.negative_log_likelihood(self.y) self.errors = self.logisticLayer.errors(self.y)
def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10, corruption_levels=[0.1, 0.1]): self.sigmoid_layers = [] self.dA_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) self.x = T.matrix('x') self.y = T.ivector('y') for i in xrange(self.n_layers): if i == 0: input_size = n_ins layer_input = self.x else: input_size = hidden_layers_sizes[i - 1] layer_input = self.sigmoid_layers[i - 1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) self.sigmoid_layers.append(sigmoid_layer) self.params.extend(sigmoid_layer.params) dA_layer = dA(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, bhid=sigmoid_layer.b) self.dA_layers.append(dA_layer) self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) self.errors = self.logLayer.errors(self.y)
def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10, corruption_levels=[0.1, 0.1]): self.sigmoid_layers = [] self.dA_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) #symbolic variables for data self.x = T.matrix('x') self.y = T.ivector('y') for i in xrange(self.n_layers): if i == 0: input_size = n_ins layer_input = self.x else: input_size = hidden_layers_sizes[i-1] layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) #add to our list of layers self.sigmoid_layers.append(sigmoid_layer) #declare thar parameters of sigmoid layers are parameters of StackedDAA #the visible biases in the dA are parameters of those dA but not the SdA self.params.extend(sigmoid_layer.params) #construct denoising autoencoder that shared weights with this layer dA_layer = dA(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, bhid=sigmoid_layer.b) self.dA_layers.append(dA_layer) #add logistic layer on top of MLP self.logLayer = LogisticRegression(input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) #function that implements fine tuning #compute cost for second phase of training, defined as negative log likelihood self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) #compute gradients w.r.t. model parameters #symbolic variable for no. of errors made on minibatch given by self.x and #self.y self.errors = self.logLayer.errors(self.y)
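# Hedged sketch of layer-wise pretraining for the stacked dA built above,
# patterned on the standard deeplearning.net tutorial. It assumes each dA
# exposes get_cost_updates(corruption_level, learning_rate); if the local dA
# class differs, adapt accordingly.
def pretraining_functions(self, train_set_x, batch_size):
    index = T.lscalar('index')                # minibatch index
    corruption_level = T.scalar('corruption')
    learning_rate = T.scalar('lr')
    batch_begin = index * batch_size
    batch_end = batch_begin + batch_size
    pretrain_fns = []
    for dA_layer in self.dA_layers:
        cost, updates = dA_layer.get_cost_updates(corruption_level, learning_rate)
        fn = theano.function(
            inputs=[index, theano.Param(corruption_level, default=0.2),
                    theano.Param(learning_rate, default=0.1)],
            outputs=cost, updates=updates,
            givens={self.x: train_set_x[batch_begin:batch_end]})
        pretrain_fns.append(fn)
    return pretrain_fns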
def __init__(self, n_ins=1024, hidden_layers_sizes=[500, 500], n_outs=10,
             numpy_rng=None):
    self.sigmoid_layers = []
    self.rbm_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)

    assert self.n_layers > 0

    # random number generator used to initialize the weights
    if numpy_rng is None:
        numpy_rng = numpy.random.RandomState(1234)

    # allocate symbolic variables for the data
    self.x = T.matrix('x')    # the data is presented as rasterized images
    self.y = T.ivector('y')   # the labels are presented as 1D vector
                              # of [int] labels

    for i in xrange(self.n_layers):
        # construct the sigmoidal layer
        if i == 0:
            input_size = n_ins
            layer_input = self.x
        else:
            input_size = hidden_layers_sizes[i - 1]
            layer_input = self.sigmoid_layers[-1].output

        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    activation=T.nnet.sigmoid)
        self.sigmoid_layers.append(sigmoid_layer)
        self.params.extend(sigmoid_layer.params)

        # Construct an RBM that shares weights with this layer
        rbm_layer = RBM(input=layer_input,
                        n_visible=input_size,
                        n_hidden=hidden_layers_sizes[i],
                        W=sigmoid_layer.W,
                        hbias=sigmoid_layer.b)
        self.rbm_layers.append(rbm_layer)

    # We now need to add a logistic layer on top of the MLP
    self.logLayer = LogisticRegression(
        input=self.sigmoid_layers[-1].output,
        n_in=hidden_layers_sizes[-1],
        n_out=n_outs)
    self.params.extend(self.logLayer.params)

    # compute the cost for second phase of training, defined as the
    # negative log likelihood of the logistic regression (output) layer
    self.finetune_cost = self.logLayer.neg_log_hood(self.y)

    # compute the gradients with respect to the model parameters
    # symbolic variable that points to the number of errors made on the
    # minibatch given by self.x and self.y
    self.errors = self.logLayer.errors(self.y)
def change_lastlayer(self, n_ins, n_outs):
    # Replace the output layer so the pretrained stack can be reused for a
    # task with a different number of classes.
    self.logLayer_b = LogisticRegression(
        input=self.sigmoid_layers[-1].output,
        n_in=n_ins,
        n_out=n_outs
    )
    # Drop the old output layer's W and b from the parameter list and add
    # the new layer's parameters instead.
    self.params_b.pop()
    self.params_b.pop()
    self.params_b.extend(self.logLayer_b.params)
    self.finetune_cost_b = self.logLayer_b.negative_log_likelihood(self.y)
    self.errors_b = self.logLayer_b.errors(self.y)
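# Hedged usage sketch for change_lastlayer: retarget a pretrained stack to a
# task with a different number of classes, then fine-tune via params_b and
# finetune_cost_b. `net`, `n_hidden_last` and `learning_rate` are illustrative
# names, not defined in the original code.
def retarget_and_get_updates(net, n_hidden_last, n_new_classes, learning_rate):
    net.change_lastlayer(n_ins=n_hidden_last, n_outs=n_new_classes)
    grads = T.grad(net.finetune_cost_b, net.params_b)
    return [(p, p - learning_rate * g) for p, g in zip(net.params_b, grads)]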
def __init__(self,rng, batch_size=100, input_size=None, nkerns=[4,4,4], receptive_fields=((2,8),(2,8),(2,8)), poolsizes=((1,8),(1,8),(1,4)),full_hidden=16, n_out=10): """ """ self.x = T.matrix(name='x',dtype=theano.config.floatX) # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector of self.batch_size = theano.shared(value=batch_size,name='batch_size')#T.lscalar('batch_size') self.layers=[] self.params=[] for i in range(len(nkerns)): receptive_field=receptive_fields[i] if i==0: featmap_size_after_downsample=input_size layeri_input = self.x.reshape((batch_size, 1, featmap_size_after_downsample[0], featmap_size_after_downsample[1])) image_shape=(batch_size, 1, featmap_size_after_downsample[0], featmap_size_after_downsample[1]) filter_shape=(nkerns[i], 1, receptive_field[0], receptive_field[1]) else: layeri_input=self.layers[i-1].output image_shape=(batch_size, nkerns[i-1], featmap_size_after_downsample[0], featmap_size_after_downsample[1]) filter_shape=(nkerns[i], nkerns[i-1], receptive_field[0], receptive_field[1]) layeri = LeNetConvPoolLayer(rng=rng, input=layeri_input, image_shape=image_shape, filter_shape=filter_shape, poolsize=poolsizes[i]) featmap_size_after_conv=get_featmap_size_after_conv(featmap_size_after_downsample,receptive_fields[i]) featmap_size_after_downsample=get_featmap_size_after_downsample(featmap_size_after_conv,poolsizes[i]) self.layers.append(layeri) self.params.extend(layeri.params) # fully connected layer print 'going to fully connected layer' layer_full_input = self.layers[-1].output.flatten(2) # construct a fully-connected sigmoidal layer layer_full = HiddenLayer(rng=rng, input=layer_full_input, n_in=nkerns[-1] * featmap_size_after_downsample[0] * featmap_size_after_downsample[1], n_out=full_hidden, activation=T.tanh) self.layers.append(layer_full) self.params.extend(layer_full.params) # classify the values of the fully-connected sigmoidal layer print 'going to output layer' self.logRegressionLayer = LogisticRegression(input=self.layers[-1].output, n_in=full_hidden, n_out=n_out) self.params.extend(self.logRegressionLayer.params) # the cost we minimize during training is the NLL of the model self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood(self.y) self.cost = self.logRegressionLayer.negative_log_likelihood(self.y) self.errors = self.logRegressionLayer.errors(self.y) self.y_pred = self.logRegressionLayer.y_pred
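# The two feature-map-size helpers used above are not shown in this snippet; a
# minimal sketch of what they are assumed to compute (valid convolution, then
# non-overlapping pooling, over a (rows, cols) feature-map size):
def get_featmap_size_after_conv(featmap_size, receptive_field):
    return (featmap_size[0] - receptive_field[0] + 1,
            featmap_size[1] - receptive_field[1] + 1)

def get_featmap_size_after_downsample(featmap_size, poolsize):
    return (featmap_size[0] // poolsize[0],
            featmap_size[1] // poolsize[1])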
def load_trained_model():
    global if_load_trained_model
    global train_model_route
    global classifier
    global validate_model
    global validate_results

    if_load_trained_model = 1
    print "loading trained model for the first time"
    trained_model_pkl = open(train_model_route, 'r')
    trained_model_state_list = cPickle.load(trained_model_pkl)
    trained_model_state_array = numpy.load(trained_model_pkl)
    classifier_state = trained_model_state_array[0]

    classifier = LogisticRegression(input=x, n_in=in_shape,
                                    n_out=layer0_output_shape,
                                    W=classifier_state[0],
                                    b=classifier_state[1])

    # definition for theano.function
    validate_model = theano.function(inputs=[x, y],
                                     outputs=classifier.errors(y))
    validate_results = theano.function(inputs=[x], outputs=classifier.y_pred)
def reconstruct_loglayer(self, n_outs=10):
    """ Reconstruct a logistic layer on top of a previously trained SdA """
    # We now need to add a logistic layer on top of the MLP
    self.logLayer = LogisticRegression(
        input=self.dA_layers[-1].output,
        n_in=self.dA_layers[-1].n_hidden,
        n_out=n_outs)
    self.params.extend(self.logLayer.params)

    # construct a function that implements one step of finetuning
    # compute the cost for second phase of training,
    # defined as the negative log likelihood
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
    # compute the gradients with respect to the model parameters
    # symbolic variable that points to the number of errors made on the
    # minibatch given by self.x and self.y
    self.errors = self.logLayer.errors(self.y)
def __init__(self, rng, input, n_in, n_hidden, n_out):
    """Initialize the parameters for the multilayer perceptron

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type input: theano.tensor.TensorType
    :param input: symbolic variable that describes the input of the
                  architecture (one minibatch)

    :type n_in: int
    :param n_in: number of input units, the dimension of the space in
                 which the datapoints lie

    :type n_hidden: int
    :param n_hidden: number of hidden units

    :type n_out: int
    :param n_out: number of output units, the dimension of the space in
                  which the labels lie
    """
    # Since we are dealing with a one hidden layer MLP, this will
    # translate into a TanhLayer connected to the LogisticRegression
    # layer; this can be replaced by a SigmoidalLayer, or a layer
    # implementing any other nonlinearity
    self.hiddenLayer = HiddenLayer(
        rng=rng,
        input=input,
        n_in=n_in,
        n_out=n_hidden,
        activation=T.tanh)

    # The logistic regression layer gets as input the hidden units
    # of the hidden layer
    self.logRegressionLayer = LogisticRegression(
        input=self.hiddenLayer.output,
        n_in=n_hidden,
        n_out=n_out)

    # the parameters of the model are the parameters of the two layers it
    # is made out of
    self.params = self.hiddenLayer.params + self.logRegressionLayer.params
def Buildnet(params, nkerns=[20, 50], batch_size=500): rng = numpy.random.RandomState(23455) datasets = load_data(0) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ishape = (28, 28) # this is the size of MNIST images ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input = x.reshape((batch_size, 1, 28, 28)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (20,32*4*4) = (20,512) layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=3) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) f = theano.function( inputs=[index], outputs=[layer2.output, layer3.y_pred, y], givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) # numepoch = len(params) layer3.W.set_value(params[-1][0]) layer3.b.set_value(params[-1][1]) layer2.W.set_value(params[-1][2]) layer2.b.set_value(params[-1][3]) layer1.W.set_value(params[-1][4]) layer1.b.set_value(params[-1][5]) layer0.W.set_value(params[-1][6]) layer0.b.set_value(params[-1][7]) outputvectors = numpy.zeros((10000, 500)) labels = numpy.zeros((10000, 1)) reallabels = numpy.zeros((10000, 1)) for minibatch_index in xrange(n_test_batches): vector, label, reallabel = f(minibatch_index) outputvectors[minibatch_index * batch_size:(minibatch_index + 1) * batch_size] = vector labels[minibatch_index * batch_size:(minibatch_index + 1) * batch_size, 0] = label reallabels[minibatch_index * batch_size:(minibatch_index + 1) * batch_size, 0] = reallabel return [outputvectors, labels, reallabels]
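# Hedged usage sketch for the arrays returned by Buildnet: compare the
# predicted labels against the ground truth and report test accuracy. `params`
# is the list of saved parameter snapshots Buildnet expects; only its last
# entry is used above.
def report_test_accuracy(params):
    outputvectors, labels, reallabels = Buildnet(params)
    print 'test accuracy: %f' % numpy.mean(labels == reallabels)
    return outputvectors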
def __init__(self, rng, input, n_in, n_hidden, n_out): """Initialize the parameters for the multilayer perceptron :type rng: numpy.random.RandomState :param rng: a random number generator used to initialize weights :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the architecture (one minibatch) :type n_in: int :param n_in: number of input units, the dimension of the space in which the datapoints lie :type n_hidden: int :param n_hidden: number of hidden units :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie """ # Since we are dealing with a one hidden layer MLP, this will translate # into a HiddenLayer with a tanh activation function connected to the # LogisticRegression layer; the activation function can be replaced by # sigmoid or any other nonlinear function self.hiddenLayer = HiddenLayer(rng=rng, input=input, n_in=n_in, n_out=n_hidden, activation=T.tanh) # The logistic regression layer gets as input the hidden units # of the hidden layer self.logRegressionLayer = LogisticRegression( input=self.hiddenLayer.output, n_in=n_hidden, n_out=n_out, rng=rng) # end-snippet-2 start-snippet-3 # L1 norm ; one regularization option is to enforce L1 norm to # be small self.L1 = (abs(self.hiddenLayer.W).sum() + abs(self.logRegressionLayer.W).sum()) # square of L2 norm ; one regularization option is to enforce # square of L2 norm to be small self.L2_sqr = ((self.hiddenLayer.W**2).sum() + (self.logRegressionLayer.W**2).sum()) # negative log likelihood of the MLP is given by the negative # log likelihood of the output of the model, computed in the # logistic regression layer self.negative_log_likelihood = ( self.logRegressionLayer.negative_log_likelihood) # same holds for the function computing the number of errors self.errors = self.logRegressionLayer.errors # the parameters of the model are the parameters of the two layer it is # made out of self.params = self.hiddenLayer.params + self.logRegressionLayer.params # end-snippet-3 # keep track of model input self.input = input
def __init__(self, rng, input, n_in, n_hidden, n_out): """Initialize the parameters for the multilayer perceptron :type rng: numpy.random.RandomState :param rng: a random number generator used to initialize weights :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the architecture (one minibatch) :type n_in: int :param n_in: number of input units, the dimension of the space in which the datapoints lie :type n_hidden: int :param n_hidden: number of hidden units :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie """ # Since we are dealing with a one hidden layer MLP, this will translate # into a HiddenLayer with a tanh activation function connected to the # LogisticRegression layer; the activation function can be replaced by # sigmoid or any other nonlinear function self.hiddenLayer = HiddenLayer(rng=rng, input=input, n_in=n_in, n_out=n_hidden, activation=T.tanh) # The logistic regression layer gets as input the hidden units # of the hidden layer self.logRegressionLayer = LogisticRegression( input=self.hiddenLayer.output, n_in=n_hidden, n_out=n_out) # L1 norm ; one regularization option is to enforce L1 norm to # be small self.L1 = abs(self.hiddenLayer.W).sum() \ + abs(self.logRegressionLayer.W).sum() # square of L2 norm ; one regularization option is to enforce # square of L2 norm to be small self.L2_sqr = (self.hiddenLayer.W ** 2).sum() \ + (self.logRegressionLayer.W ** 2).sum() # negative log likelihood of the MLP is given by the negative # log likelihood of the output of the model, computed in the # logistic regression layer self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood # same holds for the function computing the number of errors self.errors = self.logRegressionLayer.errors self.pp_errors = self.logRegressionLayer.pp_errors # the parameters of the model are the parameters of the two layer it is # made out of # compute vector of class-membership probabilities in symbolic form self.p_y_given_x = self.logRegressionLayer.p_y_given_x self.y_pred = T.argmax(self.p_y_given_x, axis=1) self.max_prob = self.p_y_given_x[T.arange(input.shape[0]),self.y_pred] self.classify = theano.function(inputs=[input], outputs=[self.y_pred, self.max_prob]) self.params = self.hiddenLayer.params + self.logRegressionLayer.params
def evaluate_lenet5(learning_rate=0.01, n_epochs=100, emb_size=40, batch_size=50, filter_size=[3,5], maxSentLen=100, hidden_size=[300,300]): model_options = locals().copy() print "model options", model_options emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/' seed=1234 np.random.seed(seed) rng = np.random.RandomState(seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) all_sentences, all_masks, all_labels, word2id=load_il6_with_BBN(maxlen=maxSentLen) #minlen, include one label, at least one word in the sentence reliefweb_all_sentences, reliefweb_all_masks, reliefweb_all_labels, word2id=load_reliefweb_as_multilabel_dataset(maxSentLen,word2id) reliefweb_all_sentences_train=np.asarray(reliefweb_all_sentences[0], dtype='int32') reliefweb_all_masks_train=np.asarray(reliefweb_all_masks[0], dtype=theano.config.floatX) reliefweb_all_labels_train=np.asarray(reliefweb_all_labels[0], dtype='int32') reliefweb_all_sentences_dev=np.asarray(reliefweb_all_sentences[1], dtype='int32') reliefweb_all_masks_dev=np.asarray(reliefweb_all_masks[1], dtype=theano.config.floatX) reliefweb_all_labels_dev=np.asarray(reliefweb_all_labels[1], dtype='int32') reliefweb_all_sentences_test=np.asarray(reliefweb_all_sentences[2], dtype='int32') reliefweb_all_masks_test=np.asarray(reliefweb_all_masks[2], dtype=theano.config.floatX) reliefweb_all_labels_test=np.asarray(reliefweb_all_labels[2], dtype='int32') train_sents=np.asarray(all_sentences[0], dtype='int32') train_masks=np.asarray(all_masks[0], dtype=theano.config.floatX) train_labels=np.asarray(all_labels[0], dtype='int32') train_sents = np.concatenate((train_sents,reliefweb_all_sentences_train,reliefweb_all_sentences_dev,reliefweb_all_sentences_test),axis=0) train_masks = np.concatenate((train_masks,reliefweb_all_masks_train,reliefweb_all_masks_dev,reliefweb_all_masks_test),axis=0) train_labels = np.concatenate((train_labels,reliefweb_all_labels_train,reliefweb_all_labels_dev,reliefweb_all_labels_test),axis=0) train_size=len(train_labels) dev_sents=np.asarray(all_sentences[1], dtype='int32') dev_masks=np.asarray(all_masks[1], dtype=theano.config.floatX) dev_labels=np.asarray(all_labels[1], dtype='int32') dev_size=len(dev_labels) test_sents=np.asarray(all_sentences[2], dtype='int32') test_masks=np.asarray(all_masks[2], dtype=theano.config.floatX) test_labels=np.asarray(all_labels[2], dtype='int32') test_size=len(test_labels) vocab_size= len(word2id)+1 # add one zero pad index rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_fasttext_word2vec_given_file([emb_root+'IL6-cca-wiki-lorelei-d40.eng.vec',emb_root+'IL6-cca-wiki-lorelei-d40.IL6.vec'], 40) rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_id_matrix=T.imatrix('sents_id_matrix') sents_mask=T.fmatrix('sents_mask') labels=T.imatrix('labels') #batch*12 ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' common_input=embeddings[sents_id_matrix.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM bow_emb = T.sum(common_input*sents_mask.dimshuffle(0,'x',1),axis=2) # bow_mean_emb = bow_emb/T.sum(sents_mask,axis=1).dimshuffle(0,'x') conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W2, conv_b2=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) NN_para = [conv_W, conv_b, conv_W2, conv_b2] conv_model = Conv_with_Mask(rng, input_tensor3=common_input, mask_matrix = sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings=conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size conv_model2 = Conv_with_Mask(rng, input_tensor3=common_input, mask_matrix = sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings2=conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size LR_input = T.concatenate([sent_embeddings,sent_embeddings2, bow_emb], axis=1) LR_input_size = hidden_size[0]*2+emb_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_a = create_ensemble_para(rng, 12, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((12,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_para=[U_a, LR_b] layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector score_matrix = T.nnet.sigmoid(layer_LR.before_softmax) #batch * 12 prob_pos = T.where( labels < 1, 1.0-score_matrix, score_matrix) loss = -T.mean(T.log(prob_pos)) # loss=layer_LR.negative_log_likelihood(labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. params = [embeddings]+NN_para+LR_para # put all model parameters together cost=loss+1e-4*((conv_W**2).sum()+(conv_W2**2).sum()) updates = Gradient_Cost_Para(cost,params, learning_rate) ''' testing ''' binarize_prob = T.where(score_matrix > 0.3, 1, 0) #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore') train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function([sents_id_matrix, sents_mask], binarize_prob, allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False n_train_batches=train_size/batch_size train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] # n_dev_batches=dev_size/batch_size # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] n_test_batches=test_size/batch_size test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size] # max_acc_dev=0.0 max_meanf1_test=0.0 max_weightf1_test=0.0 train_indices = range(train_size) cost_i=0.0 while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle(train_indices) iter_accu=0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_batch = train_indices[batch_id:batch_id+batch_size] cost_i+= train_model( train_sents[train_id_batch], train_masks[train_id_batch], train_labels[train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data if iter%20==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' past_time = time.time() error_sum=0.0 all_pred_labels = [] all_gold_labels = [] for test_batch_id in test_batch_start: # for each test batch pred_labels=test_model( test_sents[test_batch_id:test_batch_id+batch_size], test_masks[test_batch_id:test_batch_id+batch_size]) gold_labels = test_labels[test_batch_id:test_batch_id+batch_size] # print 'pred_labels:', pred_labels # print 'gold_labels;', gold_labels all_pred_labels.append(pred_labels) all_gold_labels.append(gold_labels) all_pred_labels = np.concatenate(all_pred_labels) all_gold_labels = np.concatenate(all_gold_labels) test_mean_f1, test_weight_f1 =average_f1_two_array_by_col(all_pred_labels, all_gold_labels) if test_weight_f1 > max_weightf1_test: max_weightf1_test=test_weight_f1 if test_mean_f1 > max_meanf1_test: max_meanf1_test=test_mean_f1 print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_acc_test
def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, dataset='mnist.pkl.gz', batch_size=600): """ Demonstrate stochastic gradient descent optimization of a log-linear model This is demonstrated on MNIST. :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: the path of the MNIST dataset file from http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz """ datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # generate symbolic variables for input (x and y represent a # minibatch) x = T.matrix('x') # data, presented as rasterized images y = T.ivector('y') # labels, presented as 1D vector of [int] labels # construct the logistic regression class # Each MNIST image has size 28*28 # parameterize training data set dimensions n_in = train_set_x.get_value(borrow=True).shape[1] n_out = max(train_set_y.eval()) - min(train_set_y.eval()) + 1 # print n_in, n_out classifier = LogisticRegression(input=x, n_in=n_in, n_out=n_out) # the cost we minimize during training is the negative log likelihood of # the model in symbolic format cost = classifier.negative_log_likelihood(y) # compiling a Theano function that computes the mistakes that are made by # the model on a minibatch test_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # compute the gradient of cost with respect to theta = (W,b) g_W = T.grad(cost=cost, wrt=classifier.W) g_b = T.grad(cost=cost, wrt=classifier.b) #### change learning rate ### delta = numpy.random.gamma(1) learning_rate *= delta #### end ### # start-snippet-3 # specify how to update the parameters of the model as a list of # (variable, update expression) pairs. updates = [(classifier.W, classifier.W - learning_rate * g_W), (classifier.b, classifier.b - learning_rate * g_b)] # compiling a Theano function `train_model` that returns the cost, but in # the same time updates the parameter of the model based on the rules # defined in `updates` train_model = theano.function( inputs=[index], outputs=cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) # end-snippet-3 ############### # TRAIN MODEL # ############### print '... 
training the model' # early-stopping parameters patience = 5000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf test_score = 0. start_time = time.clock() done_looping = False epoch = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): minibatch_avg_cost = train_model(minibatch_index) # iteration number iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in xrange(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) '''print( 'epoch %i, minibatch %i/%i, validation error %f %%' % ( epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100. ) )''' # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) best_validation_loss = this_validation_loss # test it on the test set test_losses = [ test_model(i) for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) '''print( ( ' epoch %i, minibatch %i/%i, test error of' ' best model %f %%' ) % ( epoch, minibatch_index + 1, n_train_batches, test_score * 100. ) )''' print test_score * 100. if patience <= iter: done_looping = True break end_time = time.clock() '''print( ( 'Optimization complete with best validation score of %f %%,' 'with test performance %f %%' ) % (best_validation_loss * 100., test_score * 100.) ) print 'The code run for %d epochs, with %f epochs/sec' % ( epoch, 1. * epoch / (end_time - start_time))''' print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.1fs' % ((end_time - start_time)))
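# Compact restatement of the early-stopping rule used in the loop above:
# patience grows whenever validation improves by more than
# improvement_threshold, and training stops once the iteration count exceeds
# patience.
def update_patience(patience, iteration, this_loss, best_loss,
                    improvement_threshold=0.995, patience_increase=2):
    if this_loss < best_loss * improvement_threshold:
        patience = max(patience, iteration * patience_increase)
    return patience, min(best_loss, this_loss)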
class DBN(object): """Deep Belief Network A deep belief network is obtained by stacking several RBMs on top of each other. The hidden layer of the RBM at layer `i` becomes the input of the RBM at layer `i+1`. The first layer RBM gets as input the input of the network, and the hidden layer of the last RBM represents the output. When used for classification, the DBN is treated as a MLP, by adding a logistic regression layer on top. """ def __init__(self, rng, n_in=784, n_hidden=[500, 500], n_out=10, lambda_reg=0.001, alpha_reg=0.001): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_in: int :param n_in: dimension of the input to the DBN :type n_hidden: list of ints :param n_hidden: intermediate layers size, must contain at least one value :type n_out: int :param n_out: dimension of the output of the network :type lambda_reg: float :param lambda_reg: paramter to control the sparsity of weights by l_1 norm. The regularization term is lambda_reg( (1-alpha_reg)/2 * ||W||_2^2 + alpha_reg ||W||_1 ). Thus, the larger lambda_reg is, the sparser the weights are. :type alpha_reg: float :param alpha_reg: paramter from interval [0,1] to control the smoothness of weights by squared l_2 norm. The regularization term is lambda_reg( (1-alpha_reg)/2 * ||W||_2^2 + alpha_reg ||W||_1 ), Thus, the smaller alpha_reg is, the smoother the weights are. """ self.hidden_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(n_hidden) assert self.n_layers > 0 # allocate symbolic variables for the data self.x = T.matrix('x') # the data, each row is a sample self.y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden units of # the layer below or the input size if we are on the first layer if i == 0: input_size = n_in else: input_size = n_hidden[i - 1] # the input to this layer is either the activation of the hidden # layer below or the input of the SdA if you are on the first # layer if i == 0: layer_input = self.x else: layer_input = self.hidden_layers[-1].output sigmoid_layer = HiddenLayer(rng=rng, input=layer_input, n_in=input_size, n_out=n_hidden[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.hidden_layers.append(sigmoid_layer) # its arguably a philosophical question... 
# but we are going to only declare that the parameters of the # sigmoid_layers are parameters of the StackedDAA # the visible biases in the dA are parameters of those # dA, but not the SdA self.params.extend(sigmoid_layer.params) # Construct an RBM that shared weights with this layer rbm_layer = RBM(numpy_rng=rng, theano_rng=None, input=layer_input, n_visible=input_size, n_hidden=n_hidden[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # We now need to add a logistic layer on top of the MLP if self.n_layers > 0: self.logRegressionLayer = LogisticRegression( input=self.hidden_layers[-1].output, n_in=n_hidden[-1], n_out=n_out) else: self.logRegressionLayer = LogisticRegression(input=self.x, n_in=input_size, n_out=n_out) self.params.extend(self.logRegressionLayer.params) # regularization L1s = [] L2_sqrs = [] for i in range(self.n_layers): L1s.append(abs(self.hidden_layers[i].W).sum()) L2_sqrs.append((self.hidden_layers[i].W**2).sum()) L1s.append(abs(self.logRegressionLayer.W).sum()) L2_sqrs.append((self.logRegressionLayer.W**2).sum()) self.L1 = T.sum(L1s) self.L2_sqr = T.sum(L2_sqrs) # compute the cost for second phase of training, # defined as the negative log likelihood self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood( self.y) self.cost=self.negative_log_likelihood + \ lambda_reg * ( (1.0-alpha_reg)*0.5* self.L2_sqr + alpha_reg*self.L1) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logRegressionLayer.errors(self.y) self.y_pred = self.logRegressionLayer.y_pred def pretraining_functions(self, train_set_x, batch_size, persistent_k=15): ''' Build the symbolic pretraining functions to update the parameter in one iteration. ''' # index to a [mini]batch index = T.lscalar('index') # index to a minibatch learning_rate = T.scalar('learning_rate') # learning rate to use # number of batches #n_batches = int(math.ceil(train_set_x.get_value(borrow=True).shape[0] / batch_size)) # begining of a batch, given `index` batch_begin = index * batch_size # ending of a batch given `index` batch_end = batch_begin + batch_size pretrain_fns = [] for rbm_layer in self.rbm_layers: # get the cost and the updates list cost, updates = rbm_layer.get_cost_updates(learning_rate, persistent=None, k=persistent_k) # compile the theano function fn = theano.function( inputs=[index, theano.Param(learning_rate, default=0.1)], outputs=cost, updates=updates, givens={self.x: train_set_x[batch_begin:batch_end]}) # append `fn` to the list of functions pretrain_fns.append(fn) return pretrain_fns def build_finetune_functions(self, train_set_x, train_set_y, valid_set_x, valid_set_y, batch_size, learning_rate_shared): ''' Build the symbolic finetuning functions to update the parameters in one iteration. Validation function is also defined. 
''' # compute number of minibatches for training, validation and testing n_valid_batches = int( math.ceil( valid_set_x.get_value(borrow=True).shape[0] / batch_size)) index = T.lscalar('index') # index to a [mini]batch # compute the gradients with respect to the model parameters gparams = T.grad(self.cost, self.params) # compute list of fine-tuning updates updates = [] for param, gparam in zip(self.params, gparams): updates.append((param, param - gparam * learning_rate_shared)) train_fn = theano.function( inputs=[index], outputs=self.cost, updates=updates, givens={ self.x: train_set_x[index * batch_size:(index + 1) * batch_size], self.y: train_set_y[index * batch_size:(index + 1) * batch_size] }, name='train') # test_score_i = theano.function([index], self.errors, # givens={ # self.x: test_set_x[index * batch_size: # (index + 1) * batch_size], # self.y: test_set_y[index * batch_size: # (index + 1) * batch_size]}, # name='test') valid_score_i = theano.function( [index], self.errors, givens={ self.x: valid_set_x[index * batch_size:(index + 1) * batch_size], self.y: valid_set_y[index * batch_size:(index + 1) * batch_size] }, name='valid') # Create a function that scans the entire validation set def valid_score(): return [valid_score_i(i) for i in xrange(n_valid_batches)] # Create a function that scans the entire test set # def test_score(): # return [test_score_i(i) for i in xrange(n_test_batches)] return train_fn, valid_score def build_test_function(self, test_set_x, batch_size): """ Build the symbolic test function. """ n_test_batches = int( math.ceil(test_set_x.get_value(borrow=True).shape[0] / batch_size)) index = T.lscalar('index') # index to a [mini]batch test_score_i = theano.function( [index], self.y_pred, givens={ self.x: test_set_x[index * batch_size:(index + 1) * batch_size] }, name='test') # Create a function that scans the entire test set def test_score(): y_pred = [] for i in xrange(n_test_batches): y_pred.extend(test_score_i(i)) return y_pred return test_score def get_params(self): return copy.deepcopy(self.params) def set_params(self, given_params): self.params = given_params def print_params(self): for param in self.params: print param.get_value(borrow=True) def save_params(self, filename): f = open(filename, 'w') # remove existing file f.close() f = open(filename, 'a') for param in self.params: pickle.dump(param.get_value(borrow=True), f) f.close()
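# Usage sketch (added for illustration, not part of the original file). It
# assumes `datasets` holds (train, valid, test) pairs of Theano shared
# variables, e.g. from the `load_data` helper used elsewhere in this
# collection; the DBN class above is used exactly as defined.
import numpy
import theano

def train_dbn_sketch(datasets, batch_size=100, pretraining_epochs=10,
                     training_epochs=50, pretrain_lr=0.01, finetune_lr=0.1):
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    rng = numpy.random.RandomState(1234)
    dbn = DBN(rng=rng, n_in=784, n_hidden=[500, 500], n_out=10,
              lambda_reg=0.001, alpha_reg=0.001)

    # unsupervised, layer-wise pretraining: one CD-k function per RBM
    pretrain_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                             batch_size=batch_size,
                                             persistent_k=15)
    for i in xrange(dbn.n_layers):
        for epoch in xrange(pretraining_epochs):
            c = [pretrain_fns[i](index=b, learning_rate=pretrain_lr)
                 for b in xrange(n_train_batches)]
            print 'pretraining layer %i, epoch %i, cost %f' % (i, epoch, numpy.mean(c))

    # supervised finetuning of the whole stack with a shared learning rate
    lr_shared = theano.shared(numpy.asarray(finetune_lr, dtype=theano.config.floatX))
    train_fn, valid_score = dbn.build_finetune_functions(
        train_set_x, train_set_y, valid_set_x, valid_set_y, batch_size, lr_shared)
    for epoch in xrange(training_epochs):
        for b in xrange(n_train_batches):
            train_fn(b)
        print 'epoch %i, validation error %f %%' % (epoch, numpy.mean(valid_score()) * 100.)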
def __init__(self, ds=None, nkerns=[32, 48], batch_size=100, normalized_width=0, distortion=0, params=[None, None,None, None,None, None,None, None]): #layers for train: #layer3_W, layer3_b on L1-Convolutional Layer with 20 by 26x26 #layer2_W, layer2_b on L2-Convolutional Layer with 40 by 9x9 #layer1_W, layer1_b on HiddenLayer - fully-connected #layer0_W, layer0_b on LogisticRegression - output layer layer3_W, layer3_b, layer2_W, layer2_b, layer1_W, layer1_b, layer0_W, layer0_b = params rng = numpy.random.RandomState(23455) #dataset by param - the load data was executed before on function call train_set_x, train_set_y = ds[0] valid_set_x, valid_set_y = ds[1] test_set_x, test_set_y = ds[2] # compute number of minibatches for training, validation and testing self.n_train_batches = train_set_x.get_value(borrow=True).shape[0] self.n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] self.n_test_batches = test_set_x.get_value(borrow=True).shape[0] self.n_train_batches /= batch_size self.n_valid_batches /= batch_size self.n_test_batches /= batch_size index = T.lscalar() learning_rate = T.fscalar() # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print '... building the dnn column' layer0_input = x.reshape((batch_size, 1, 29, 29)) # Construct the first convolutional pooling layer layer0 = LeNetConvPoolLayer( rng, input=layer0_input, image_shape=(batch_size, 1, 29, 29), filter_shape=(nkerns[0], 1, 4, 4), poolsize=(2, 2), W=layer0_W, b=layer0_b ) # Construct the second convolutional pooling layer layer1 = LeNetConvPoolLayer( rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 13, 13), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(3, 3), W=layer1_W, b=layer1_b ) layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer( rng, input=layer2_input, n_in=nkerns[1] * 3 * 3, n_out=150, W=layer2_W, b=layer2_b, activation=T.tanh ) # contruct the output layer # classification of values from fully-connected sigmoidal layer layer3 = LogisticRegression( input=layer2.output, n_in=150, n_out=10, W=layer3_W, b=layer3_b ) cost = layer3.negative_log_likelihood(y) # compute the mistakes that are made by the model self.test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size] } ) # compute probabilities of all output classes - on validation set self.valid_output_batch = theano.function( [index], layer3.p_y_given_x, givens={ x: valid_set_x[index * batch_size: (index + 1) * batch_size] } ) # compute probabilities of all output classes self.test_output_batch = theano.function( [index], layer3.p_y_given_x, givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size] } ) # compute the mistakes on validate set that are made by model self.validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size: (index + 1) * batch_size] } ) self.params = layer3.params + layer2.params + layer1.params + layer0.params # save the params to use for reference on test set # those will be use on test_mcdnn self.column_params = [nkerns, batch_size, normalized_width, distortion] grads = T.grad(cost, self.params) updates = [ (param_i, param_i - learning_rate * 
grad_i) for param_i, grad_i in zip(self.params, grads)
        ]

        # train the model
        self.train_model = theano.function(
            [index, learning_rate],
            cost,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size: (index + 1) * batch_size],
                y: train_set_y[index * batch_size: (index + 1) * batch_size]
            }
        )
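# Shape check (illustrative sketch, not in the original file): the hard-coded
# image_shape / n_in values in the column above follow from a 'valid'
# convolution (size - filter + 1) followed by non-overlapping max-pooling.
def conv_pool_out(size, filter_size, pool_size):
    # output side length of one LeNetConvPoolLayer along one dimension
    return (size - filter_size + 1) // pool_size

side = conv_pool_out(29, 4, 2)     # (29 - 4 + 1) / 2 = 13  -> layer1 image_shape
side = conv_pool_out(side, 5, 3)   # (13 - 5 + 1) / 3 = 3   -> layer2 n_in = nkerns[1] * 3 * 3
print side                         # 3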
def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type hidden_layers_sizes: list of ints :param hidden_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector # of [int] labels # end-snippet-1 # The DBN is an MLP, for which all weights of intermediate # layers are shared with a different RBM. We will first # construct the DBN as a deep multilayer perceptron, and when # constructing each sigmoidal layer we also construct an RBM # that shares weights with that layer. During pretraining we # will train these RBMs (which will lead to chainging the # weights of the MLP as well) During finetuning we will finish # training the DBN by doing stochastic gradient descent on the # MLP. for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden # units of the layer below or the input size if we are on # the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the # hidden layer below or the input of the DBN if you are on # the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... but we are # going to only declare that the parameters of the # sigmoid_layers are parameters of the DBN. The visible # biases in the RBM are parameters of those RBMs, but not # of the DBN. self.params.extend(sigmoid_layer.params) # Construct an RBM that shared weights with this layer rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) # compute the cost for second phase of training, defined as the # negative log likelihood of the logistic regression (output) layer self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y)
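# Illustration (hedged sketch, not from the original file): each RBM above is
# built with W=sigmoid_layer.W and hbias=sigmoid_layer.b, so RBM pretraining
# updates the very same shared variables that the MLP later finetunes.
# Assuming this __init__ belongs to a DBN class as in the Theano tutorial, and
# that the RBM stores its weights and hidden bias as .W and .hbias:
import numpy

numpy_rng = numpy.random.RandomState(123)
dbn = DBN(numpy_rng=numpy_rng, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10)
assert dbn.rbm_layers[0].W is dbn.sigmoid_layers[0].W       # same shared weight matrix
assert dbn.rbm_layers[0].hbias is dbn.sigmoid_layers[0].b   # same shared hidden bias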
def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset="mnist.pkl.gz", nkerns=[20, 50], batch_size=500): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(4676148) ########################## # OUR DATA LOADING ########################## train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] width = 300 height = 200 # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 x = T.matrix("x") # the data is presented as rasterized images y = T.ivector("y") # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print "... building the model" # Reshape matrix of rasterized images of shape (batch_size, 28 * 28) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (28, 28) is the size of MNIST images. layer0_input = x.reshape((batch_size, 1, width, height)) # CHANGED # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) # so then for us: # height-pool+1, so 300-5+1, 200-5+1 = 296,196 # and then / poolsize, so we increase it to 4, and 74,49 layer0 = LeNetConvPoolLayer( rng, input=layer0_input, image_shape=(batch_size, 1, width, height), # CHANGED filter_shape=(nkerns[0], 1, 5, 5), poolsize=(4, 4), # CHANGED ) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) # math for us: # -6+1 en -6+1 gives us 70, 44 # / 2 = gives us 16, 11 layer1 = LeNetConvPoolLayer( rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 74, 49), # CHANGED filter_shape=(nkerns[1], nkerns[0], 6, 6), # CHANGED +1 poolsize=(4, 4), # CHANGED ) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), # or (500, 50 * 4 * 4) = (500, 800) with the default values. layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer( rng, input=layer2_input, n_in=nkerns[1] * 16 * 11, n_out=500, activation=T.tanh # CHANGED # CHANGE? 
) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=4544) # CHANGED # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size : (index + 1) * batch_size], y: test_set_y[index * batch_size : (index + 1) * batch_size], }, ) validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size : (index + 1) * batch_size], y: valid_set_y[index * batch_size : (index + 1) * batch_size], }, ) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)] train_model = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[index * batch_size : (index + 1) * batch_size], y: train_set_y[index * batch_size : (index + 1) * batch_size], }, ) # end-snippet-1 ############### # TRAIN MODEL # ############### print "... training" # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0.0 start_time = timeit.default_timer() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print "training @ iter = ", iter cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) print ( "epoch %i, minibatch %i/%i, validation error %f %%" % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.0) ) # if we got the best validation score until now if this_validation_loss < best_validation_loss: # improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [test_model(i) for i in xrange(n_test_batches)] test_score = numpy.mean(test_losses) print ( (" epoch %i, minibatch %i/%i, test error of " "best model %f %%") % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.0) ) if patience <= iter: done_looping = True break end_time = timeit.default_timer() print ("Optimization 
complete.") print ( "Best validation score of %f %% obtained at iteration %i, " "with test performance %f %%" % (best_validation_loss * 100.0, best_iter + 1, test_score * 100.0) ) print >> sys.stderr, ( "The code for file " + os.path.split(__file__)[1] + " ran for %.2fm" % ((end_time - start_time) / 60.0) )
def __init__(self, numpy_rng=None, theano_rng=None, hidden_layers_sizes=[10], finetune_lr=1, pretraining_epochs=30, pretrain_lr=.1, k=1, training_epochs=500, pickle_dataset='numerai.pkl.gz', batch_size=100, L1=0.0000, L2=0.0000, activation=T.nnet.sigmoid, patience=100, patience_increase=20, rand_seed=8675309, verbose=False, enforce_train_supremacy=True): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type hidden_layers_sizes: list of ints :param hidden_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) self.rand_seed = rand_seed if not numpy_rng: self.numpy_rng = numpy.random.RandomState(self.rand_seed) if not theano_rng: self.theano_rng = MRG_RandomStreams(self.rand_seed) self.hidden_layers_sizes = hidden_layers_sizes self.finetune_lr = finetune_lr self.pretraining_epochs = pretraining_epochs self.pretrain_lr = pretrain_lr self.k = k self.training_epochs = training_epochs self.pickle_dataset = pickle_dataset self.batch_size = batch_size self.L1 = L1 self.L2 = L2 self.activation = activation self.patience = patience self.patience_increase = patience_increase self.verbose = verbose self.enforce_train_supremacy = enforce_train_supremacy self.train_set_x, self.train_set_y, self.valid_set_x, self.valid_set_y, self.test_set_x, self.test_set_y, self.score_set_x = self.load_dataset( self.pickle_dataset) assert self.n_layers > 0 self.n_outs = int(max(self.train_set_y.eval()) + 1) self.n_ins = int(self.train_set_x.get_value(borrow=True).shape[1]) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector # of [int] labels # end-snippet-1 # The DBN is an MLP, for which all weights of intermediate # layers are shared with a different RBM. We will first # construct the DBN as a deep multilayer perceptron, and when # constructing each sigmoidal layer we also construct an RBM # that shares weights with that layer. During pretraining we # will train these RBMs (which will lead to chainging the # weights of the MLP as well) During finetuning we will finish # training the DBN by doing stochastic gradient descent on the # MLP. for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden # units of the layer below or the input size if we are on # the first layer if i == 0: input_size = self.n_ins else: input_size = self.hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the # hidden layer below or the input of the DBN if you are on # the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=self.numpy_rng, input=layer_input, n_in=input_size, n_out=self.hidden_layers_sizes[i], activation=self.activation) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... 
but we are # going to only declare that the parameters of the # sigmoid_layers are parameters of the DBN. The visible # biases in the RBM are parameters of those RBMs, but not # of the DBN. self.params.extend(sigmoid_layer.params) # Construct an RBM that shared weights with this layer rbm_layer = RBM(numpy_rng=self.numpy_rng, theano_rng=self.theano_rng, input=layer_input, n_visible=input_size, n_hidden=self.hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs) self.params.extend(self.logLayer.params) # L1 norm ; one regularization option is to enforce L1 norm to # be small L1_cost = 0 L2_cost = 0 for layer in self.sigmoid_layers: L1_cost += abs(layer.W).sum() L2_cost += abs(layer.W**2).sum() L1_cost += abs(self.logLayer.W).sum() L2_cost += abs(self.logLayer.W**2).sum() # compute the cost for second phase of training, defined as the # negative log likelihood of the logistic regression (output) layer self.finetune_cost = self.logLayer.negative_log_likelihood( self.y) + (self.L1 * L1_cost) + (self.L2 * L2_cost) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y)
class DBN(object): """Deep Belief Network ADAPTED FROM BELOW BY RYAN SCHORK A deep belief network is obtained by stacking several RBMs on top of each other. The hidden layer of the RBM at layer `i` becomes the input of the RBM at layer `i+1`. The first layer RBM gets as input the input of the network, and the hidden layer of the last RBM represents the output. When used for classification, the DBN is treated as a MLP, by adding a logistic regression layer on top. """ def __init__(self, numpy_rng=None, theano_rng=None, hidden_layers_sizes=[10], finetune_lr=1, pretraining_epochs=30, pretrain_lr=.1, k=1, training_epochs=500, pickle_dataset='numerai.pkl.gz', batch_size=100, L1=0.0000, L2=0.0000, activation=T.nnet.sigmoid, patience=100, patience_increase=20, rand_seed=8675309, verbose=False, enforce_train_supremacy=True): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type hidden_layers_sizes: list of ints :param hidden_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) self.rand_seed = rand_seed if not numpy_rng: self.numpy_rng = numpy.random.RandomState(self.rand_seed) if not theano_rng: self.theano_rng = MRG_RandomStreams(self.rand_seed) self.hidden_layers_sizes = hidden_layers_sizes self.finetune_lr = finetune_lr self.pretraining_epochs = pretraining_epochs self.pretrain_lr = pretrain_lr self.k = k self.training_epochs = training_epochs self.pickle_dataset = pickle_dataset self.batch_size = batch_size self.L1 = L1 self.L2 = L2 self.activation = activation self.patience = patience self.patience_increase = patience_increase self.verbose = verbose self.enforce_train_supremacy = enforce_train_supremacy self.train_set_x, self.train_set_y, self.valid_set_x, self.valid_set_y, self.test_set_x, self.test_set_y, self.score_set_x = self.load_dataset( self.pickle_dataset) assert self.n_layers > 0 self.n_outs = int(max(self.train_set_y.eval()) + 1) self.n_ins = int(self.train_set_x.get_value(borrow=True).shape[1]) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector # of [int] labels # end-snippet-1 # The DBN is an MLP, for which all weights of intermediate # layers are shared with a different RBM. We will first # construct the DBN as a deep multilayer perceptron, and when # constructing each sigmoidal layer we also construct an RBM # that shares weights with that layer. During pretraining we # will train these RBMs (which will lead to chainging the # weights of the MLP as well) During finetuning we will finish # training the DBN by doing stochastic gradient descent on the # MLP. 
for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden # units of the layer below or the input size if we are on # the first layer if i == 0: input_size = self.n_ins else: input_size = self.hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the # hidden layer below or the input of the DBN if you are on # the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=self.numpy_rng, input=layer_input, n_in=input_size, n_out=self.hidden_layers_sizes[i], activation=self.activation) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... but we are # going to only declare that the parameters of the # sigmoid_layers are parameters of the DBN. The visible # biases in the RBM are parameters of those RBMs, but not # of the DBN. self.params.extend(sigmoid_layer.params) # Construct an RBM that shared weights with this layer rbm_layer = RBM(numpy_rng=self.numpy_rng, theano_rng=self.theano_rng, input=layer_input, n_visible=input_size, n_hidden=self.hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs) self.params.extend(self.logLayer.params) # L1 norm ; one regularization option is to enforce L1 norm to # be small L1_cost = 0 L2_cost = 0 for layer in self.sigmoid_layers: L1_cost += abs(layer.W).sum() L2_cost += abs(layer.W**2).sum() L1_cost += abs(self.logLayer.W).sum() L2_cost += abs(self.logLayer.W**2).sum() # compute the cost for second phase of training, defined as the # negative log likelihood of the logistic regression (output) layer self.finetune_cost = self.logLayer.negative_log_likelihood( self.y) + (self.L1 * L1_cost) + (self.L2 * L2_cost) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y) def load_dataset(self, dataset): # Load the dataset import gzip import cPickle f = gzip.open(dataset, 'rb') train_set, valid_set, test_set, score_set = cPickle.load(f) f.close() #train_set, valid_set, test_set format: tuple(input, target) #input is an numpy.ndarray of 2 dimensions (a matrix) #witch row's correspond to an example. target is a #numpy.ndarray of 1 dimensions (vector)) that have the same length as #the number of rows in the input. It should give the target #target to the example with the same index in the input. def shared_dataset(data_xy, borrow=True): """ Function that loads the dataset into shared variables The reason we store our dataset in shared variables is to allow Theano to copy it into the GPU memory (when code is run on GPU). Since copying data into the GPU is slow, copying a minibatch everytime is needed (the default behaviour if the data is not in a shared variable) would lead to a large decrease in performance. """ data_x, data_y = data_xy shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX), borrow=borrow) shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX), borrow=borrow) # When storing data on the GPU it has to be stored as floats # therefore we will store the labels as ``floatX`` as well # (``shared_y`` does exactly that). 
But during our computations # we need them as ints (we use labels as index, and if they are # floats it doesn't make sense) therefore instead of returning # ``shared_y`` we will have to cast it to int. This little hack # lets ous get around this issue return shared_x, T.cast(shared_y, 'int32') test_set_x, test_set_y = shared_dataset(test_set) valid_set_x, valid_set_y = shared_dataset(valid_set) train_set_x, train_set_y = shared_dataset(train_set) score_set_x = theano.shared(numpy.asarray(score_set, dtype=theano.config.floatX), borrow=True) return [ train_set_x, train_set_y, valid_set_x, valid_set_y, test_set_x, test_set_y, score_set_x ] def pretraining_functions(self): '''Generates a list of functions, for performing one step of gradient descent at a given layer. The function will require as input the minibatch index, and to train an RBM you just need to iterate, calling the corresponding function on all minibatch indexes. :type train_set_x: theano.tensor.TensorType :param train_set_x: Shared var. that contains all datapoints used for training the RBM :type batch_size: int :param batch_size: size of a [mini]batch :param k: number of Gibbs steps to do in CD-k / PCD-k ''' # index to a [mini]batch index = T.lscalar('index') # index to a minibatch learning_rate = T.scalar('lr') # learning rate to use # number of batches n_batches = self.train_set_x.get_value( borrow=True).shape[0] / self.batch_size # begining of a batch, given `index` batch_begin = index * self.batch_size # ending of a batch given `index` batch_end = batch_begin + self.batch_size pretrain_fns = [] for rbm in self.rbm_layers: # get the cost and the updates list # using CD-k here (persisent=None) for training each RBM. # TODO: change cost function to reconstruction error cost, updates = rbm.get_cost_updates(learning_rate, persistent=None, k=self.k) # compile the theano function fn = theano.function( inputs=[index, theano.Param(learning_rate, default=0.1)], outputs=cost, updates=updates, givens={self.x: self.train_set_x[batch_begin:batch_end]}) # append `fn` to the list of functions pretrain_fns.append(fn) return pretrain_fns def build_finetune_functions(self): '''Generates a function `train` that implements one step of finetuning, a function `validate` that computes the error on a batch from the validation set, and a function `test` that computes the error on a batch from the testing set :type datasets: list of pairs of theano.tensor.TensorType :param datasets: It is a list that contain all the datasets; the has to contain three pairs, `train`, `valid`, `test` in this order, where each pair is formed of two Theano variables, one for the datapoints, the other for the labels :type batch_size: int :param batch_size: size of a minibatch :type learning_rate: float :param learning_rate: learning rate used during finetune stage ''' # compute number of minibatches for training, validation and testing n_valid_batches = self.valid_set_x.get_value(borrow=True).shape[0] n_valid_batches /= self.batch_size n_test_batches = self.test_set_x.get_value(borrow=True).shape[0] n_test_batches /= self.batch_size num_train_batches = self.train_set_x.get_value(borrow=True).shape[0] num_train_batches /= self.batch_size index = T.lscalar('index') # index to a [mini]batch # compute the gradients with respect to the model parameters gparams = T.grad(self.finetune_cost, self.params) # compute list of fine-tuning updates updates = [] for param, gparam in zip(self.params, gparams): updates.append((param, param - gparam * self.finetune_lr)) train_fn = 
theano.function( inputs=[index], outputs=self.finetune_cost, updates=updates, givens={ self.x: self.train_set_x[index * self.batch_size:(index + 1) * self.batch_size], self.y: self.train_set_y[index * self.batch_size:(index + 1) * self.batch_size] }) train_score_i = theano.function( [index], self.errors, givens={ self.x: self.train_set_x[index * self.batch_size:(index + 1) * self.batch_size], self.y: self.train_set_y[index * self.batch_size:(index + 1) * self.batch_size] }) test_score_i = theano.function( [index], self.errors, givens={ self.x: self.test_set_x[index * self.batch_size:(index + 1) * self.batch_size], self.y: self.test_set_y[index * self.batch_size:(index + 1) * self.batch_size] }) valid_score_i = theano.function( [index], self.errors, givens={ self.x: self.valid_set_x[index * self.batch_size:(index + 1) * self.batch_size], self.y: self.valid_set_y[index * self.batch_size:(index + 1) * self.batch_size] }) score_score_raw = theano.function([], self.logLayer.p_y_given_x, givens={self.x: self.score_set_x}) score_score_outcome = theano.function( [], self.logLayer.y_pred, givens={self.x: self.score_set_x}) score_train_raw = theano.function([], self.logLayer.p_y_given_x, givens={self.x: self.train_set_x}) score_train_outcome = theano.function( [], self.logLayer.y_pred, givens={self.x: self.train_set_x}) # Create a function that scans the entire training set def train_score(): return [train_score_i(i) for i in xrange(num_train_batches)] # Create a function that scans the entire validation set def valid_score(): return [valid_score_i(i) for i in xrange(n_valid_batches)] # Create a function that scans the entire test set def test_score(): return [test_score_i(i) for i in xrange(n_test_batches)] def make_predictions(): return score_score_raw(), score_score_outcome(), score_train_raw( ), score_train_outcome() return train_fn, train_score, valid_score, test_score, make_predictions def save_predictions_raw(self): if type(self.raw_predictions) != numpy.ndarray: print 'Train the model before saving predictions!!' else: print 'saving file ' + 'DBN_' + self.pickle_dataset.strip( '.gz').strip('.pkl') + '_' + str( self.rand_seed) + '_' + 'pred.csv' numpy.savetxt( 'DBN_' + self.pickle_dataset.strip('.gz').strip('.pkl') + '_' + str(self.rand_seed) + '_' + 'pred_raw.csv', self.raw_predictions, delimiter=",") def save_predictions_outcome(self): if type(self.outcome_predictions) != numpy.ndarray: print 'Train the model before saving predictions!!' else: print 'saving file ' + 'DBN_' + self.pickle_dataset.strip( '.gz').strip('.pkl') + '_' + str( self.rand_seed) + '_' + 'pred.csv' numpy.savetxt( 'DBN_' + self.pickle_dataset.strip('.gz').strip('.pkl') + '_' + str(self.rand_seed) + '_' + 'pred_outcome.csv', self.outcome_predictions, delimiter=",") def train(self): """ Demonstrates how to train and test a Deep Belief Network. This is demonstrated on MNIST. 
:type finetune_lr: float :param finetune_lr: learning rate used in the finetune stage :type pretraining_epochs: int :param pretraining_epochs: number of epoch to do pretraining :type pretrain_lr: float :param pretrain_lr: learning rate to be used during pre-training :type k: int :param k: number of Gibbs steps in CD/PCD :type training_epochs: int :param training_epochs: maximal number of iterations ot run the optimizer :type dataset: string :param dataset: path the the pickled dataset :type batch_size: int :param batch_size: the size of a minibatch """ # compute number of minibatches for training, validation and testing n_train_batches = self.train_set_x.get_value( borrow=True).shape[0] / self.batch_size # numpy random generator print '... building the model' # construct the Deep Belief Network # start-snippet-2 ######################### # PRETRAINING THE MODEL # ######################### print '... getting the pretraining functions' pretraining_fns = self.pretraining_functions() print '... pre-training the model' start_time = timeit.default_timer() ## Pre-train layer-wise for i in xrange(self.n_layers): # go through pretraining epochs for epoch in xrange(self.pretraining_epochs): # go through the training set c = [] for batch_index in xrange(n_train_batches): c.append(pretraining_fns[i](index=batch_index, lr=self.pretrain_lr)) if self.verbose: print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch), print numpy.mean(c) end_time = timeit.default_timer() # end-snippet-2 print >> sys.stderr, ('The pretraining code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) ######################## # FINETUNING THE MODEL # ######################## # get the training, validation and testing function for the model print '... getting the finetuning functions' train_fn, train_model, validate_model, test_model, score_model = self.build_finetune_functions( ) print '... 
finetuning the model' # early-stopping parameters patience = self.patience * n_train_batches # look as this many examples regardless patience_increase = float( self.patience_increase) # wait this much longer when a new best is # found improvement_threshold = 0.9995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatches before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf start_time = timeit.default_timer() done_looping = False epoch = 0 self.raw_predictions = None self.outcome_predictions = None self.raw_predictions_train = None self.outcome_predictions_train = None while (epoch < self.training_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): minibatch_avg_cost = train_fn(minibatch_index) iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: train_losses = train_model() this_training_loss = numpy.mean(train_losses) test_losses = test_model() this_testing_loss = numpy.mean(test_losses) validation_losses = validate_model() this_validation_loss = numpy.mean(validation_losses) if self.verbose: print( 'epoch {}, minibatch {}/{}, training error {:.2f} %, validation error {:.2f} %, testing error {:.2f} %' .format(epoch, minibatch_index + 1, n_train_batches, this_training_loss * 100., this_validation_loss * 100., this_testing_loss * 100.)) # if we got the best validation score until now train_gate = True valid_gate = ( best_validation_loss - this_validation_loss > 0.01) or ((abs(this_validation_loss - best_validation_loss) < .0000001) and (this_training_loss < corr_training_loss)) if self.enforce_train_supremacy: train_gate = this_training_loss < this_validation_loss if valid_gate and train_gate: #improve patience if loss improvement is good enough if (this_validation_loss < best_validation_loss): patience = max(patience, iter * patience_increase) # save best validation score and iteration number corr_testing_loss = this_testing_loss best_validation_loss = this_validation_loss corr_training_loss = this_training_loss best_epoch = epoch self.raw_predictions, self.outcome_predictions, self.raw_predictions_train, self.outcome_predictions_train = score_model( ) best_iter = iter # for element in dbn.params: # print element.get_value() print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, corr_testing_loss * 100.)) if patience <= iter: done_looping = True break end_time = timeit.default_timer() print(('Optimization complete with best validation score of %.2f %%, ' 'obtained at epoch %i iteration %i, ' 'with test performance %.2f %% ' 'with training performance %.2f %% ') % (best_validation_loss * 100., best_epoch, best_iter + 1, corr_testing_loss * 100., corr_training_loss * 100.)) print >> sys.stderr, ('The fine tuning code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
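# Usage sketch (illustrative, not part of the original file): this DBN class
# wires up its own data loading, pretraining and finetuning, so a run reduces
# to constructing it and calling train(). The dataset path and layer sizes
# below are placeholders.
dbn = DBN(hidden_layers_sizes=[50, 25],
          pretraining_epochs=10,
          training_epochs=200,
          pickle_dataset='numerai.pkl.gz',
          batch_size=100,
          verbose=True)
dbn.train()                      # layer-wise CD-k pretraining, then finetuning with early stopping
dbn.save_predictions_raw()       # per-class probabilities for the score set -> *_pred_raw.csv
dbn.save_predictions_outcome()   # hard class predictions for the score set -> *_pred_outcome.csv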
def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10, corruption_levels=[0.1, 0.1]): """ This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the sdA :type n_layers_sizes: list of ints :param n_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network :type corruption_levels: list of float :param corruption_levels: amount of corruption to use for each layer """ self.sigmoid_layers = [] self.dA_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2**30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels # end-snippet-1 # The SdA is an MLP, for which all weights of intermediate layers # are shared with a different denoising autoencoders # We will first construct the SdA as a deep multilayer perceptron, # and when constructing each sigmoidal layer we also construct a # denoising autoencoder that shares weights with that layer # During pretraining we will train these autoencoders (which will # lead to chainging the weights of the MLP as well) # During finetunining we will finish training the SdA by doing # stochastich gradient descent on the MLP # start-snippet-2 for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden units of # the layer below or the input size if we are on the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the hidden # layer below or the input of the SdA if you are on the first # layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... 
# but we are going to only declare that the parameters of the # sigmoid_layers are parameters of the StackedDAA # the visible biases in the dA are parameters of those # dA, but not the SdA self.params.extend(sigmoid_layer.params) # Construct a denoising autoencoder that shared weights with this # layer dA_layer = dA(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, bhid=sigmoid_layer.b) self.dA_layers.append(dA_layer) # end-snippet-2 # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) # construct a function that implements one step of finetunining # compute the cost for second phase of training, # defined as the negative log likelihood self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y)
class SdA(object): """Stacked denoising auto-encoder class (SdA) A stacked denoising autoencoder model is obtained by stacking several dAs. The hidden layer of the dA at layer `i` becomes the input of the dA at layer `i+1`. The first layer dA gets as input the input of the SdA, and the hidden layer of the last dA represents the output. Note that after pretraining, the SdA is dealt with as a normal MLP, the dAs are only used to initialize the weights. """ def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10, corruption_levels=[0.1, 0.1]): """ This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the sdA :type n_layers_sizes: list of ints :param n_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network :type corruption_levels: list of float :param corruption_levels: amount of corruption to use for each layer """ self.sigmoid_layers = [] self.dA_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2**30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels # end-snippet-1 # The SdA is an MLP, for which all weights of intermediate layers # are shared with a different denoising autoencoders # We will first construct the SdA as a deep multilayer perceptron, # and when constructing each sigmoidal layer we also construct a # denoising autoencoder that shares weights with that layer # During pretraining we will train these autoencoders (which will # lead to chainging the weights of the MLP as well) # During finetunining we will finish training the SdA by doing # stochastich gradient descent on the MLP # start-snippet-2 for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden units of # the layer below or the input size if we are on the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the hidden # layer below or the input of the SdA if you are on the first # layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... 
# but we are going to only declare that the parameters of the # sigmoid_layers are parameters of the StackedDAA # the visible biases in the dA are parameters of those # dA, but not the SdA self.params.extend(sigmoid_layer.params) # Construct a denoising autoencoder that shared weights with this # layer dA_layer = dA(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, bhid=sigmoid_layer.b) self.dA_layers.append(dA_layer) # end-snippet-2 # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) # construct a function that implements one step of finetunining # compute the cost for second phase of training, # defined as the negative log likelihood self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y) def pretraining_functions(self, train_set_x, batch_size): ''' Generates a list of functions, each of them implementing one step in trainnig the dA corresponding to the layer with same index. The function will require as input the minibatch index, and to train a dA you just need to iterate, calling the corresponding function on all minibatch indexes. :type train_set_x: theano.tensor.TensorType :param train_set_x: Shared variable that contains all datapoints used for training the dA :type batch_size: int :param batch_size: size of a [mini]batch :type learning_rate: float :param learning_rate: learning rate used during training for any of the dA layers ''' # index to a [mini]batch index = T.lscalar('index') # index to a minibatch corruption_level = T.scalar('corruption') # % of corruption to use learning_rate = T.scalar('lr') # learning rate to use # begining of a batch, given `index` batch_begin = index * batch_size # ending of a batch given `index` batch_end = batch_begin + batch_size pretrain_fns = [] for dA in self.dA_layers: # get the cost and the updates list cost, updates = dA.get_cost_updates(corruption_level, learning_rate) # compile the theano function fn = theano.function( inputs=[ index, theano.Param(corruption_level, default=0.2), theano.Param(learning_rate, default=0.1) ], outputs=cost, updates=updates, givens={self.x: train_set_x[batch_begin:batch_end]}) # append `fn` to the list of functions pretrain_fns.append(fn) return pretrain_fns def build_finetune_functions(self, datasets, batch_size, learning_rate): '''Generates a function `train` that implements one step of finetuning, a function `validate` that computes the error on a batch from the validation set, and a function `test` that computes the error on a batch from the testing set :type datasets: list of pairs of theano.tensor.TensorType :param datasets: It is a list that contain all the datasets; the has to contain three pairs, `train`, `valid`, `test` in this order, where each pair is formed of two Theano variables, one for the datapoints, the other for the labels :type batch_size: int :param batch_size: size of a minibatch :type learning_rate: float :param learning_rate: learning rate used during finetune stage ''' (train_set_x, train_set_y) = datasets[0] (valid_set_x, valid_set_y) = datasets[1] (test_set_x, test_set_y) = datasets[2] # compute number of minibatches for 
training, validation and testing n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_valid_batches /= batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_test_batches /= batch_size index = T.lscalar('index') # index to a [mini]batch # compute the gradients with respect to the model parameters gparams = T.grad(self.finetune_cost, self.params) # compute list of fine-tuning updates updates = [(param, param - gparam * learning_rate) for param, gparam in zip(self.params, gparams)] train_fn = theano.function( inputs=[index], outputs=self.finetune_cost, updates=updates, givens={ self.x: train_set_x[index * batch_size:(index + 1) * batch_size], self.y: train_set_y[index * batch_size:(index + 1) * batch_size] }, name='train') test_score_i = theano.function( [index], self.errors, givens={ self.x: test_set_x[index * batch_size:(index + 1) * batch_size], self.y: test_set_y[index * batch_size:(index + 1) * batch_size] }, name='test') valid_score_i = theano.function( [index], self.errors, givens={ self.x: valid_set_x[index * batch_size:(index + 1) * batch_size], self.y: valid_set_y[index * batch_size:(index + 1) * batch_size] }, name='valid') # Create a function that scans the entire validation set def valid_score(): return [valid_score_i(i) for i in xrange(n_valid_batches)] # Create a function that scans the entire test set def test_score(): return [test_score_i(i) for i in xrange(n_test_batches)] return train_fn, valid_score, test_score
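# Usage sketch (illustrative, not from the original file): a typical
# pretraining-plus-finetuning driver for the SdA above. `datasets` is assumed
# to be the (train, valid, test) shared-variable triple used elsewhere in this
# collection; the hyperparameters are placeholders.
import numpy

def run_sda_sketch(datasets, batch_size=1, pretraining_epochs=15,
                   pretrain_lr=0.001, finetune_lr=0.1):
    train_set_x, train_set_y = datasets[0]
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    numpy_rng = numpy.random.RandomState(89677)
    corruption_levels = [.1, .2, .3]
    sda = SdA(numpy_rng=numpy_rng, n_ins=28 * 28,
              hidden_layers_sizes=[1000, 1000, 1000], n_outs=10,
              corruption_levels=corruption_levels)

    # per-layer denoising autoencoder pretraining
    pretraining_fns = sda.pretraining_functions(train_set_x=train_set_x,
                                                batch_size=batch_size)
    for i in xrange(sda.n_layers):
        for epoch in xrange(pretraining_epochs):
            c = [pretraining_fns[i](index=b, corruption=corruption_levels[i],
                                    lr=pretrain_lr)
                 for b in xrange(n_train_batches)]
            print 'Pre-training layer %i, epoch %d, cost %f' % (i, epoch, numpy.mean(c))

    # supervised finetuning functions over the full datasets
    train_fn, valid_score, test_score = sda.build_finetune_functions(
        datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr)
    return train_fn, valid_score, test_score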
def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
             hidden_layers_sizes=[500, 500], n_outs=10,
             corruption_levels=[0.1, 0.1]):
    """ This class is made to support a variable number of layers.

    numpy_rng: numpy.random.RandomState
        numpy random number generator used to draw the initial weights
    theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        Theano random generator; if None (the default), one is generated
        from a seed drawn from `numpy_rng`
    n_ins: int
        dimension of the input to the SdA
    hidden_layers_sizes: list of ints
        sizes of the intermediate layers, must contain at least one value
    n_outs: int
        dimension of the output of the network
    corruption_levels: list of float
        amount of corruption to use for each layer
    """
    self.sigmoid_layers = []
    self.dA_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)

    assert self.n_layers > 0  # require at least one hidden layer

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # allocate symbolic variables for the data
    self.x = T.matrix('x')    # the data, presented as rasterized images
    self.y = T.ivector('y')   # the labels, a 1D vector of [int] labels

    # The SdA is an MLP whose intermediate layers share their weights with
    # denoising autoencoders. We first build the SdA as a deep multilayer
    # perceptron; while constructing each sigmoid layer we also construct a
    # denoising autoencoder that shares weights with that layer.
    # Pretraining trains these autoencoders (which also changes the weights
    # of the MLP); finetuning then finishes training the SdA by stochastic
    # gradient descent on the MLP.

    # construct the sigmoid layers
    for i in xrange(self.n_layers):
        # the size of the input is the number of hidden units of the layer
        # below, or the input size if this is the first layer
        if i == 0:
            input_size = n_ins
        else:
            input_size = hidden_layers_sizes[i - 1]

        # the input to this layer is the activation of the hidden layer
        # below, or the input of the SdA if this is the first layer
        if i == 0:
            layer_input = self.x
        else:
            layer_input = self.sigmoid_layers[-1].output

        # define the sigmoid layer
        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    activation=T.nnet.sigmoid)
        # add the sigmoid layer to the list of layers
        self.sigmoid_layers.append(sigmoid_layer)

        # it's arguably a philosophical question, but we only declare the
        # parameters of the sigmoid_layers to be parameters of the SdA;
        # the visible biases of the dA belong to the dA, not to the SdA
        self.params.extend(sigmoid_layer.params)

        # construct a denoising autoencoder that shares weights with this layer
        dA_layer = dA(numpy_rng=numpy_rng,
                      theano_rng=theano_rng,
                      input=layer_input,
                      n_visible=input_size,
                      n_hidden=hidden_layers_sizes[i],
                      W=sigmoid_layer.W,
                      bhid=sigmoid_layer.b)
        self.dA_layers.append(dA_layer)

    # add a logistic regression layer on top of the MLP
    self.logLayer = LogisticRegression(
        input=self.sigmoid_layers[-1].output,
        n_in=hidden_layers_sizes[-1], n_out=n_outs)
    self.params.extend(self.logLayer.params)

    # cost for the second (finetuning) phase of training:
    # the negative log likelihood of the logistic regression layer
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

    # the gradients with respect to the model parameters are taken from this
    # cost; symbolic variable for the number of errors made on a minibatch
    # given self.x and self.y
    self.errors = self.logLayer.errors(self.y)
def evaluate_lenet5(learning_rate=0.02, n_epochs=100, emb_size=300, batch_size=50, filter_size=[3], sent_len=40, claim_len=40, cand_size=10, hidden_size=[300, 300], max_pred_pick=5): model_options = locals().copy() print "model options", model_options pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'} seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) "load raw data" train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, train_labels, word2id = load_fever_train( sent_len, claim_len, cand_size) train_3th_sents, train_3th_sent_masks, train_3th_sent_labels, train_3th_claims, train_3th_claim_mask, train_3th_labels, word2id = load_fever_train_NoEnoughInfo( sent_len, claim_len, cand_size, word2id) test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask, test_sent_names, test_ground_names, test_labels, word2id = load_fever_dev( sent_len, claim_len, cand_size, word2id) test_3th_sents, test_3th_sent_masks, test_3th_sent_labels, test_3th_claims, test_3th_claim_mask, test_3th_labels, word2id = load_fever_dev_NoEnoughInfo( sent_len, claim_len, cand_size, word2id) dev_sents, dev_sent_masks, dev_sent_labels, dev_claims, dev_claim_mask, dev_sent_names, dev_ground_names, dev_labels, word2id = load_fever_test( sent_len, claim_len, cand_size, word2id) dev_3th_sents, dev_3th_sent_masks, dev_3th_sent_labels, dev_3th_claims, dev_3th_claim_mask, dev_3th_labels, word2id = load_fever_test_NoEnoughInfo( sent_len, claim_len, cand_size, word2id) train_sents = np.asarray(train_sents, dtype='int32') train_3th_sents = np.asarray(train_3th_sents, dtype='int32') joint_train_sents = np.concatenate((train_sents, train_3th_sents)) test_sents = np.asarray(test_sents, dtype='int32') test_3th_sents = np.asarray(test_3th_sents, dtype='int32') joint_test_sents = np.concatenate((test_sents, test_3th_sents)) dev_sents = np.asarray(dev_sents, dtype='int32') dev_3th_sents = np.asarray(dev_3th_sents, dtype='int32') joint_dev_sents = np.concatenate((dev_sents, dev_3th_sents)) train_sent_masks = np.asarray(train_sent_masks, dtype=theano.config.floatX) train_3th_sent_masks = np.asarray(train_3th_sent_masks, dtype=theano.config.floatX) joint_train_sent_masks = np.concatenate( (train_sent_masks, train_3th_sent_masks)) test_sent_masks = np.asarray(test_sent_masks, dtype=theano.config.floatX) test_3th_sent_masks = np.asarray(test_3th_sent_masks, dtype=theano.config.floatX) joint_test_sent_masks = np.concatenate( (test_sent_masks, test_3th_sent_masks)) dev_sent_masks = np.asarray(dev_sent_masks, dtype=theano.config.floatX) dev_3th_sent_masks = np.asarray(dev_3th_sent_masks, dtype=theano.config.floatX) joint_dev_sent_masks = np.concatenate((dev_sent_masks, dev_3th_sent_masks)) train_sent_labels = np.asarray(train_sent_labels, dtype='int32') train_3th_sent_labels = np.asarray(train_3th_sent_labels, dtype='int32') joint_train_sent_labels = np.concatenate( (train_sent_labels, train_3th_sent_labels)) test_sent_labels = np.asarray(test_sent_labels, dtype='int32') test_3th_sent_labels = np.asarray(test_3th_sent_labels, dtype='int32') joint_test_sent_labels = np.concatenate( (test_sent_labels, test_3th_sent_labels)) dev_sent_labels = np.asarray(dev_sent_labels, dtype='int32') dev_3th_sent_labels = np.asarray(dev_3th_sent_labels, dtype='int32') joint_dev_sent_labels = np.concatenate( (dev_sent_labels, dev_3th_sent_labels)) train_claims = 
np.asarray(train_claims, dtype='int32') train_3th_claims = np.asarray(train_3th_claims, dtype='int32') joint_train_claims = np.concatenate((train_claims, train_3th_claims)) test_claims = np.asarray(test_claims, dtype='int32') test_3th_claims = np.asarray(test_3th_claims, dtype='int32') joint_test_claims = np.concatenate((test_claims, test_3th_claims)) dev_claims = np.asarray(dev_claims, dtype='int32') dev_3th_claims = np.asarray(dev_3th_claims, dtype='int32') joint_dev_claims = np.concatenate((dev_claims, dev_3th_claims)) train_claim_mask = np.asarray(train_claim_mask, dtype=theano.config.floatX) train_3th_claim_mask = np.asarray(train_3th_claim_mask, dtype=theano.config.floatX) joint_train_claim_mask = np.concatenate( (train_claim_mask, train_3th_claim_mask)) test_claim_mask = np.asarray(test_claim_mask, dtype=theano.config.floatX) test_3th_claim_mask = np.asarray(test_3th_claim_mask, dtype=theano.config.floatX) joint_test_claim_mask = np.concatenate( (test_claim_mask, test_3th_claim_mask)) dev_claim_mask = np.asarray(dev_claim_mask, dtype=theano.config.floatX) dev_3th_claim_mask = np.asarray(dev_3th_claim_mask, dtype=theano.config.floatX) joint_dev_claim_mask = np.concatenate((dev_claim_mask, dev_3th_claim_mask)) train_labels = np.asarray(train_labels, dtype='int32') train_3th_labels = np.asarray(train_3th_labels, dtype='int32') joint_train_labels = np.concatenate((train_labels, train_3th_labels)) test_labels = np.asarray(test_labels, dtype='int32') test_3th_labels = np.asarray(test_3th_labels, dtype='int32') joint_test_labels = np.concatenate((test_labels, test_3th_labels)) dev_labels = np.asarray(dev_labels, dtype='int32') dev_3th_labels = np.asarray(dev_3th_labels, dtype='int32') joint_dev_labels = np.concatenate((dev_labels, dev_3th_labels)) joint_train_size = len(joint_train_claims) joint_test_size = len(joint_test_claims) joint_dev_size = len(joint_dev_claims) train_size = len(train_claims) test_size = len(test_claims) dev_size = len(dev_claims) test_3th_size = len(test_3th_claims) dev_3th_size = len(dev_3th_claims) vocab_size = len(word2id) + 1 print 'joint_train size: ', joint_train_size, ' joint_dev size: ', joint_test_size, ' joint_test size: ', joint_dev_size print 'train size: ', train_size, ' dev size: ', test_size, ' test size: ', dev_size print 'vocab size: ', vocab_size rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable "now, start to build the input form of the model" sents_ids = T.itensor3() #(batch, cand_size, sent_len) sents_mask = T.ftensor3() sents_labels = T.imatrix() #(batch, cand_size) claim_ids = T.imatrix() #(batch, claim_len) claim_mask = T.fmatrix() joint_sents_ids = T.itensor3() #(batch, cand_size, sent_len) joint_sents_mask = T.ftensor3() joint_sents_labels = T.imatrix() #(batch, cand_size) joint_claim_ids = T.imatrix() #(batch, claim_len) joint_claim_mask = T.fmatrix() joint_labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' embed_input_sents = init_embeddings[sents_ids.flatten( )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle( 0, 2, 1 ) #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_claim = init_embeddings[claim_ids.flatten()].reshape( (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) task1_att_conv_W, task1_att_conv_b = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) task1_conv_W_context, task1_conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) att_conv_W, att_conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) NN_para = [ conv_W, conv_b, task1_att_conv_W, task1_att_conv_b, att_conv_W, att_conv_b, task1_conv_W_context, conv_W_context ] conv_model_sents = Conv_with_Mask( rng, input_tensor3=embed_input_sents, mask_matrix=sents_mask.reshape( (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings = conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size batch_sent_emb = sent_embeddings.reshape( (batch_size, cand_size, hidden_size[0])) conv_model_claims = Conv_with_Mask( rng, input_tensor3=embed_input_claim, mask_matrix=claim_mask, image_shape=(batch_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero claim_embeddings = conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0, 'x', 1), cand_size, axis=1) ''' attentive conv for task1 ''' task1_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version( rng, input_tensor3= embed_input_sents, #batch_size*cand_size, emb_size, sent_len input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0), mask_matrix=sents_mask.reshape( (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])), mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=task1_att_conv_W, b=task1_att_conv_b, W_context=task1_conv_W_context, b_context=task1_conv_b_context) task1_attentive_sent_embeddings_l = task1_attentive_conv_layer.attentive_maxpool_vec_l #(batch_size*cand_size, hidden_size) task1_attentive_sent_embeddings_r = task1_attentive_conv_layer.attentive_maxpool_vec_r concate_claim_sent = T.concatenate([ batch_claim_emb, batch_sent_emb, T.sum(batch_claim_emb * batch_sent_emb, axis=2).dimshuffle(0, 1, 'x') ], axis=2) concate_2_matrix = concate_claim_sent.reshape( (batch_size * cand_size, hidden_size[0] * 2 + 1)) LR_input = T.concatenate([ concate_2_matrix, task1_attentive_sent_embeddings_l, 
task1_attentive_sent_embeddings_r ], axis=1) LR_input_size = hidden_size[0] * 2 + 1 + hidden_size[0] * 2 # LR_input = concate_2_matrix # LR_input_size = hidden_size[0]*2+1 #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_a = create_ensemble_para( rng, 1, LR_input_size) # the weight matrix hidden_size*2 # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_para = [U_a] # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector score_matrix = T.nnet.sigmoid(LR_input.dot(U_a)) #batch * 12 inter_matrix = score_matrix.reshape((batch_size, cand_size)) # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1) # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size))) ''' maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix) ''' # prob_pos = T.where( sents_labels < 1, 1.0-inter_matrix, inter_matrix) # loss = -T.mean(T.log(prob_pos)) #f1 as loss batch_overlap = T.sum(sents_labels * inter_matrix, axis=1) batch_recall = batch_overlap / T.sum(sents_labels, axis=1) batch_precision = batch_overlap / T.sum(inter_matrix, axis=1) batch_f1 = 2.0 * batch_recall * batch_precision / (batch_recall + batch_precision) loss = -T.mean(T.log(batch_f1)) # loss = T.nnet.nnet.binary_crossentropy(inter_matrix, sents_labels).mean() ''' training task2, predict 3 labels ''' joint_embed_input_sents = init_embeddings[joint_sents_ids.flatten( )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle( 0, 2, 1 ) #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM joint_embed_input_claim = init_embeddings[ joint_claim_ids.flatten()].reshape( (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1) joint_conv_model_sents = Conv_with_Mask( rng, input_tensor3=joint_embed_input_sents, mask_matrix=joint_sents_mask.reshape( (joint_sents_mask.shape[0] * joint_sents_mask.shape[1], joint_sents_mask.shape[2])), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero joint_sent_embeddings = joint_conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size joint_batch_sent_emb = joint_sent_embeddings.reshape( (batch_size, cand_size, hidden_size[0])) joint_premise_emb = T.sum(joint_batch_sent_emb * joint_sents_labels.dimshuffle(0, 1, 'x'), axis=1) #(batch, hidden_size) joint_conv_model_claims = Conv_with_Mask( rng, input_tensor3=joint_embed_input_claim, mask_matrix=joint_claim_mask, image_shape=(batch_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero joint_claim_embeddings = joint_conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size joint_premise_hypo_emb = T.concatenate( [joint_premise_emb, joint_claim_embeddings], axis=1) #(batch, 2*hidden_size) ''' attentive conv in task2 ''' joint_sents_tensor3 = joint_embed_input_sents.dimshuffle(0, 2, 
1).reshape( (batch_size, cand_size * sent_len, emb_size)) joint_sents_dot = T.batched_dot( joint_sents_tensor3, joint_sents_tensor3.dimshuffle( 0, 2, 1)) #(batch_size, cand_size*sent_len, cand_size*sent_len) joint_sents_dot_2_matrix = T.nnet.softmax( joint_sents_dot.reshape( (batch_size * cand_size * sent_len, cand_size * sent_len))) joint_sents_context = T.batched_dot( joint_sents_dot_2_matrix.reshape( (batch_size, cand_size * sent_len, cand_size * sent_len)), joint_sents_tensor3) #(batch_size, cand_size*sent_len, emb_size) joint_add_sents_context = joint_embed_input_sents + joint_sents_context.reshape( (batch_size * cand_size, sent_len, emb_size) ).dimshuffle( 0, 2, 1 ) #T.concatenate([joint_embed_input_sents, joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len) attentive_conv_layer = Attentive_Conv_for_Pair_easy_version( rng, input_tensor3= joint_add_sents_context, #batch_size*cand_size, 2*emb_size, sent_len input_tensor3_r=T.repeat(joint_embed_input_claim, cand_size, axis=0), mask_matrix=joint_sents_mask.reshape( (joint_sents_mask.shape[0] * joint_sents_mask.shape[1], joint_sents_mask.shape[2])), mask_matrix_r=T.repeat(joint_claim_mask, cand_size, axis=0), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=att_conv_W, b=att_conv_b, W_context=conv_W_context, b_context=conv_b_context) attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l.reshape( (batch_size, cand_size, hidden_size[0])) #(batch_size*cand_size, hidden_size) attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r.reshape( (batch_size, cand_size, hidden_size[0])) masked_sents_attconv = attentive_sent_embeddings_l * joint_sents_labels.dimshuffle( 0, 1, 'x') masked_claim_attconv = attentive_sent_embeddings_r * joint_sents_labels.dimshuffle( 0, 1, 'x') fine_max = T.concatenate([ T.max(masked_sents_attconv, axis=1), T.max(masked_claim_attconv, axis=1) ], axis=1) #(batch, 2*hidden) # fine_sum = T.concatenate([T.sum(masked_sents_attconv, axis=1),T.sum(masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden) "Logistic Regression layer" joint_LR_input = T.concatenate([joint_premise_hypo_emb, fine_max], axis=1) joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0] joint_U_a = create_ensemble_para(rng, 3, joint_LR_input_size) # (input_size, 3) joint_LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class joint_LR_para = [joint_U_a, joint_LR_b] joint_layer_LR = LogisticRegression( rng, input=joint_LR_input, n_in=joint_LR_input_size, n_out=3, W=joint_U_a, b=joint_LR_b ) #basically it is a multiplication between weight matrix and input feature vector joint_loss = joint_layer_LR.negative_log_likelihood( joint_labels ) #for classification task, we usually used negative log likelihood as loss, the lower the better. 
''' testing ''' # binarize_prob = T.where( inter_matrix > 0.5, 1, 0) #(batch_size, cand_size masked_inter_matrix = inter_matrix * sents_labels #(batch, cand_size) test_premise_emb = T.sum(batch_sent_emb * masked_inter_matrix.dimshuffle(0, 1, 'x'), axis=1) test_premise_hypo_emb = T.concatenate([test_premise_emb, claim_embeddings], axis=1) #fine-maxsum sents_tensor3 = embed_input_sents.dimshuffle(0, 2, 1).reshape( (batch_size, cand_size * sent_len, emb_size)) sents_dot = T.batched_dot(sents_tensor3, sents_tensor3.dimshuffle( 0, 2, 1)) #(batch_size, cand_size*sent_len, cand_size*sent_len) sents_dot_2_matrix = T.nnet.softmax( sents_dot.reshape( (batch_size * cand_size * sent_len, cand_size * sent_len))) sents_context = T.batched_dot( sents_dot_2_matrix.reshape( (batch_size, cand_size * sent_len, cand_size * sent_len)), sents_tensor3) #(batch_size, cand_size*sent_len, emb_size) add_sents_context = embed_input_sents + sents_context.reshape( (batch_size * cand_size, sent_len, emb_size) ).dimshuffle( 0, 2, 1 ) #T.concatenate([embed_input_sents, sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len) test_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version( rng, input_tensor3= add_sents_context, #batch_size*cand_size, 2*emb_size, sent_len input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0), mask_matrix=sents_mask.reshape( (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])), mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=att_conv_W, b=att_conv_b, W_context=conv_W_context, b_context=conv_b_context) # attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l #(batch_size*cand_size, hidden_size) # attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r test_attentive_sent_embeddings_l = test_attentive_conv_layer.attentive_maxpool_vec_l.reshape( (batch_size, cand_size, hidden_size[0])) #(batch_size*cand_size, hidden_size) test_attentive_sent_embeddings_r = test_attentive_conv_layer.attentive_maxpool_vec_r.reshape( (batch_size, cand_size, hidden_size[0])) test_masked_sents_attconv = test_attentive_sent_embeddings_l * masked_inter_matrix.dimshuffle( 0, 1, 'x') test_masked_claim_attconv = test_attentive_sent_embeddings_r * masked_inter_matrix.dimshuffle( 0, 1, 'x') test_fine_max = T.concatenate([ T.max(test_masked_sents_attconv, axis=1), T.max(test_masked_claim_attconv, axis=1) ], axis=1) #(batch, 2*hidden) # test_fine_sum = T.concatenate([T.sum(test_masked_sents_attconv, axis=1),T.sum(test_masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden) test_LR_input = T.concatenate([test_premise_hypo_emb, test_fine_max], axis=1) test_LR_input_size = joint_LR_input_size test_layer_LR = LogisticRegression( rng, input=test_LR_input, n_in=test_LR_input_size, n_out=3, W=joint_U_a, b=joint_LR_b ) #basically it is a multiplication between weight matrix and input feature vector params = [init_embeddings] + NN_para + LR_para + joint_LR_para cost = loss + joint_loss "Use AdaGrad to update parameters" updates = Gradient_Cost_Para(cost, params, learning_rate) train_model = theano.function([ sents_ids, sents_mask, sents_labels, claim_ids, claim_mask, joint_sents_ids, joint_sents_mask, joint_sents_labels, joint_claim_ids, 
joint_claim_mask, joint_labels ], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function([ sents_ids, sents_mask, sents_labels, claim_ids, claim_mask, joint_labels ], [ inter_matrix, test_layer_LR.errors(joint_labels), test_layer_LR.y_pred ], allow_input_downcast=True, on_unused_input='ignore') dev_model = theano.function([ sents_ids, sents_mask, sents_labels, claim_ids, claim_mask, joint_labels ], [ inter_matrix, test_layer_LR.errors(joint_labels), test_layer_LR.y_pred ], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False joint_n_train_batches = joint_train_size / batch_size joint_train_batch_start = list( np.arange(joint_n_train_batches) * batch_size) + [joint_train_size - batch_size] n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] n_test_3th_batches = test_3th_size / batch_size test_3th_batch_start = list(np.arange(n_test_3th_batches) * batch_size) + [test_3th_size - batch_size] n_dev_batches = dev_size / batch_size dev_batch_start = list( np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size] n_dev_3th_batches = dev_3th_size / batch_size dev_3th_batch_start = list(np.arange(n_dev_3th_batches) * batch_size) + [dev_3th_size - batch_size] max_acc = 0.0 max_test_f1 = 0.0 max_test_acc = 0.0 cost_i = 0.0 joint_train_indices = range(joint_train_size) train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle( joint_train_indices ) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed random.Random(100).shuffle(train_indices) iter_accu = 0 for joint_batch_id in joint_train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * joint_n_train_batches + iter_accu + 1 iter_accu += 1 joint_train_id_batch = joint_train_indices[ joint_batch_id:joint_batch_id + batch_size] for i in range(3): batch_id = random.choice(train_batch_start) train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model( train_sents[train_id_batch], train_sent_masks[train_id_batch], train_sent_labels[train_id_batch], train_claims[train_id_batch], train_claim_mask[train_id_batch], #joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels joint_train_sents[joint_train_id_batch], joint_train_sent_masks[joint_train_id_batch], joint_train_sent_labels[joint_train_id_batch], joint_train_claims[joint_train_id_batch], joint_train_claim_mask[joint_train_id_batch], joint_train_labels[joint_train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0): if iter % 100 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() f1_sum = 0.0 error_sum = 0.0 full_evi = 0 predictions = [] for test_batch_id in test_batch_start: # for each test batch batch_prob, error_i, pred_i = test_model( 
test_sents[test_batch_id:test_batch_id + batch_size], test_sent_masks[test_batch_id:test_batch_id + batch_size], test_sent_labels[test_batch_id:test_batch_id + batch_size], test_claims[test_batch_id:test_batch_id + batch_size], test_claim_mask[test_batch_id:test_batch_id + batch_size], test_labels[test_batch_id:test_batch_id + batch_size]) error_sum += error_i batch_sent_labels = test_sent_labels[ test_batch_id:test_batch_id + batch_size] batch_sent_names = test_sent_names[ test_batch_id:test_batch_id + batch_size] batch_ground_names = test_ground_names[ test_batch_id:test_batch_id + batch_size] batch_ground_labels = test_labels[ test_batch_id:test_batch_id + batch_size] for i in range(batch_size): instance_i = {} instance_i['label'] = pred_id2label.get( batch_ground_labels[i]) instance_i['predicted_label'] = pred_id2label.get( pred_i[i]) pred_sent_names = [] gold_sent_names = batch_ground_names[i] zipped = [(batch_prob[i, k], batch_sent_labels[i][k], batch_sent_names[i][k]) for k in range(cand_size)] sorted_zip = sorted(zipped, key=lambda x: x[0], reverse=True) for j in range(cand_size): triple = sorted_zip[j] if triple[1] == 1.0: ''' we should consider a rank, instead of binary if triple[0] >0.5: can control the recall, influence the strict_acc ''' if triple[0] > 0.5: # pred_sent_names.append(batch_sent_names[i][j]) pred_sent_names.append(triple[2]) # if len(pred_sent_names) == max_pred_pick: # break instance_i['predicted_evidence'] = pred_sent_names # print 'pred_sent_names:',pred_sent_names # print 'gold_sent_names:',gold_sent_names new_gold_names = [] for gold_name in gold_sent_names: new_gold_names.append([None, None] + gold_name) instance_i['evidence'] = [new_gold_names] predictions.append(instance_i) strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions) print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1 # test_f1=f1_sum/(len(test_batch_start)*batch_size) for test_batch_id in test_3th_batch_start: # for each test batch _, error_i, pred_i = test_model( test_3th_sents[test_batch_id:test_batch_id + batch_size], test_3th_sent_masks[test_batch_id:test_batch_id + batch_size], test_3th_sent_labels[test_batch_id:test_batch_id + batch_size], test_3th_claims[test_batch_id:test_batch_id + batch_size], test_3th_claim_mask[test_batch_id:test_batch_id + batch_size], test_3th_labels[test_batch_id:test_batch_id + batch_size]) for i in range(batch_size): instance_i = {} instance_i['label'] = pred_id2label.get(2) instance_i['predicted_label'] = pred_id2label.get( pred_i[i]) instance_i['predicted_evidence'] = [] instance_i['evidence'] = [] predictions.append(instance_i) strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions) print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1 if f1 > max_test_f1 or strict_score > max_test_acc: if f1 > max_test_f1: max_test_f1 = f1 if strict_score > max_test_acc: max_test_acc = strict_score #test print '....................\n' f1_sum = 0.0 error_sum = 0.0 full_evi = 0 predictions = [] for dev_batch_id in dev_batch_start: # for each test batch batch_prob, error_i, pred_i = dev_model( dev_sents[dev_batch_id:dev_batch_id + batch_size], dev_sent_masks[dev_batch_id:dev_batch_id + batch_size], dev_sent_labels[dev_batch_id:dev_batch_id + batch_size], dev_claims[dev_batch_id:dev_batch_id + batch_size], dev_claim_mask[dev_batch_id:dev_batch_id + batch_size], dev_labels[dev_batch_id:dev_batch_id + 
batch_size]) error_sum += error_i batch_sent_labels = dev_sent_labels[ dev_batch_id:dev_batch_id + batch_size] batch_sent_names = dev_sent_names[ dev_batch_id:dev_batch_id + batch_size] batch_ground_names = dev_ground_names[ dev_batch_id:dev_batch_id + batch_size] batch_ground_labels = dev_labels[ dev_batch_id:dev_batch_id + batch_size] for i in range(batch_size): instance_i = {} instance_i['label'] = pred_id2label.get( batch_ground_labels[i]) instance_i['predicted_label'] = pred_id2label.get( pred_i[i]) pred_sent_names = [] gold_sent_names = batch_ground_names[i] zipped = [(batch_prob[i, k], batch_sent_labels[i][k], batch_sent_names[i][k]) for k in range(cand_size)] sorted_zip = sorted(zipped, key=lambda x: x[0], reverse=True) for j in range(cand_size): triple = sorted_zip[j] if triple[1] == 1.0: ''' we should consider a rank, instead of binary if triple[0] >0.5: can control the recall, influence the strict_acc ''' if triple[0] > 0.5: # pred_sent_names.append(batch_sent_names[i][j]) pred_sent_names.append(triple[2]) # if len(pred_sent_names) == max_pred_pick: # break instance_i['predicted_evidence'] = pred_sent_names # print 'pred_sent_names:',pred_sent_names # print 'gold_sent_names:',gold_sent_names new_gold_names = [] for gold_name in gold_sent_names: new_gold_names.append([None, None] + gold_name) instance_i['evidence'] = [new_gold_names] predictions.append(instance_i) strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions) print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1 # test_f1=f1_sum/(len(test_batch_start)*batch_size) for dev_batch_id in dev_3th_batch_start: # for each test batch _, error_i, pred_i = dev_model( dev_3th_sents[dev_batch_id:dev_batch_id + batch_size], dev_3th_sent_masks[dev_batch_id:dev_batch_id + batch_size], dev_3th_sent_labels[dev_batch_id:dev_batch_id + batch_size], dev_3th_claims[dev_batch_id:dev_batch_id + batch_size], dev_3th_claim_mask[dev_batch_id:dev_batch_id + batch_size], dev_3th_labels[dev_batch_id:dev_batch_id + batch_size]) for i in range(batch_size): instance_i = {} instance_i['label'] = pred_id2label.get(2) instance_i['predicted_label'] = pred_id2label.get( pred_i[i]) instance_i['predicted_evidence'] = [] instance_i['evidence'] = [] predictions.append(instance_i) strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions) print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1 print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_acc_test
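# --- Hedged illustration (added; not from the original source). ---
# The evidence-selection loss above optimizes a differentiable batch F1: the
# overlap between predicted sentence scores and gold evidence labels gives a
# soft recall and precision per example. The same arithmetic in plain numpy:
def soft_f1_loss_sketch():
    import numpy as np
    inter_matrix = np.array([[0.9, 0.2, 0.7],
                             [0.1, 0.8, 0.3]])   # predicted evidence scores (batch, cand_size)
    sents_labels = np.array([[1.0, 0.0, 1.0],
                             [0.0, 1.0, 0.0]])   # gold evidence indicators
    overlap = np.sum(sents_labels * inter_matrix, axis=1)
    recall = overlap / np.sum(sents_labels, axis=1)
    precision = overlap / np.sum(inter_matrix, axis=1)
    f1 = 2.0 * recall * precision / (recall + precision)
    return -np.mean(np.log(f1))                  # lower is better, as in the Theano graph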
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, nkerns=[6, 14], batch_size=70, useAllSamples=0, kmax=30, ktop=4, filter_size=[7,5], hidden_units=50, L2_weight=0.000005, dropout_p=0.2, useEmb=1, task=2): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ root="/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/" embeddingPath='/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt' embeddingPath2='/mounts/data/proj/wenpeng/MC/src/released_embedding.txt' rng = numpy.random.RandomState(23455) datasets, embedding_size, embeddings=read_data_HK(root+str(task)+'classes/train.txt', root+str(task)+'classes/dev.txt', root+str(task)+'classes/test.txt', embeddingPath,60, useEmb) #print embeddings.get_value(borrow=True)[:4] #datasets, embedding_size, embeddings=read_data(root+'2classes/train.txt', root+'2classes/dev.txt', root+'2classes/test.txt', embeddingPath,60) #datasets = load_data(dataset) indices_train, trainY, trainLengths = datasets[0] indices_dev, devY, devLengths = datasets[1] indices_test, testY, testLengths = datasets[2] n_train_batches=indices_train.shape[0]/batch_size n_valid_batches=indices_dev.shape[0]/batch_size n_test_batches=indices_test.shape[0]/batch_size train_batch_start=[] dev_batch_start=[] test_batch_start=[] if useAllSamples: train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[indices_train.shape[0]-batch_size] dev_batch_start=list(numpy.arange(n_valid_batches)*batch_size)+[indices_dev.shape[0]-batch_size] test_batch_start=list(numpy.arange(n_test_batches)*batch_size)+[indices_test.shape[0]-batch_size] n_train_batches=n_train_batches+1 n_valid_batches=n_valid_batches+1 n_test_batches=n_test_batches+1 else: train_batch_start=list(numpy.arange(n_train_batches)*batch_size) dev_batch_start=list(numpy.arange(n_valid_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) indices_train_theano=theano.shared(numpy.asarray(indices_train, dtype=theano.config.floatX), borrow=True) indices_dev_theano=theano.shared(numpy.asarray(indices_dev, dtype=theano.config.floatX), borrow=True) indices_test_theano=theano.shared(numpy.asarray(indices_test, dtype=theano.config.floatX), borrow=True) indices_train_theano=T.cast(indices_train_theano, 'int32') indices_dev_theano=T.cast(indices_dev_theano, 'int32') indices_test_theano=T.cast(indices_test_theano, 'int32') ''' indices_train_theano=theano.shared(indices_train, borrow=True) indices_dev_theano=theano.shared(indices_dev, borrow=True) indices_test_theano=theano.shared(indices_test, borrow=True) ''' # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x_index = T.imatrix('x_index') # now, x is the index matrix, must be integer y = T.ivector('y') z = T.ivector('z') # sentence lengths x=embeddings[x_index.flatten()].reshape((batch_size,60, embedding_size)).transpose(0, 2, 1).flatten()#flatten(1) means keep col, then scan rows #x_print=theano.printing.Print('x')(x[:,1500:]) ishape = (embedding_size, 60) # this is the size of MNIST images filter_size1=(1,filter_size[0]) filter_size2=(1,filter_size[1]) poolsize1=(1, ishape[1]-filter_size1[1]+1) poolsize2=(1, 
kmax-filter_size2[1]+1) #(1,6) dynamic_lengths=T.maximum(ktop,z/2+1) # dynamic k-max pooling ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input = x.reshape((batch_size, 1, ishape[0], ishape[1])) #layer0_input_print=theano.printing.Print('input')(layer0_input[:,:,:,30:]) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) ''' layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size1[0], filter_size1[1]), poolsize=poolsize1) ''' layer0 = Conv_DynamicK_PoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size1[0], filter_size1[1]), poolsize=poolsize1, sentLengths=z, k=dynamic_lengths, unifiedWidth=kmax) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) ''' layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], ishape[0], kmax), filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=ktop) ''' ''' layer1 = ConvFoldPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], ishape[0], kmax), filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=ktop) ''' # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
# This will generate a matrix of shape (20,32*4*4) = (20,512) layer2_input = layer0.output.flatten(2) #wenpeng2=theano.printing.Print('layer2_input')(layer2_input) # construct a fully-connected sigmoidal layer, the output of layers has nkerns[1]=50 images, each is 4*4 size layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[0] * embedding_size*kmax, n_out=hidden_units, activation=T.tanh) dropout=dropout_from_layer(rng, layer2.output, dropout_p) #dropout layer3 = LogisticRegression(rng, input=dropout, n_in=hidden_units, n_out=task) #layer3 = LogisticRegression(rng, input=layer2.output, n_in=50, n_out=2) # the cost we minimize during training is the NLL of the model #L1_reg= abs(layer3.W).sum() + abs(layer2.W).sum() +abs(layer0.W).sum()+abs(embeddings).sum() L2_reg = (layer3.W** 2).sum() + (layer2.W** 2).sum() +(layer0.W** 2).sum()+(embeddings**2).sum() cost = layer3.negative_log_likelihood(y)+L2_weight*L2_reg #cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function([index], layer3.errors(y), givens={ x_index: indices_test_theano[index: index + batch_size], y: testY[index: index + batch_size], z: testLengths[index: index + batch_size]}) validate_model = theano.function([index], layer3.errors(y), givens={ x_index: indices_dev_theano[index: index + batch_size], y: devY[index: index + batch_size], z: devLengths[index: index + batch_size]}) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer0.params+[embeddings] accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i],grads[i]) pairs. ''' updates = [] for param_i, grad_i in zip(params, grads): updates.append((param_i, param_i - learning_rate * grad_i)) ''' updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): acc = acc_i + T.sqr(grad_i) if param_i == embeddings: updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(embedding_size))))) #AdaGrad else: updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([index], [cost,layer3.errors(y)], updates=updates, givens={ x_index: indices_train_theano[index: index + batch_size], y: trainY[index: index + batch_size], z: trainLengths[index: index + batch_size]}) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 50000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 minibatch_index=minibatch_index+1 cost_ij, error_ij = train_model(batch_start) if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+' cost: '+str(cost_ij)+' error: '+str(error_ij) if iter % validation_frequency == 0: # compute zero-one loss on validation set #validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] validation_losses = [validate_model(i) for i in dev_batch_start] this_validation_loss = numpy.mean(validation_losses) print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' % \ (epoch, minibatch_index , n_train_batches, \ this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [test_model(i) for i in test_batch_start] test_score = numpy.mean(test_losses) print(('\t\t\t\tepoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
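# --- Hedged sketch (added; not from the original source). ---
# The updates list above implements AdaGrad: each parameter keeps an accumulator
# of squared gradients, and its effective learning rate shrinks as the
# accumulator grows. A one-parameter numpy version of that rule, matching the
# expressions above (no epsilon term, exactly as in the Theano updates):
def adagrad_step_sketch(param, grad, acc, learning_rate=0.1):
    import numpy as np
    acc = acc + grad ** 2                               # accumulate squared gradients
    param = param - learning_rate * grad / np.sqrt(acc)
    return param, acc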
def __init__(self, rng, n_in=784, n_hidden=[500, 500], n_out=10, lambda_reg=0.001, alpha_reg=0.001): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_in: int :param n_in: dimension of the input to the DBN :type n_hidden: list of ints :param n_hidden: intermediate layers size, must contain at least one value :type n_out: int :param n_out: dimension of the output of the network :type lambda_reg: float :param lambda_reg: parameter to control the sparsity of weights by l_1 norm. The regularization term is lambda_reg( (1-alpha_reg)/2 * ||W||_2^2 + alpha_reg ||W||_1 ). Thus, the larger lambda_reg is, the sparser the weights are. :type alpha_reg: float :param alpha_reg: parameter from interval [0,1] to control the smoothness of weights by squared l_2 norm. The regularization term is lambda_reg( (1-alpha_reg)/2 * ||W||_2^2 + alpha_reg ||W||_1 ). Thus, the smaller alpha_reg is, the smoother the weights are. """ self.hidden_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(n_hidden) assert self.n_layers > 0 # allocate symbolic variables for the data self.x = T.matrix('x') # the data, each row is a sample self.y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden units of # the layer below or the input size if we are on the first layer if i == 0: input_size = n_in else: input_size = n_hidden[i - 1] # the input to this layer is either the activation of the hidden # layer below or the input of the SdA if you are on the first # layer if i == 0: layer_input = self.x else: layer_input = self.hidden_layers[-1].output sigmoid_layer = HiddenLayer(rng=rng, input=layer_input, n_in=input_size, n_out=n_hidden[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.hidden_layers.append(sigmoid_layer) # it's arguably a philosophical question... 
# but we are going to only declare that the parameters of the # sigmoid_layers are parameters of the StackedDAA # the visible biases in the dA are parameters of those # dA, but not the SdA self.params.extend(sigmoid_layer.params) # Construct an RBM that shared weights with this layer rbm_layer = RBM(numpy_rng=rng, theano_rng=None, input=layer_input, n_visible=input_size, n_hidden=n_hidden[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # We now need to add a logistic layer on top of the MLP if self.n_layers > 0: self.logRegressionLayer = LogisticRegression( input=self.hidden_layers[-1].output, n_in=n_hidden[-1], n_out=n_out) else: self.logRegressionLayer = LogisticRegression(input=self.x, n_in=input_size, n_out=n_out) self.params.extend(self.logRegressionLayer.params) # regularization L1s = [] L2_sqrs = [] for i in range(self.n_layers): L1s.append(abs(self.hidden_layers[i].W).sum()) L2_sqrs.append((self.hidden_layers[i].W**2).sum()) L1s.append(abs(self.logRegressionLayer.W).sum()) L2_sqrs.append((self.logRegressionLayer.W**2).sum()) self.L1 = T.sum(L1s) self.L2_sqr = T.sum(L2_sqrs) # compute the cost for second phase of training, # defined as the negative log likelihood self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood( self.y) self.cost=self.negative_log_likelihood + \ lambda_reg * ( (1.0-alpha_reg)*0.5* self.L2_sqr + alpha_reg*self.L1) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logRegressionLayer.errors(self.y) self.y_pred = self.logRegressionLayer.y_pred
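# --- Hedged sketch (added; not from the original source). ---
# The cost above adds the penalty described in the docstring,
# lambda_reg * ((1 - alpha_reg)/2 * ||W||_2^2 + alpha_reg * ||W||_1),
# summed over every weight matrix in the model. The same quantity in numpy:
def elastic_net_penalty_sketch(weight_matrices, lambda_reg=0.001, alpha_reg=0.001):
    import numpy as np
    L1 = sum(np.abs(W).sum() for W in weight_matrices)
    L2_sqr = sum((W ** 2).sum() for W in weight_matrices)
    return lambda_reg * ((1.0 - alpha_reg) * 0.5 * L2_sqr + alpha_reg * L1)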
imageSize2 = (imageSize1 - W1.shape[2] + 1)/poolSize # construct a fully-connected sigmoidal layer n_in = nkerns[1] * imageSize2 * imageSize2 print n_in, "n_in" upLayer2 = HiddenLayer(rng, input=upLayer2_input, n_in=nkerns[1] * imageSize2 * imageSize2, n_out=n_out_layer2) upLayer2.W.set_value(W2,borrow=True) upLayer2.b.set_value(b2,borrow=True) upLayer3 = LogisticRegression(input=upLayer2.out(), n_in=n_out_layer2, n_out=7) upLayer3.W.set_value(W3,borrow=True) upLayer3.b.set_value(b3,borrow=True) downLayer3 = LogisticRegression(input=upLayer3.out(), n_in=7, n_out=n_out_layer2) # ##TODO: transpose or inverse W3Down = numpy.transpose(W3) W3Down = W3Down[::-1,::-1] # # downLayer3.W.set_value(W3Down,borrow=True) downLayer3.b.set_value(b3,borrow=True) #
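# --- Hedged note (added; assumption, not the author's code). ---
# For a dense layer the usual tied "down" weight is simply the transpose of the
# "up" weight; the extra [::-1, ::-1] flip above only matters when the weights
# are convolution filters. A minimal numpy statement of the tied choice:
def tied_down_weight_sketch(W_up):
    import numpy as np
    return np.transpose(W_up)   # decoder reuses the encoder weight, transposed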
def __init__(self, rng, input, layer_sizes, temperature, activations, use_bias=True): #rectified_linear_activation = lambda x: T.maximum(0.0, x) # Set up all the hidden layers weight_matrix_sizes = zip(layer_sizes, layer_sizes[1:]) self.layers = [] self.dropout_layers = [] next_layer_input = input #first_layer = True # dropout the input #next_dropout_layer_input = _bobakout_from_layer(rng, input, temperature) next_dropout_layer_input = input layer_counter = 0 for n_in, n_out in weight_matrix_sizes[:-1]: #next_dropout_layer = DropoutHiddenLayer(rng=rng, next_dropout_layer = BobakoutHiddenLayer( rng=rng, input=next_dropout_layer_input, # activation=activations[layer_counter], activation=None, n_in=n_in, n_out=n_out, use_bias=use_bias, temperature=temperature) self.dropout_layers.append(next_dropout_layer) next_dropout_layer_input = next_dropout_layer.output # Reuse the paramters from the dropout layer here, in a different # path through the graph. next_layer = HiddenLayer( rng=rng, input=next_layer_input, activation=activations[layer_counter], # scale the weight matrix W with (1-p) W=next_dropout_layer.W, b=next_dropout_layer.b, n_in=n_in, n_out=n_out, use_bias=use_bias) self.layers.append(next_layer) next_layer_input = next_layer.output #first_layer = False layer_counter += 1 # Set up the output layer n_in, n_out = weight_matrix_sizes[-1] dropout_output_layer = LogisticRegression( input=next_dropout_layer_input, n_in=n_in, n_out=n_out) self.dropout_layers.append(dropout_output_layer) # Again, reuse paramters in the dropout output. output_layer = LogisticRegression( input=next_layer_input, # scale the weight matrix W with (1-p) W=dropout_output_layer.W, b=dropout_output_layer.b, n_in=n_in, n_out=n_out) self.layers.append(output_layer) # Use the negative log likelihood of the logistic regression layer as # the objective. self.dropout_negative_log_likelihood = self.dropout_layers[ -1].negative_log_likelihood self.dropout_errors = self.dropout_layers[-1].errors self.negative_log_likelihood = self.layers[-1].negative_log_likelihood self.errors = self.layers[-1].errors # Grab all the parameters together. self.params = [ param for layer in self.dropout_layers for param in layer.params ]
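# --- Hedged sketch (added; assumption, not the author's Bobakout layer). ---
# The comments above refer to the standard dropout convention: during training
# units are dropped with probability p, and at test time the shared weights are
# reused, conventionally scaled by (1 - p). A minimal numpy forward pass:
def dropout_forward_sketch(X, W, b, p=0.5, train=True, rng=None):
    import numpy as np
    rng = rng if rng is not None else np.random.RandomState(0)
    if train:
        mask = rng.binomial(n=1, p=1.0 - p, size=X.shape)  # keep each unit with prob 1-p
        return np.tanh((X * mask).dot(W) + b)
    return np.tanh(X.dot(W * (1.0 - p)) + b)               # rescale weights at test time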
class SdA(object): """Stacked denoising auto-encoder class (SdA) A stacked denoising autoencoder model is obtained by stacking several dAs. The hidden layer of the dA at layer `i` becomes the input of the dA at layer `i+1`. The first layer dA gets as input the input of the SdA, and the hidden layer of the last dA represents the output. Note that after pretraining, the SdA is dealt with as a normal MLP, the dAs are only used to initialize the weights. """ def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10, corruption_levels=[0.1, 0.1]): """ This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the sdA :type n_layers_sizes: list of ints :param n_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network :type corruption_levels: list of float :param corruption_levels: amount of corruption to use for each layer """ self.sigmoid_layers = [] self.dA_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels # The SdA is an MLP, for which all weights of intermediate layers # are shared with a different denoising autoencoders # We will first construct the SdA as a deep multilayer perceptron, # and when constructing each sigmoidal layer we also construct a # denoising autoencoder that shares weights with that layer # During pretraining we will train these autoencoders (which will # lead to chainging the weights of the MLP as well) # During finetunining we will finish training the SdA by doing # stochastich gradient descent on the MLP for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden units of # the layer below or the input size if we are on the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the hidden # layer below or the input of the SdA if you are on the first # layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... 
# but we are going to only declare that the parameters of the # sigmoid_layers are parameters of the StackedDAA # the visible biases in the dA are parameters of those # dA, but not the SdA self.params.extend(sigmoid_layer.params) # Construct a denoising autoencoder that shared weights with this # layer dA_layer = dA(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, bhid=sigmoid_layer.b) self.dA_layers.append(dA_layer) # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) # construct a function that implements one step of finetunining # compute the cost for second phase of training, # defined as the negative log likelihood self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y) self.recall = self.logLayer.recall(self.y) def pretraining_functions(self, train_set_x, batch_size): ''' Generates a list of functions, each of them implementing one step in training the dA corresponding to the layer with same index. The function will require as input the minibatch index, and to train a dA you just need to iterate, calling the corresponding function on all minibatch indexes. :type train_set_x: theano.tensor.TensorType :param train_set_x: Shared variable that contains all datapoints used for training the dA :type batch_size: int :param batch_size: size of a [mini]batch :type learning_rate: float :param learning_rate: learning rate used during training for any of the dA layers ''' # index to a [mini]batch index = T.lscalar('index') # index to a minibatch corruption_level = T.scalar('corruption') # % of corruption to use learning_rate = T.scalar('lr') # learning rate to use # number of batches n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size # begining of a batch, given `index` batch_begin = index * batch_size # ending of a batch given `index` batch_end = batch_begin + batch_size pretrain_fns = [] for dA in self.dA_layers: # get the cost and the updates list cost, updates = dA.get_cost_updates(corruption_level, learning_rate) # compile the theano function fn = theano.function(inputs=[index, theano.Param(corruption_level, default=0.2), theano.Param(learning_rate, default=0.1)], outputs=cost, updates=updates, givens={self.x: train_set_x[batch_begin: batch_end]}) # append `fn` to the list of functions pretrain_fns.append(fn) return pretrain_fns def build_finetune_functions(self, datasets, batch_size, learning_rate): '''Generates a function `train` that implements one step of finetuning, a function `validate` that computes the error on a batch from the validation set, and a function `test` that computes the error on a batch from the testing set :type datasets: list of pairs of theano.tensor.TensorType :param datasets: It is a list that contain all the datasets; the has to contain three pairs, `train`, `valid`, `test` in this order, where each pair is formed of two Theano variables, one for the datapoints, the other for the labels :type batch_size: int :param batch_size: size of a minibatch :type learning_rate: float :param learning_rate: learning rate used during finetune stage ''' (train_set_x, train_set_y) = datasets[0] 
(valid_set_x, valid_set_y) = datasets[1] (test_set_x, test_set_y) = datasets[2] # compute number of minibatches for training, validation and testing n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_valid_batches /= batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_test_batches /= batch_size index = T.lscalar('index') # index to a [mini]batch # compute the gradients with respect to the model parameters gparams = T.grad(self.finetune_cost, self.params) # compute list of fine-tuning updates updates = [] for param, gparam in zip(self.params, gparams): updates.append((param, param - gparam * learning_rate)) train_fn = theano.function(inputs=[index], outputs=self.finetune_cost, updates=updates, givens={ self.x: train_set_x[index * batch_size: (index + 1) * batch_size], self.y: train_set_y[index * batch_size: (index + 1) * batch_size]}, name='train') test_score_i = theano.function([index], self.errors, givens={ self.x: test_set_x[index * batch_size: (index + 1) * batch_size], self.y: test_set_y[index * batch_size: (index + 1) * batch_size]}, name='test') valid_score_i = theano.function([index], self.errors, givens={ self.x: valid_set_x[index * batch_size: (index + 1) * batch_size], self.y: valid_set_y[index * batch_size: (index + 1) * batch_size]}, name='valid') test_recall_i = theano.function([index], self.recall, givens={ self.x: test_set_x[index * batch_size: (index + 1) * batch_size], self.y: test_set_y[index * batch_size: (index + 1) * batch_size]}, name='test_recall') # Create a function that scans the entire validation set def valid_score(): return [valid_score_i(i) for i in xrange(n_valid_batches)] # Create a function that scans the entire test set def test_score(): return [test_score_i(i) for i in xrange(n_test_batches)] # Create a function that scans the entire test set for recall def test_recall(): return [test_recall_i(i) for i in xrange(n_test_batches)] return train_fn, valid_score, test_score, test_recall
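# --- Hedged usage sketch (added; not from the original source). ---
# How the two helpers above are typically driven: greedy layer-wise pretraining
# with the dA cost functions, then supervised fine-tuning. Epoch counts and
# learning rates below are illustrative assumptions.
def pretrain_then_finetune_sketch(sda, datasets, batch_size=10,
                                  pretraining_epochs=15, pretrain_lr=0.001,
                                  finetune_lr=0.1):
    import numpy
    train_set_x = datasets[0][0]
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    pretraining_fns = sda.pretraining_functions(train_set_x=train_set_x,
                                                batch_size=batch_size)
    for i in xrange(sda.n_layers):                 # greedy layer-wise pretraining
        for epoch in xrange(pretraining_epochs):
            costs = [pretraining_fns[i](index=b, corruption=0.2, lr=pretrain_lr)
                     for b in xrange(n_train_batches)]
            print 'Pre-training layer %i, epoch %d, cost %f' % (i, epoch, numpy.mean(costs))
    train_fn, valid_score, test_score, test_recall = sda.build_finetune_functions(
        datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr)
    return train_fn, valid_score, test_score, test_recall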
def perturb_random(params, shape, oldoutput, nkerns=[20, 50], batch_size=500): print '... building the model' rng = numpy.random.RandomState(23455) x = T.tensor4() # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input = x.reshape((batch_size, 1, 28, 28)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (20,32*4*4) = (20,512) layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) f = theano.function(inputs=[x], outputs=[layer2.output, layer3.y_pred]) layer3.W.set_value(params[-1][0]) layer3.b.set_value(params[-1][1]) layer2.W.set_value(params[-1][2]) layer2.b.set_value(params[-1][3]) layer1.W.set_value(params[-1][4]) layer1.b.set_value(params[-1][5]) layer0.W.set_value(params[-1][6]) layer0.b.set_value(params[-1][7]) # perturb 500 shapes at each iteration, with ptimes iterations perturbed = numpy.tile(shape[0], (500, 1)) oldoutputs = numpy.tile(oldoutput, (500, 1)) label = shape[1] ptimes = 500 imagelength = numpy.sqrt(numpy.sum(shape[0]**2)) outputlength = numpy.sqrt(numpy.sum(oldoutput**2)) p = [] s = [] for i in range(ptimes): print 'perturbing ' + str(i) + ' ......' perturbation = numpy.random.normal(0, 0.15, perturbed.shape) perturblength = numpy.sqrt(numpy.sum(perturbation**2, axis=1)) shapes = perturbed + perturbation outputs, labels = f(shapes.reshape(500, 1, 28, 28)) distances = numpy.sum((outputs - oldoutputs)**2, axis=1) pos = numpy.argmax(distances) print 'distance ' + str(numpy.sqrt(distances[pos])) pert = {} pert['perturbation'] = perturbation[pos] pert['plength'] = perturblength[pos] pert['ilength'] = imagelength pert['olength'] = outputlength pert['distance'] = numpy.sqrt(distances[pos]) pert['output'] = outputs[pos] pert['label'] = labels[pos] p.append(pert) if len(numpy.nonzero(labels != label)[0]) != 0: print 'success!' + str(label) + ' ' pos = numpy.nonzero(labels != label)[0][0] print labels[pos] pert = {} pert['perturbation'] = perturbation[pos] pert['plength'] = perturblength[pos] pert['ilength'] = imagelength pert['olength'] = outputlength pert['distance'] = numpy.sqrt(distances[pos]) pert['output'] = outputs[pos] pert['label'] = labels[pos] s.append(pert) return p, s
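# --- Hedged usage sketch (added; assumption about calling conventions). ---
# perturb_random expects the saved layer parameters, a (flattened image, label)
# pair, and that image's reference layer2 output; it returns the
# largest-displacement perturbation per iteration and any label-flipping ones.
def perturb_random_usage_sketch(saved_params, image_vector, label, reference_output):
    p, s = perturb_random(saved_params, (image_vector, label), reference_output)
    print len(p), 'perturbation records,', len(s), 'label flips found'
    return p, s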
def evaluate_lenet5(learning_rate=0.02, n_epochs=4, L2_weight=0.0000001, extra_size=4, emb_size=300, batch_size=50, filter_size=[3, 3], maxSentLen=40, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_wordnet_hyper_dataset( maxlen=maxSentLen ) #minlen, include one label, at least one word in the sentence # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id =load_ACE05_dataset(maxSentLen, word2id) test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id = load_EVAlution_hyper_vs_all( maxSentLen, word2id) total_size = len(all_sentences_l) hold_test_size = 10000 train_size = total_size - hold_test_size train_sents_l = np.asarray(all_sentences_l[:train_size], dtype='int32') # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32') # test_sents_l=np.asarray(all_sentences_l[-test_size:], dtype='int32') test_sents_l = np.asarray(test_sents_l, dtype='int32') train_masks_l = np.asarray(all_masks_l[:train_size], dtype=theano.config.floatX) # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX) # test_masks_l=np.asarray(all_masks_l[-test_size:], dtype=theano.config.floatX) test_masks_l = np.asarray(test_masks_l, dtype=theano.config.floatX) train_sents_r = np.asarray(all_sentences_r[:train_size], dtype='int32') # dev_sents_r=np.asarray(all_sentences_r[1] , dtype='int32') # test_sents_r=np.asarray(all_sentences_r[-test_size:], dtype='int32') test_sents_r = np.asarray(test_sents_r, dtype='int32') train_masks_r = np.asarray(all_masks_r[:train_size], dtype=theano.config.floatX) # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX) # test_masks_r=np.asarray(all_masks_r[-test_size:], dtype=theano.config.floatX) test_masks_r = np.asarray(test_masks_r, dtype=theano.config.floatX) train_labels_store = np.asarray(all_labels[:train_size], dtype='int32') # dev_labels_store=np.asarray(all_labels[1], dtype='int32') # test_labels_store=np.asarray(all_labels[-test_size:], dtype='int32') test_labels_store = np.asarray(test_labels, dtype='int32') # train_size=len(train_labels_store) # dev_size=len(dev_labels_store) test_size = len(test_labels_store) print 'train size: ', train_size, ' test size: ', test_size vocab_size = len(word2id) + 1 rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution #here, we leave code for loading word2vec to initialize words rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_ids_l = T.imatrix() sents_mask_l = T.fmatrix() sents_ids_r = T.imatrix() sents_mask_r = T.fmatrix() labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' def embed_input(emb_matrix, sent_ids): return emb_matrix[sent_ids.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1) embed_input_l = embed_input( init_embeddings, sents_ids_l ) #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_r = embed_input( init_embeddings, sents_ids_r ) #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) '''create_AttentiveConv_params ''' conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1)) NN_para = [conv_W, conv_b, conv_W_context] ''' attentive convolution function ''' attentive_conv_layer = Conv_for_Pair( rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r=embed_input_r, input_tensor3=embed_input_l, input_tensor3_r=embed_input_r, mask_matrix=sents_mask_l, mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, hidden_size[0], maxSentLen), image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen), filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]), filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r "form input to LR classifier" LR_input = T.concatenate( [attentive_sent_embeddings_l, attentive_sent_embeddings_r], axis=1) LR_input_size = 2 * hidden_size[1] U_a = create_ensemble_para( rng, 2, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((2, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_para = [U_a, LR_b] layer_LR = LogisticRegression( rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b ) #basically it is a multiplication between weight matrix and input feature vector loss = layer_LR.negative_log_likelihood( labels ) #for classification task, we usually used negative log likelihood as loss, the lower the better. params = [init_embeddings] + NN_para + LR_para cost = loss updates = Gradient_Cost_Para(cost, params, learning_rate) train_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], [layer_LR.errors(labels), layer_LR.y_pred, layer_LR.prop_for_posi], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless

    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    if n_test_remain != 0:
        test_batch_start = list(
            np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    else:
        test_batch_start = list(np.arange(n_test_batches) * batch_size)

    # max_acc_dev=0.0
    max_ap_test = 0.0
    max_ap_topk_test = 0.0
    max_f1 = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(
            train_indices
        )  # shuffle the training set for each new epoch; supposed to help performance, but not guaranteed
        iter_accu = 0
        for batch_id in train_batch_start:  # for each batch
            # iter means how many batches have been run, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch],
                                  train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch],
                                  train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            # every 100 batches, test the performance of the model on all test data
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_labels = []
                probs = []
                gold_labels = []
                error_sum = 0.0
                for idd, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    error_i, pred_i, prob_i = test_model(
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id + batch_size])
                    error_sum += error_i
                    pred_labels += list(pred_i)
                    probs += list(prob_i)
                if n_test_remain != 0:
                    probs = probs[:(len(test_batch_start) - 1) * batch_size] + probs[-n_test_remain:]
                assert len(test_labels) == len(probs)
                # test_acc=1.0-error_sum/(len(test_batch_start))
                test_ap = apk(test_labels, probs, k=len(test_labels))
                test_ap_top100 = apk(test_labels, probs, k=100)
                if test_ap > max_ap_test:
                    max_ap_test = test_ap
                if test_ap_top100 > max_ap_topk_test:
                    max_ap_topk_test = test_ap_top100
                print '\t\tcurrent ap:', test_ap, ' ; ', '\t\tmax_ap: ', max_ap_test, 'ap@100: ', test_ap_top100, '\tmax_ap@100:', max_ap_topk_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()
        # print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_ap_test
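# The apk() scorer used above is imported elsewhere; the sketch below is only an
# assumed, common definition of average precision at k for binary relevance
# labels ranked by predicted probability, so the numbers printed during training
# can be reproduced in isolation.
import numpy as np

def average_precision_at_k(labels, scores, k):
    order = np.argsort(scores)[::-1][:k]     # rank examples by score, keep top k
    hits, precisions = 0, []
    for rank, idx in enumerate(order):
        if labels[idx] == 1:
            hits += 1
            precisions.append(hits / float(rank + 1))   # precision at this hit
    return np.mean(precisions) if precisions else 0.0

print average_precision_at_k([1, 0, 1, 0], [0.9, 0.8, 0.7, 0.1], k=3)   # 0.8333...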
def evaluate_lenet5(learning_rate=0.01, n_epochs=4, emb_size=300, batch_size=10, describ_max_len=20, type_size=12, filter_size=[3, 5], maxSentLen=100, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/2018-il9-il10/multi-emb/' test_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered_w2.txt' output_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9_system_output_noMT_epoch4.json' seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) word2id = {} # all_sentences, all_masks, all_labels, all_other_labels, word2id=load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen) #minlen, include one label, at least one word in the sentence train_p1_sents, train_p1_masks, train_p1_labels, word2id = load_trainingData_types( word2id, maxSentLen) train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = load_trainingData_types_plus_others( word2id, maxSentLen) test_sents, test_masks, test_labels, word2id = load_il9_NI_test( word2id, maxSentLen) label_sent, label_mask = load_SF_type_descriptions(word2id, type_size, describ_max_len) label_sent = np.asarray(label_sent, dtype='int32') label_mask = np.asarray(label_mask, dtype=theano.config.floatX) train_p1_sents = np.asarray(train_p1_sents, dtype='int32') train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX) train_p1_labels = np.asarray(train_p1_labels, dtype='int32') train_p1_size = len(train_p1_labels) train_p2_sents = np.asarray(train_p2_sents, dtype='int32') train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX) train_p2_labels = np.asarray(train_p2_labels, dtype='int32') train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32') train_p2_size = len(train_p2_labels) ''' combine train_p1 and train_p2 ''' train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0) train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0) train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0) train_size = train_p1_size + train_p2_size test_sents = np.asarray(test_sents, dtype='int32') test_masks = np.asarray(test_masks, dtype=theano.config.floatX) test_labels = np.asarray(test_labels, dtype='int32') test_size = len(test_sents) vocab_size = len(word2id) + 1 # add one zero pad index rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_fasttext_multiple_word2vec_given_file([ emb_root + '100k-ENG-multicca.300.ENG.txt', emb_root + '100k-SWA-multicca.d300.SWA.txt', emb_root + '100k-IL9-multicca.d300.IL9.txt' ], 300) rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_id_matrix = T.imatrix('sents_id_matrix') sents_mask = T.fmatrix('sents_mask') labels = T.imatrix('labels') #batch*12 other_labels = T.imatrix() #batch*4 des_id_matrix = T.imatrix() des_mask = T.fmatrix() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' common_input = embeddings[sents_id_matrix.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle( 0, 2, 1) #the input format can be adapted into CNN or GRU or LSTM bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2) repeat_common_input = T.repeat( normalize_tensor3_colwise(common_input), type_size, axis=0) #(batch_size*type_size, emb_size, maxsentlen) des_input = embeddings[des_id_matrix.flatten()].reshape( (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1) bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1), axis=2) #(tyope_size, emb_size) repeat_des_input = T.tile( normalize_tensor3_colwise(des_input), (batch_size, 1, 1)) #(batch_size*type_size, emb_size, maxsentlen) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W2, conv_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2] conv_att_W, conv_att_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) conv_att_W2, conv_att_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) conv_W_context2, conv_b_context2 = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) ACNN_para = [ conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2, conv_W_context2 ] ''' multi-CNN ''' conv_model = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings = conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size conv_model2 = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2 ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings2 = conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size ''' GRU ''' U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0]) GRU_NN_para = [ U1, W1, b1 ] #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias # gru_input = common_input.dimshuffle((0,2,1)) #gru requires input (batch_size, emb_size, maxSentLen) gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask, hidden_size[0], U1, W1, b1) gru_sent_embeddings = gru_layer.output_sent_rep # (batch_size, hidden_size) ''' ACNN ''' attentive_conv_layer = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W, b=conv_att_b, W_context=conv_W_context, b_context=conv_b_context) sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l attentive_conv_layer2 = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, 
input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W2, b=conv_att_b2, W_context=conv_W_context2, b_context=conv_b_context2) sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l ''' cross-DNN-dataless ''' #first map label emb into hidden space HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para( rng, emb_size, hidden_size[0]) HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b] HL_layer_1 = HiddenLayer(rng, input=bow_des, n_in=emb_size, n_out=hidden_size[0], W=HL_layer_1_W, b=HL_layer_1_b, activation=T.tanh) des_rep_hidden = HL_layer_1.output #(type_size, hidden_size) dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot( des_rep_hidden.T)) #(batch_size, type_size) dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T)) ''' dataless cosine ''' cosine_scores = normalize_matrix_rowwise(bow_emb).dot( normalize_matrix_rowwise(bow_des).T) cosine_score_matrix = T.nnet.sigmoid( cosine_scores) #(batch_size, type_size) ''' dataless top-30 fine grained cosine ''' fine_grained_cosine = T.batched_dot( repeat_common_input.dimshuffle(0, 2, 1), repeat_des_input) #(batch_size*type_size,maxsentlen,describ_max_len) fine_grained_cosine_to_matrix = fine_grained_cosine.reshape( (batch_size * type_size, maxSentLen * describ_max_len)) sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix, axis=1) top_k_simi = sort_fine_grained_cosine_to_matrix[:, -30:] # (batch_size*type_size, 5) max_fine_grained_cosine = T.mean(top_k_simi, axis=1) top_k_cosine_scores = max_fine_grained_cosine.reshape( (batch_size, type_size)) top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores) acnn_LR_input = T.concatenate([ dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix, top_k_score_matrix, sent_embeddings, sent_embeddings2, gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2, bow_emb ], axis=1) acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12) acnn_LR_para = [acnn_U_a, acnn_LR_b] acnn_layer_LR = LogisticRegression( rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=12, W=acnn_U_a, b=acnn_LR_b ) #basically it is a multiplication between weight matrix and input feature vector acnn_score_matrix = T.nnet.sigmoid( acnn_layer_LR.before_softmax) #batch * 12 acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix, acnn_score_matrix) acnn_loss = -T.mean(T.log(acnn_prob_pos)) acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size, 16) acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b] acnn_other_layer_LR = LogisticRegression(rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=16, W=acnn_other_U_a, b=acnn_other_LR_b) acnn_other_prob_matrix = T.nnet.softmax( acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4))) acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape( (batch_size, 4, 4)) acnn_other_prob = acnn_other_prob_tensor3[ T.repeat(T.arange(batch_size), 4), T.tile(T.arange(4), (batch_size)), other_labels.flatten()] acnn_other_field_loss = -T.mean(T.log(acnn_other_prob)) params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + 
HL_layer_1_params # put all model parameters together cost = acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum() + (conv_att_W**2).sum() + (conv_att_W2**2).sum()) updates = Gradient_Cost_Para(cost, params, learning_rate) other_paras = params + acnn_other_LR_para cost_other = cost + acnn_other_field_loss other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate) ''' testing ''' ensemble_NN_scores = acnn_score_matrix #T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0) # ''' # majority voting, does not work # ''' # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0) # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0) # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0) # binarize_conc = T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0) # sum_binarize_conc = T.sum(binarize_conc,axis=0) # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0) # ''' # sum up prob, works # ''' # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0) ''' sum up prob, works ''' ensemble_scores = ensemble_NN_scores #0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix) binarize_prob = T.where(ensemble_scores > 0.3, 1, 0) ''' test for other fields ''' sum_tensor3 = acnn_other_prob_tensor3 #(batch, 4, 3) #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore') train_p1_model = theano.function( [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') train_p2_model = theano.function([ sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask, other_labels ], cost_other, updates=other_updates, allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_id_matrix, sents_mask, des_id_matrix, des_mask], binarize_prob, allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_train_p2_batches = train_p2_size / batch_size train_p2_batch_start = list(np.arange(n_train_p2_batches) * batch_size) + [train_p2_size - batch_size] n_test_batches = test_size / batch_size n_test_remain = test_size % batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] train_p2_batch_start_set = set(train_p2_batch_start) # max_acc_dev=0.0 max_meanf1_test = 0.0 max_weightf1_test = 0.0 train_indices = range(train_size) train_p2_indices = range(train_p2_size) cost_i = 0.0 other_cost_i = 0.0 min_mean_frame = 100.0 while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle(train_indices) random.Random(100).shuffle(train_p2_indices) iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_p1_model(train_sents[train_id_batch], train_masks[train_id_batch], train_labels[train_id_batch], label_sent, label_mask) if batch_id in train_p2_batch_start_set: train_p2_id_batch = train_p2_indices[batch_id:batch_id + batch_size] other_cost_i += train_p2_model( train_p2_sents[train_p2_id_batch], train_p2_masks[train_p2_id_batch], train_p2_labels[train_p2_id_batch], label_sent, label_mask, train_p2_other_labels[train_p2_id_batch]) # else: # random_batch_id = random.choice(train_p2_batch_start) # train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size] # other_cost_i+=train_p2_model( # train_p2_sents[train_p2_id_batch], # train_p2_masks[train_p2_id_batch], # train_p2_labels[train_p2_id_batch], # label_sent, # label_mask, # train_p2_other_labels[train_p2_id_batch] # ) #after each 1000 batches, we test the performance of the model on all test data if iter % 20 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), str( other_cost_i / iter), 'uses ', (time.time() - past_time) / 60.0, 'min' past_time = time.time() error_sum = 0.0 all_pred_labels = [] all_gold_labels = [] for test_batch_id in test_batch_start: # for each test batch pred_labels = test_model( test_sents[test_batch_id:test_batch_id + batch_size], test_masks[test_batch_id:test_batch_id + batch_size], label_sent, label_mask) gold_labels = test_labels[test_batch_id:test_batch_id + batch_size] # print 'pred_labels:', pred_labels # print 'gold_labels;', gold_labels all_pred_labels.append(pred_labels) all_gold_labels.append(gold_labels) all_pred_labels = np.concatenate(all_pred_labels) all_gold_labels = np.concatenate(all_gold_labels) test_mean_f1, test_weight_f1 = average_f1_two_array_by_col( all_pred_labels, all_gold_labels) if test_weight_f1 > max_weightf1_test: max_weightf1_test = test_weight_f1 if test_mean_f1 > max_meanf1_test: max_meanf1_test = test_mean_f1 print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + 
os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
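# average_f1_two_array_by_col() above is defined elsewhere; this is an assumed
# sketch of a per-type F1 averaged over the columns of the multi-hot prediction
# and gold matrices, which is what the mean/weighted F1 numbers printed above track.
import numpy as np

def mean_f1_by_column(pred, gold):
    f1s = []
    for col in range(gold.shape[1]):
        tp = float(np.sum((pred[:, col] == 1) & (gold[:, col] == 1)))
        fp = float(np.sum((pred[:, col] == 1) & (gold[:, col] == 0)))
        fn = float(np.sum((pred[:, col] == 0) & (gold[:, col] == 1)))
        prec = tp / (tp + fp) if tp + fp > 0 else 0.0
        rec = tp / (tp + fn) if tp + fn > 0 else 0.0
        f1s.append(2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0)
    return np.mean(f1s)

pred = np.array([[1, 0], [1, 1]])
gold = np.array([[1, 0], [0, 1]])
print mean_f1_by_column(pred, gold)   # per-type F1 averaged over the two types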
def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset='mnist.pkl.gz', nkerns=[20, 50], batch_size=500): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(930508) datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size, 28 * 28) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (28, 28) is the size of MNIST images. # Parameterizing n_feature = train_set_x.get_value().shape[1] matrix_dim = numpy.sqrt(n_feature) matrix_dim = matrix_dim.astype('int8') layer0_input = x.reshape((batch_size, 1, matrix_dim, matrix_dim)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) layer0 = LeNetConvPoolLayer( rng, input=layer0_input, image_shape = (batch_size, 1, matrix_dim, matrix_dim), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2) ) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (nkerns[0], nkerns[1], 4, 4) temp1 = (matrix_dim-5+1)/2 layer1 = LeNetConvPoolLayer( rng, input=layer0.output, image_shape = (batch_size, nkerns[0],temp1,temp1), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2) ) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), # or (500, 50 * 4 * 4) = (500, 800) with the default values. 
layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer temp2 = (temp1-5+1)/2 layer2 = HiddenLayer( rng, input=layer2_input, n_in=nkerns[1] * temp2 * temp2, n_out=500, activation=T.tanh ) # classify the values of the fully-connected sigmoidal layer n_out = max(train_set_y.eval()) - min(train_set_y.eval()) + 1 # print n_out layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=n_out) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size] } ) validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size: (index + 1) * batch_size] } ) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads) ] train_model = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size], y: train_set_y[index * batch_size: (index + 1) * batch_size] } ) # end-snippet-1 ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index # if iter % 10 == 0: # print 'training @ iter = ', iter cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) '''print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.))''' # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i) for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) '''print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.))''' print test_score * 100. if patience <= iter: done_looping = True break end_time = time.clock() '''print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.))a''' print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
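# The temp1/temp2 arithmetic above in one place: each LeNetConvPoolLayer does a
# 'valid' 5x5 convolution followed by non-overlapping 2x2 max-pooling, so the
# spatial dimension shrinks as (d - 5 + 1) / 2 per layer. For 28x28 MNIST inputs
# this yields the 12x12 and 4x4 feature maps the layer shapes above rely on.
def conv_pool_dim(d, filter_dim=5, pool=2):
    return (d - filter_dim + 1) // pool

d = 28
for layer in range(2):
    d = conv_pool_dim(d)
    print 'feature map after conv-pool layer %d: %dx%d' % (layer, d, d)
# -> 12x12, then 4x4, matching n_in = nkerns[1] * temp2 * temp2 in the hidden layer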
class DBN(object): """Deep Belief Network A deep belief network is obtained by stacking several RBMs on top of each other. The hidden layer of the RBM at layer `i` becomes the input of the RBM at layer `i+1`. The first layer RBM gets as input the input of the network, and the hidden layer of the last RBM represents the output. When used for classification, the DBN is treated as a MLP, by adding a logistic regression layer on top. """ def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type hidden_layers_sizes: list of ints :param hidden_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector # of [int] labels # end-snippet-1 # The DBN is an MLP, for which all weights of intermediate # layers are shared with a different RBM. We will first # construct the DBN as a deep multilayer perceptron, and when # constructing each sigmoidal layer we also construct an RBM # that shares weights with that layer. During pretraining we # will train these RBMs (which will lead to chainging the # weights of the MLP as well) During finetuning we will finish # training the DBN by doing stochastic gradient descent on the # MLP. for i in range(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden # units of the layer below or the input size if we are on # the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the # hidden layer below or the input of the DBN if you are on # the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... but we are # going to only declare that the parameters of the # sigmoid_layers are parameters of the DBN. The visible # biases in the RBM are parameters of those RBMs, but not # of the DBN. 
self.params.extend(sigmoid_layer.params) # Construct an RBM that shared weights with this layer rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) # compute the cost for second phase of training, defined as the # negative log likelihood of the logistic regression (output) layer self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y) def pretraining_functions(self, train_set_x, batch_size, k): '''Generates a list of functions, for performing one step of gradient descent at a given layer. The function will require as input the minibatch index, and to train an RBM you just need to iterate, calling the corresponding function on all minibatch indexes. :type train_set_x: theano.tensor.TensorType :param train_set_x: Shared var. that contains all datapoints used for training the RBM :type batch_size: int :param batch_size: size of a [mini]batch :param k: number of Gibbs steps to do in CD-k / PCD-k ''' # index to a [mini]batch index = T.lscalar('index') # index to a minibatch learning_rate = T.scalar('lr') # learning rate to use # number of batches n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size # begining of a batch, given `index` batch_begin = index * batch_size # ending of a batch given `index` batch_end = batch_begin + batch_size pretrain_fns = [] for rbm in self.rbm_layers: # get the cost and the updates list # using CD-k here (persisent=None) for training each RBM. 
# TODO: change cost function to reconstruction error cost, updates = rbm.get_cost_updates(learning_rate, persistent=None, k=k) # compile the theano function fn = theano.function( inputs=[index, theano.Param(learning_rate, default=0.1)], outputs=cost, updates=updates, givens={ self.x: train_set_x[batch_begin:batch_end] } ) # append `fn` to the list of functions pretrain_fns.append(fn) return pretrain_fns def build_finetune_functions(self, datasets, batch_size, learning_rate): '''Generates a function `train` that implements one step of finetuning, a function `validate` that computes the error on a batch from the validation set, and a function `test` that computes the error on a batch from the testing set :type datasets: list of pairs of theano.tensor.TensorType :param datasets: It is a list that contain all the datasets; the has to contain three pairs, `train`, `valid`, `test` in this order, where each pair is formed of two Theano variables, one for the datapoints, the other for the labels :type batch_size: int :param batch_size: size of a minibatch :type learning_rate: float :param learning_rate: learning rate used during finetune stage ''' (train_set_x, train_set_y) = datasets[0] (valid_set_x, valid_set_y) = datasets[1] (test_set_x, test_set_y) = datasets[2] # compute number of minibatches for training, validation and testing n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_valid_batches /= batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_test_batches /= batch_size index = T.lscalar('index') # index to a [mini]batch # compute the gradients with respect to the model parameters gparams = T.grad(self.finetune_cost, self.params) # compute list of fine-tuning updates updates = [] for param, gparam in zip(self.params, gparams): updates.append((param, param - gparam * learning_rate)) train_fn = theano.function( inputs=[index], outputs=self.finetune_cost, updates=updates, givens={ self.x: train_set_x[ index * batch_size: (index + 1) * batch_size ], self.y: train_set_y[ index * batch_size: (index + 1) * batch_size ] } ) test_score_i = theano.function( [index], self.errors, givens={ self.x: test_set_x[ index * batch_size: (index + 1) * batch_size ], self.y: test_set_y[ index * batch_size: (index + 1) * batch_size ] } ) valid_score_i = theano.function( [index], self.errors, givens={ self.x: valid_set_x[ index * batch_size: (index + 1) * batch_size ], self.y: valid_set_y[ index * batch_size: (index + 1) * batch_size ] } ) # Create a function that scans the entire validation set def valid_score(): return [valid_score_i(i) for i in range(n_valid_batches)] # Create a function that scans the entire test set def test_score(): return [test_score_i(i) for i in range(n_test_batches)] return train_fn, valid_score, test_score
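# A minimal driver sketch for the DBN class above, assuming the load_data helper
# and the RBM / HiddenLayer / LogisticRegression classes used in this file are
# importable; the dataset path and hyperparameters here are illustrative only.
import numpy

numpy_rng = numpy.random.RandomState(123)
dbn = DBN(numpy_rng=numpy_rng, n_ins=28 * 28,
          hidden_layers_sizes=[500, 500], n_outs=10)

datasets = load_data('mnist.pkl.gz')
train_set_x, train_set_y = datasets[0]

# layer-wise CD-1 pretraining: one step on minibatch 0 of each RBM
pretrain_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                         batch_size=10, k=1)
for fn in pretrain_fns:
    print fn(0, lr=0.01)

# supervised fine-tuning functions over the same splits
train_fn, valid_score, test_score = dbn.build_finetune_functions(
    datasets=datasets, batch_size=10, learning_rate=0.1)
print numpy.mean(valid_score())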
def evaluate_lenet5(datasets, imgh, imgw, nclass, learning_rate=0.01, d=0.0003, n_epochs=500, nkerns=[20, 50], batch_size=500): """ Demonstrates lenet on MNIST dataset :rtype : object :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nk+++++++++++++++++++++++++++++++++erns: number of kernels on each layer """ rng = numpy.random.RandomState(23455) train_set_x, train_set_y = datasets[0] test_set_x, test_set_y = datasets[1] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_test_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 # x = T.matrix('x') # the data is presented as rasterized images x = T.tensor4('x') y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size, 28 * 28) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (28, 28) is the size of MNIST images. # layer0_input = x.reshape((batch_size, 3, 60, 40)) layer0_input = x.reshape((batch_size, 3, imgh, imgw)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (60-5+1 , 40-5+1) = (56, 36) # maxpooling reduces this further to (56/2, 36/2) = (28, 18) # 4D output tensor is thus of shape (batch_size, nkerns[0], 28, 18) # image_shape=(batch_size, 3, 60, 40), layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 3, imgh, imgw), filter_shape=(nkerns[0], 3, 5, 5), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (28-5+1, 18-5+1) = (24, 14) # maxpooling reduces this further to (24/2, 14/2) = (12, 7) # 4D output tensor is thus of shape (nkerns[0], nkerns[1], 12, 7) # image_shape=(batch_size, nkerns[0], 28, 18), lh1 = (imgh - 5 + 1) / 2 lw1 = (imgw - 5 + 1) / 2 layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], lh1, lw1), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[1] * 12 * 7), # or (500, 50 * 12 * 7) = (500, 3360) with the default values. 
lh2 = (lh1 - 5 + 1) / 2 lw2 = (lw1 - 5 + 1) / 2 layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * lh2 * lw2, n_out=500, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=nclass) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model # the following code is modified to suit with the small test set size test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params # theano expression to decay the learning rate across epoch current_rate = theano.tensor.fscalar('current_rate') # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. updates = [(param_i, param_i - current_rate * grad_i) for param_i, grad_i in zip(params, grads)] train_model = theano.function( [index, current_rate], cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) # end-snippet-1 ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50 # look at least at this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant test_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_test_loss = numpy.inf learning_rate = numpy.float32(learning_rate) best_iter = 0 start_time = time.clock() epoch = 0 done_looping = False test_error = [] while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 learning_rate = learning_rate / (1 + d * (epoch - 1)) print "learning rate is %f" % learning_rate for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print 'training @ iter = ', iter cost_ij = train_model(minibatch_index, numpy.float32(learning_rate)) if (iter + 1) % test_frequency == 0: # compute zero-one loss on validation set test_losses = [test_model(i) for i in xrange(n_test_batches)] this_test_loss = numpy.mean(test_losses) test_error.append(this_test_loss) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_test_loss * 100.)) # if we got the best test score until now if this_test_loss < best_test_loss: #improve patience if loss improvement is good enough if this_test_loss < best_test_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_test_loss = this_test_loss best_iter = iter if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print( 'Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_test_loss * 100., best_iter + 1, best_test_loss * 100.)) print 'The code ran for %.2fm' % ((end_time - start_time) / 60.) return params, test_error
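# The learning-rate schedule above in isolation: the rate is divided by
# (1 + d * (epoch - 1)) at the start of every epoch, and because the division is
# applied to the already-decayed value, the decay compounds across epochs.
lr, d = 0.01, 0.0003
for epoch in range(1, 6):
    lr = lr / (1 + d * (epoch - 1))
    print 'epoch %d: learning rate %.8f' % (epoch, lr)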
class SdA(object):
    """Stacked denoising autoencoder (SdA) class.

    A stacked autoencoder model is obtained by stacking several dAs. The hidden
    layer of the dA at layer i becomes the input of the dA at layer i+1. The
    first dA takes the SdA's input as its input, and the output of the last dA
    is the SdA's output. After pretraining, the SdA is run like an ordinary
    MLP; the dAs are only used to initialize the weights.
    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 corruption_levels=[0.1, 0.1]):
        """This class supports a variable number of layers.

        numpy_rng: numpy.random.RandomState
            random number generator used to initialize the weights
        theano_rng: theano.tensor.shared_randomstreams.RandomStreams
            Theano random generator; if None, one is seeded from 'rng'
        n_ins: int
            dimension of the input to the SdA
        hidden_layers_sizes: list of ints
            sizes of the intermediate layers, at least one element
        n_outs: int
            dimension of the output of the network
        corruption_levels: list of float
            corruption level for each layer
        """
        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0  # require at least one hidden layer

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')    # rasterized image data
        self.y = T.ivector('y')   # 1D vector of [int] labels

        # The SdA is an MLP whose intermediate layers share their weights with
        # denoising autoencoders. We first build the SdA as a deep MLP; while
        # constructing each sigmoid layer we also construct a denoising
        # autoencoder that shares weights with it. Pretraining trains these
        # autoencoders (which also changes the MLP weights); fine-tuning then
        # finishes training the SdA by stochastic gradient descent on the MLP.

        # build the sigmoid layers
        for i in xrange(self.n_layers):
            # the size of the input is either the number of hidden units of the
            # layer below, or the input size for the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below, or the input of the SdA for the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            # define the sigmoid layer
            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            # add the sigmoid layer to the list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # It is arguably a philosophical question, but we declare only the
            # parameters of the sigmoid_layers to be parameters of the SdA; the
            # visible biases of the dAs belong to the dAs, not to the SdA.
            self.params.extend(sigmoid_layer.params)

            # construct a denoising autoencoder that shares weights with this layer
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)
            self.dA_layers.append(dA_layer)

        # add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1], n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # cost for the second phase of training: the negative log likelihood
        # of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # symbolic variable for the number of errors on the minibatch given by
        # self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def pretraining_function(self, train_set_x, batch_size):
        '''Generate a list of functions, one per layer, each performing one
        step of dA training; return the list of pretraining functions.
        Each function takes a minibatch index as input, so the same training
        step can be run on every minibatch.

        train_set_x: theano.tensor.TensorType
            shared variable holding the datapoints used to train the dAs
        batch_size: int
            size of a [mini]batch
        '''
        # index to a [mini]batch
        index = T.lscalar('index')
        corruption_level = T.scalar('corruption')  # percentage of corruption
        learning_rate = T.scalar('lr')             # learning rate
        # number of batches
        n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # end of a batch, given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for dA in self.dA_layers:  # iterate over the dAs
            # build the cost and the updates list
            cost, updates = dA.get_cost_updates(corruption_level,
                                                learning_rate)
            # compile the theano function
            fn = theano.function(
                inputs=[index,
                        theano.Param(corruption_level, default=0.2),
                        theano.Param(learning_rate, default=0.1)],
                outputs=cost,
                updates=updates,
                givens={self.x: train_set_x[batch_begin:batch_end]})
            # append `fn` to the list of functions
            pretrain_fns.append(fn)
        return pretrain_fns

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Create a "train" function that performs one step of fine-tuning, a
        "validate" function that computes the error on a batch from the
        validation set, and a "test" function that computes the error on a
        batch from the test set.

        :param datasets: list of pairs of theano.tensor.TensorType
            list containing all datasets, three pairs in the order 'train',
            'valid', 'test'; each pair holds two Theano variables, one for the
            datapoints and one for the labels
        :param batch_size: int
            size of a minibatch
        :param learning_rate: float
            learning rate used during the fine-tuning stage
        :return:
        '''
        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute the number of minibatches for training, validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches /= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches /= batch_size

        index = T.lscalar('index')  # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute the list of fine-tuning updates
        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - gparam * learning_rate))

        train_fn = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x: train_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: train_set_y[index * batch_size:(index + 1) * batch_size]},
            name='train')

        test_score_i = theano.function(
            [index], self.errors,
            givens={
                self.x: test_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: test_set_y[index * batch_size:(index + 1) * batch_size]},
            name='test')

        valid_score_i = theano.function(
            [index], self.errors,
            givens={
                self.x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: valid_set_y[index * batch_size:(index + 1) * batch_size]},
            name='valid')

        # create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        # create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_fn, valid_score, test_score
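# The corruption the dA layers above are trained against, shown with plain
# numpy: zero out a random fraction of the inputs before reconstruction. This
# is a sketch of the idea only; the actual masking is done inside dA with the
# Theano RandomStreams object.
import numpy

def corrupt(x, corruption_level, rng=numpy.random.RandomState(0)):
    keep = rng.binomial(n=1, p=1.0 - corruption_level, size=x.shape)
    return x * keep

x = numpy.ones((2, 8))
print corrupt(x, 0.3)   # roughly 30% of the entries are set to zero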
def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset=DataSet, nkerns=[cls1, cls2], batch_size=100): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(23455) datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] print(type(train_set_x)) #train_set_x.set_value(train_set_x.get_value(borrow=True)[:,:540]) #valid_set_x.set_value(valid_set_x.get_value(borrow=True)[:,:540]) #test_set_x.set_value(test_set_x.get_value(borrow=True)[:,:540]) #train_set_x = train_set_x / 100 #valid_set_x = valid_set_x / 100 #test_set_x = test_set_x / 100 # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size #n_test_batches = (n_test_batches/batch_size) + (n_test_batches % batch_size > 0) print(n_test_batches) # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch Alr = T.scalar('Alr') x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ishape = (nFB, nFs) # this is the size of MNIST images ###################### # BUILD ACTUAL MODEL # ###################### print('... building the model') # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer dFeatureV = iFMs * nFB * nFs xinp = x[:, :dFeatureV] # print (x.shahpe) layer0_input = xinp.reshape((batch_size, iFMs, nFB, nFs)) layer1H_input = x[:, dFeatureV:] # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, iFMs, nFB, nFs), filter_shape=(nkerns[0], iFMs, fsx, fsy), poolsize=(p, p)) cl2x = (nFB - fsx + 1) / p cl2y = (nFs - fsy + 1) / p layer1H = HiddenLayer(rng, input=layer1H_input, n_in=27, n_out=nhu1 / 4, activation=T.tanh) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) #layer1 = LeNetConvPoolLayer(rng, input=layer0.output, # image_shape=(batch_size, nkerns[0], cl2x, cl2y), # filter_shape=(nkerns[1], nkerns[0], fsx, 1), poolsize=(p2, 1)) #hl1 = (cl2x - fsx + 1)/p2 hl1 = cl2x * cl2y # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
# This will generate a matrix of shape (20,32*4*4) = (20,512) layer2_input = layer0.output.flatten(2) #layer2_inputT = T.concatenate([layer2_input,x[:,dFeatureV:]],axis = 1) layer2_inputT = T.concatenate([layer2_input, layer1H.output], axis=1) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_inputT, n_in=(nkerns[0] * hl1 * 1) + nhu1 / 4, n_out=nhu1 * 2, activation=T.tanh) layer22 = HiddenLayer(rng, input=layer2.output, n_in=nhu1 * 2, n_out=nhu1, activation=T.tanh) layer23 = HiddenLayer(rng, input=layer22.output, n_in=nhu1, n_out=nhu1, activation=T.tanh) layer23 = HiddenLayer(rng, input=layer22.output, n_in=nhu1, n_out=nhu1, activation=T.tanh) layer24 = HiddenLayer(rng, input=layer23.output, n_in=nhu1, n_out=nhu1, activation=T.tanh) layer25 = HiddenLayer(rng, input=layer24.output, n_in=nhu1, n_out=nhu1, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer25.output, n_in=nhu1, n_out=n_out) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) #yPred = layer3.ypred(layer2.output) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], [layer3.errors(y), layer3.y_pred], givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # create a list of all model parameters to be fit by gradient descent #params = layer3.params + layer22.params + layer2.params + layer1.params + layer0.params params = layer3.params + layer25.params + layer24.params + layer23.params + layer22.params + layer2.params + layer1H.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i],grads[i]) pairs. updates = [] for param_i, grad_i in zip(params, grads): #updates.append((param_i, param_i - learning_rate * grad_i)) updates.append((param_i, param_i - Alr * grad_i)) train_model = theano.function( [index, Alr], cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size][:], y: train_set_y[index * batch_size:(index + 1) * batch_size][:] }) ############### # TRAIN MODEL # ############### print('... training') # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch #best_params = None best_params = [] best_validation_loss = numpy.inf prev_validation_loss = 200 best_iter = 0 test_score = 0. 
start_time = time.clock() Alrc = 0.1 AlrE = 0.00001 epochC = 0 epoch = 0 done_looping = False for param in params: best_params.append(param.get_value()) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 epochC = epochC + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print('training @ iter = ', iter) cost_ij = train_model(minibatch_index, Alrc) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in xrange(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) lossratio = (this_validation_loss - prev_validation_loss) / (prev_validation_loss + 1) print(lossratio) print('epoch %i, minibatch %i/%i, validation error %f, lr %f %%' % \ (epoch, minibatch_index + 1, n_train_batches, \ this_validation_loss * 100., Alrc)) # if we got the best validation score until now #if this_validation_loss < best_validation_loss: if lossratio <= 0.0: for i in range(len(params)): best_params[i] = params[i].get_value() #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss prev_validation_loss = this_validation_loss best_iter = iter # test it on the test set #tm = test_model(0) yP = numpy.asarray([]) test_losses = [ test_model(i)[0] for i in xrange(n_test_batches) ] for i in xrange(n_test_batches): yP = numpy.concatenate((yP, test_model(i)[1])) print(yP.shape) test_score = numpy.mean(test_losses) #yP = yPred#yPred(layer2.output.owner.inputs[0].get_value()) #y = test_set_y.owner.inputs[0].get_value()[:2300] y = yP print(yP.shape) print(y.shape) I1 = numpy.nonzero(y == 0.0) I2 = numpy.nonzero(y == 1.0) I3 = numpy.nonzero(y == 2.0) I4 = numpy.nonzero(y == 3.0) print(I1[0].shape) print(I2[0].shape) print(I3[0].shape) print(I4[0].shape) I11 = numpy.nonzero(yP[I1[0]] == 0) I12 = numpy.nonzero(yP[I1[0]] == 1) I13 = numpy.nonzero(yP[I1[0]] == 2) I14 = numpy.nonzero(yP[I1[0]] == 3) I21 = numpy.nonzero(yP[I2[0]] == 0) I22 = numpy.nonzero(yP[I2[0]] == 1) I23 = numpy.nonzero(yP[I2[0]] == 2) I24 = numpy.nonzero(yP[I2[0]] == 3) I31 = numpy.nonzero(yP[I3[0]] == 0) I32 = numpy.nonzero(yP[I3[0]] == 1) I33 = numpy.nonzero(yP[I3[0]] == 2) I34 = numpy.nonzero(yP[I3[0]] == 3) I41 = numpy.nonzero(yP[I4[0]] == 0) I42 = numpy.nonzero(yP[I4[0]] == 1) I43 = numpy.nonzero(yP[I4[0]] == 2) I44 = numpy.nonzero(yP[I4[0]] == 3) acc1 = 100 #float(float(I11[0].size)/float(I1[0].size)) acc2 = 100 #float(float(I22[0].size)/float(I2[0].size)) if n_out == 3: acc3 = 100 #float(float(I33[0].size)/float(I3[0].size)) acc4 = 0 elif n_out == 4: acc3 = float(float(I33[0].size) / float(I3[0].size)) acc4 = float(float(I44[0].size) / float(I4[0].size)) else: acc3 = 0 acc4 = 0 print(( ' epoch %i, minibatch %i/%i, test error of ' 'best model %f, acc1 = %f, acc2 = %f, acc3 = %f, acc4 = %f, I11 = %i, I12 = %i, I13 = %i, I14 = %i, I21 = %i, I22 = %i, I23 = %i, I24 = %i, I31 = %i, I32 = %i, I33 = %i, I34 = %i, I41 = %i, I42 = %i, I43 = %i, I44 = %i %%' ) % (epoch, minibatch_index + 1, n_train_batches, test_score * 100., acc1 * 100., acc2 * 100., acc3 * 100, acc4 * 100, I11[0].size, I12[0].size, I13[0].size, I14[0].size, I21[0].size, I22[0].size, I23[0].size, I24[0].size, I31[0].size, I32[0].size, I33[0].size, I34[0].size, I41[0].size, I42[0].size, I43[0].size, 
I44[0].size)) #print((' epoch %i, minibatch %i/%i, test error of best ' # 'model %f %%') % # (epoch, minibatch_index + 1, n_train_batches, # test_score * 100.)) else: if Alrc <= AlrE: done_looping = True break elif epochC > 40: Alrc = Alrc / 2 for param, best_param in zip(params, best_params): param.set_value(best_param) epochC = 0 #if patience <= iter: # done_looping = True # break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) #print >> sys.stderr, ('The code for file ' + # os.path.split(__file__)[1] + # ' ran for %.2fm' % ((end_time - start_time) / 60.)) OF = open(outFile, 'a') print(5, 5, 5, 5, 5, DataSet, n_out, fsx, fsy, p, cls1, cls2, nhu1, nFB, nFs, iFMs, nhus, batch_size, test_score * 100., acc1 * 100., acc2 * 100., acc3 * 100, acc4 * 100, I11[0].size, I12[0].size, I13[0].size, I14[0].size, I21[0].size, I22[0].size, I23[0].size, I24[0].size, I31[0].size, I32[0].size, I33[0].size, I34[0].size, I41[0].size, I42[0].size, I43[0].size, I44[0].size, file=OF) OF.close()
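# The loop above does not rely on the patience rule alone: it keeps an adaptive
# learning rate Alrc, and whenever validation fails to improve for more than 40
# epochs it restores the best parameters seen so far, halves Alrc, and finally
# stops once Alrc falls below AlrE. A minimal sketch of that schedule in
# isolation; the train_epoch/validate callables, parameter getters/setters and
# the default numbers below are placeholders, not taken from the original script.
def anneal_loop(train_epoch, validate, get_params, set_params,
                lr=0.1, lr_min=1e-5, max_epochs=200, stall_epochs=40):
    best_loss = float('inf')
    best_params = get_params()
    epochs_since_anneal = 0
    for epoch in range(max_epochs):
        train_epoch(lr)
        loss = validate()
        epochs_since_anneal += 1
        if loss <= best_loss:                  # improvement: remember the parameters
            best_loss = loss
            best_params = get_params()
        elif lr <= lr_min:                     # learning rate exhausted: stop
            break
        elif epochs_since_anneal > stall_epochs:
            lr = lr / 2.0                      # halve the rate, restart from best point
            set_params(best_params)
            epochs_since_anneal = 0
    return best_params, best_loss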
def perturb_bfgs(perturbation, params, shape, oldoutput, c=1, nkerns=[20, 50], batch_size=1): #print '... building the model' rng = numpy.random.RandomState(23455) x = T.tensor4() # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input = x.reshape((batch_size, 1, 28, 28)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (20,32*4*4) = (20,512) layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=3) f = theano.function(inputs=[x], outputs=[layer2.output, layer3.y_pred]) layer3.W.set_value(params[-1][0]) layer3.b.set_value(params[-1][1]) layer2.W.set_value(params[-1][2]) layer2.b.set_value(params[-1][3]) layer1.W.set_value(params[-1][4]) layer1.b.set_value(params[-1][5]) layer0.W.set_value(params[-1][6]) layer0.b.set_value(params[-1][7]) perturbed = shape oldoutputs = oldoutput distances = 0 perturblength = numpy.sqrt(numpy.sum(perturbation**2)) shapes = perturbed + perturbation outputs, labels = f(shapes.reshape(1, 1, 28, 28)) print labels for o in oldoutputs: distances += numpy.sqrt(numpy.sum((outputs - o)**2)) distances /= len(oldoutputs) return c * perturblength + distances
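# perturb_bfgs above scores a candidate perturbation as
#   c * ||perturbation||_2  +  mean over reference outputs o of ||layer2(shape + perturbation) - o||_2,
# trading off the size of the perturbation against how far the perturbed input's
# layer2 activations move from a set of recorded activations. The name suggests
# it is meant to be fed to a quasi-Newton optimizer; the driver below is only an
# assumption about how it might be called (it is not part of the original file).
# Note the function rebuilds and recompiles the Theano graph on every call and
# supplies no analytic gradient, so this is illustrative rather than practical.
import numpy
from scipy.optimize import minimize

# `params`, `shape` (a flattened 28x28 image) and `oldoutput` are assumed to be
# provided by the surrounding code that trained and saved the network.
x0 = numpy.zeros(28 * 28, dtype=numpy.float32)      # start from the zero perturbation
res = minimize(perturb_bfgs, x0,
               args=(params, shape, oldoutput),
               method='L-BFGS-B',
               options={'maxiter': 20})
print(res.fun)                                       # final objective value
perturbation = res.x.reshape(28, 28)                 # the optimized perturbation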
class DBN(object): """Deep Belief Network A deep belief network is obtained by stacking several RBMs on top of each other. The hidden layer of the RBM at layer `i` becomes the input of the RBM at layer `i+1`. The first layer RBM gets as input the input of the network, and the hidden layer of the last RBM represents the output. When used for classification, the DBN is treated as a MLP, by adding a logistic regression layer on top. """ def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type hidden_layers_sizes: list of ints :param hidden_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector # of [int] labels # end-snippet-1 # The DBN is an MLP, for which all weights of intermediate # layers are shared with a different RBM. We will first # construct the DBN as a deep multilayer perceptron, and when # constructing each sigmoidal layer we also construct an RBM # that shares weights with that layer. During pretraining we # will train these RBMs (which will lead to chainging the # weights of the MLP as well) During finetuning we will finish # training the DBN by doing stochastic gradient descent on the # MLP. for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden # units of the layer below or the input size if we are on # the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the # hidden layer below or the input of the DBN if you are on # the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... but we are # going to only declare that the parameters of the # sigmoid_layers are parameters of the DBN. The visible # biases in the RBM are parameters of those RBMs, but not # of the DBN. 
self.params.extend(sigmoid_layer.params) # Construct an RBM that shared weights with this layer rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) # compute the cost for second phase of training, defined as the # negative log likelihood of the logistic regression (output) layer self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y) def pretraining_functions(self, train_set_x, batch_size, k): '''Generates a list of functions, for performing one step of gradient descent at a given layer. The function will require as input the minibatch index, and to train an RBM you just need to iterate, calling the corresponding function on all minibatch indexes. :type train_set_x: theano.tensor.TensorType :param train_set_x: Shared var. that contains all datapoints used for training the RBM :type batch_size: int :param batch_size: size of a [mini]batch :param k: number of Gibbs steps to do in CD-k / PCD-k ''' # index to a [mini]batch index = T.lscalar('index') # index to a minibatch learning_rate = T.scalar('lr') # learning rate to use # number of batches n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size # begining of a batch, given `index` batch_begin = index * batch_size # ending of a batch given `index` batch_end = batch_begin + batch_size pretrain_fns = [] for rbm in self.rbm_layers: # get the cost and the updates list # using CD-k here (persisent=None) for training each RBM. 
# TODO: change cost function to reconstruction error cost, updates = rbm.get_cost_updates(learning_rate, persistent=None, k=k) # compile the theano function fn = theano.function( inputs=[index, theano.Param(learning_rate, default=0.1)], outputs=cost, updates=updates, givens={ self.x: train_set_x[batch_begin:batch_end] } ) # append `fn` to the list of functions pretrain_fns.append(fn) return pretrain_fns def build_finetune_functions(self, datasets, batch_size, learning_rate): '''Generates a function `train` that implements one step of finetuning, a function `validate` that computes the error on a batch from the validation set, and a function `test` that computes the error on a batch from the testing set :type datasets: list of pairs of theano.tensor.TensorType :param datasets: It is a list that contain all the datasets; the has to contain three pairs, `train`, `valid`, `test` in this order, where each pair is formed of two Theano variables, one for the datapoints, the other for the labels :type batch_size: int :param batch_size: size of a minibatch :type learning_rate: float :param learning_rate: learning rate used during finetune stage ''' (train_set_x, train_set_y) = datasets[0] (valid_set_x, valid_set_y) = datasets[1] (test_set_x, test_set_y) = datasets[2] # compute number of minibatches for training, validation and testing n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_valid_batches /= batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_test_batches /= batch_size index = T.lscalar('index') # index to a [mini]batch # compute the gradients with respect to the model parameters gparams = T.grad(self.finetune_cost, self.params) # compute list of fine-tuning updates updates = [] for param, gparam in zip(self.params, gparams): updates.append((param, param - gparam * learning_rate)) train_fn = theano.function( inputs=[index], outputs=self.finetune_cost, updates=updates, givens={ self.x: train_set_x[ index * batch_size: (index + 1) * batch_size ], self.y: train_set_y[ index * batch_size: (index + 1) * batch_size ] } ) test_score_i = theano.function( [index], self.errors, givens={ self.x: test_set_x[ index * batch_size: (index + 1) * batch_size ], self.y: test_set_y[ index * batch_size: (index + 1) * batch_size ] } ) valid_score_i = theano.function( [index], self.errors, givens={ self.x: valid_set_x[ index * batch_size: (index + 1) * batch_size ], self.y: valid_set_y[ index * batch_size: (index + 1) * batch_size ] } ) # Create a function that scans the entire validation set def valid_score(): return [valid_score_i(i) for i in xrange(n_valid_batches)] # Create a function that scans the entire test set def test_score(): return [test_score_i(i) for i in xrange(n_test_batches)] return train_fn, valid_score, test_score
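# A minimal pretraining / fine-tuning driver for the DBN class above. It assumes
# the `load_data` helper used by the other scripts in this file is importable and
# returns the usual three (x, y) pairs of shared variables; the hyper-parameters
# are placeholders, not values from the original code.
import numpy

def run_dbn(dataset='mnist.pkl.gz', batch_size=10, pretraining_epochs=1,
            pretrain_lr=0.01, finetune_lr=0.1):
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    numpy_rng = numpy.random.RandomState(123)
    dbn = DBN(numpy_rng=numpy_rng, n_ins=28 * 28,
              hidden_layers_sizes=[500, 500], n_outs=10)

    # greedy layer-wise pretraining with CD-1; each function trains one RBM
    pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                                batch_size=batch_size, k=1)
    for layer_fn in pretraining_fns:
        for epoch in range(pretraining_epochs):
            costs = [layer_fn(index=i, lr=pretrain_lr)
                     for i in range(n_train_batches)]
            print(numpy.mean(costs))

    # supervised fine-tuning of the whole stack
    train_fn, valid_score, test_score = dbn.build_finetune_functions(
        datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr)
    for i in range(n_train_batches):
        train_fn(i)
    print(numpy.mean(valid_score()))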
def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset='mnist.pkl.gz', nkerns=[20, 50], batch_size=50): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(23455) # datasets = input_data_LIDC.load_data( # "../data/LIDC/resized_images-236x236/LIDC-IDRI-0003/", # "../data/LIDC/resized_images/" # "*.tiff", # '/Users/estory/Documents/syncable/School/DePaul/research/LIDC_Complete_20141106/Extracts/master_join4.csv', # '/Users/estory/Documents/syncable/School/DePaul/research/LIDC_Complete_20141106/Extracts/DICOM_metadata_extracts/', # "imageSOP_UID-filePath-dicominfo-LIDC-IDRI-0003.csv") # "*.csv" # # img_px_len_x = 32 # 236 # # img_px_len_y = 32 # 236 # img_px_len_x = 236 # img_px_len_y = 236 # lidc_n_out_malignancy = 5 # pickle_file_name = "Evans-MacBook-Pro.local-resized_images-236x236-first30.theano.pickle" # if os.path.isfile(pickle_file_name): # input_data_LIDC.esprint("Unpickling: " + pickle_file_name) # with open(pickle_file_name, "rb") as pickle_file: # datasets = pickle.load(pickle_file) # else: # datasets = input_data_LIDC.load_data( # "../data/LIDC/resized_images-236x236-first30/", # "../data/LIDC/resized_images/" # "*.tiff", # '/Users/estory/Documents/syncable/School/DePaul/research/LIDC_Complete_20141106/Extracts/master_join4.csv', # '/Users/estory/Documents/syncable/School/DePaul/research/LIDC_Complete_20141106/Extracts/DICOM_metadata_extracts/', # "*.csv") # "*.csv" # input_data_LIDC.esprint("Pickling: " + pickle_file_name) # with open(pickle_file_name, "wb") as pickle_file: # pickle.dump(datasets, pickle_file) # img_px_len_x = 32 # 236 # img_px_len_y = 32 # 236 # # img_px_len_x = 236 # # img_px_len_y = 236 # lidc_n_out_malignancy = 5 pickle_file_name = "Evans-MacBook-Pro.local-resized_images-32x32.theano.pickle" if os.path.isfile(pickle_file_name): input_data_LIDC.esprint("Unpickling: " + pickle_file_name) with open(pickle_file_name, "rb") as pickle_file: datasets = pickle.load(pickle_file) else: datasets = input_data_LIDC.load_data( "../data/LIDC/resized_images-32x32/", # "../data/LIDC/resized_images/" "*.tiff", '/Users/estory/Documents/syncable/School/DePaul/research/LIDC_Complete_20141106/Extracts/master_join4.csv', '/Users/estory/Documents/syncable/School/DePaul/research/LIDC_Complete_20141106/Extracts/DICOM_metadata_extracts/', "*.csv") # "*.csv" input_data_LIDC.esprint("Pickling: " + pickle_file_name) with open(pickle_file_name, "wb") as pickle_file: pickle.dump(datasets, pickle_file) img_px_len_x = 32 # 236 img_px_len_y = 32 # 236 # img_px_len_x = 236 # img_px_len_y = 236 lidc_n_out_malignancy = 5 train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # train_set_y.flatten() # valid_set_y.flatten() # test_set_y.flatten() # input_data_LIDC.esprint("type: " + str(type(test_set_y))) # input_data_LIDC.esprint("test_set_x.shape: " + str(test_set_x.shape)) # input_data_LIDC.esprint("test_set_y.shape: " + str(test_set_y.shape)) # input_data_LIDC.esprint(theano.printing.Print('test_set_y')(test_set_y)) # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] 
n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size input_data_LIDC.esprint("n_train_batches = " + str(n_train_batches)) input_data_LIDC.esprint("n_valid_batches = " + str(n_valid_batches)) input_data_LIDC.esprint("n_test_batches = " + str(n_test_batches)) # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels theano.pp(x) theano.pp(y) # input_data_LIDC.esprint("Set lengths" + str([len(train_set_x), len(train_set_y), len(valid_set_x), len(valid_set_y), len(test_set_x), len(test_set_y)])) # input_data_LIDC.esprint("Set lengths: " + str(len(x) + ", " + str(len(y)))) ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size, 28 * 28) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (28, 28) is the size of MNIST images. layer0_input = x.reshape((batch_size, 1, img_px_len_x, img_px_len_y)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, img_px_len_x, img_px_len_y), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), # or (500, 50 * 4 * 4) = (500, 800) with the default values. 
layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=batch_size, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=batch_size, n_out=lidc_n_out_malignancy) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)] train_model = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) # end-snippet-1 ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = timeit.default_timer() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print 'training @ iter = ', iter cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in xrange(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i) for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = timeit.default_timer() print('Optimization complete.') print( 'Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
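# A small helper for checking the shape bookkeeping in the comments above: each
# LeNetConvPoolLayer shrinks a feature map to (size - filter_size + 1) and then
# divides by the pooling factor. For 28x28 MNIST images this gives 24 -> 12 and
# 8 -> 4, matching the hard-coded image_shape and n_in constants; for the 32x32
# LIDC images loaded above the same arithmetic gives 14 and then 5, so the
# 12x12 image_shape of layer1 and the nkerns[1] * 4 * 4 input size of layer2
# would need updating to stay consistent with the actual tensor shapes.
def conv_pool_out(size, filter_size=5, pool=2):
    return (size - filter_size + 1) // pool

s0 = conv_pool_out(32)    # 14: output side length after layer0 on a 32x32 image
s1 = conv_pool_out(s0)    # 5:  output side length after layer1
print(s0, s1)             # layer2 would then need n_in = nkerns[1] * s1 * s1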
def __init__(
        self,
        W_sda=[None, None],  # if previously saved parameters exist, load them
        b_sda=[None, None],
        nn_structure=[600, 500, 500, 10],  # input size, units per hidden layer, number of output classes
        dropout_rates=[0.5, 0.5],
        L1_reg=0, L2_reg=0,
        activation=T.nnet.sigmoid,
        non_static=False,
        wordvec=None
):
    #print activation
    self.activation = activation
    self.sigmoid_layers = []  # list of hidden layers
    self.dA_layers = []  # list of dA layers
    self.params = []  # sigmoid layers only; the dA layers are not included
    self.n_layers = len(nn_structure) - 2  # number of hidden layers

    # random number generators
    numpy_rng = numpy.random.RandomState(89677)
    theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    self.x = T.matrix('x')  # data
    self.y = T.ivector('y')  # labels
    #print wordvec
    self.Words = theano.shared(value=wordvec, name='Words')
    layer0_input = self.Words[T.cast(self.x.flatten(), dtype="int32")].\
        reshape((self.x.shape[0], self.x.shape[1] * self.Words.shape[1]))
    #n=layer0_input.shape[1] / numpy.shape(wordvec)[0]
    '''
    # sum the word vectors...
    avg_vec = numpy.zeros(numpy.shape(wordvec)[0],dtype=theano.config.floatX)
    for i in xrange(5):
        sub_vec=layer0_input[:,i*200:(i+1)*200]
        avg_vec=T.add(sub_vec,avg_vec)
    #avg_vec=avg_vec/5.
    layer0_input=avg_vec
    '''
    # the dA layers share their weights with the MLP layers
    for i in xrange(self.n_layers):
        if i == 0:
            input_size = nn_structure[0]  # input_size: number of input units of this hidden layer
        else:
            input_size = nn_structure[i]
        if i == 0:
            layer_input = layer0_input  # self.x  # the input data
        else:
            layer_input = self.sigmoid_layers[i - 1].output  # each layer's input is the previous layer's output

        # W and b are randomly initialised; this is a dropout layer
        sigmoid_layer = dpLayer(rng=numpy_rng,
                                input=layer_input,
                                n_in=input_size,
                                W=W_sda[i],
                                b=b_sda[i],
                                dropout_rate=dropout_rates[i],
                                n_out=nn_structure[i + 1],
                                activation=self.activation)
        # add the layer to our list of layers
        self.sigmoid_layers.append(sigmoid_layer)  # list of hidden layers
        self.params.extend(sigmoid_layer.params)  # the MLP parameters W, b

        # shares weights with the hidden layer
        dA_layer = dA(numpy_rng=numpy_rng,
                      theano_rng=theano_rng,
                      input=layer_input,
                      n_visible=input_size,
                      n_hidden=nn_structure[i + 1],
                      W=sigmoid_layer.W,  # share weights with sigmoid layer
                                          # W is a shared variable, pretrained by the dA
                      bhid=sigmoid_layer.b,
                      activation=self.activation)
        self.dA_layers.append(dA_layer)

    self.logLayer = LogisticRegression(
        input=self.sigmoid_layers[-1].output,
        n_in=nn_structure[-2],
        n_out=nn_structure[-1]
    )
    self.params.extend(self.logLayer.params)  # the logistic layer's W, b

    L1 = 0.
    L2 = 0.
    for layer in self.sigmoid_layers:
        L1 += abs(layer.W).sum()
        L2 += (layer.W ** 2).sum()
    self.L1 = (L1 + abs(self.logLayer.W).sum())
    self.L2_sqr = L2 + (self.logLayer.W ** 2).sum()

    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) \
        + L1_reg * self.L1 + L2_reg * self.L2_sqr

    if non_static:
        self.params.extend([self.Words])

    self.errors = self.logLayer.errors(self.y)
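# A tiny numpy illustration of the embedding lookup used for layer0_input above:
# each row of integer word ids is replaced by the concatenation of the
# corresponding word vectors. The toy vocabulary, sentences and sizes are made up.
import numpy

wordvec = numpy.arange(12, dtype='float32').reshape(6, 2)  # 6 words, emb_size = 2
x = numpy.array([[1, 3, 0],
                 [2, 2, 5]])                                # 2 sentences, 3 word ids each

looked_up = wordvec[x.flatten().astype('int32')]            # (6, 2): one row per word
layer0_input = looked_up.reshape(x.shape[0], x.shape[1] * wordvec.shape[1])
print(layer0_input.shape)                                   # (2, 6): 3 words * emb_size 2 per sentence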
class DeepAutoencoder(object): def __init__(self, numpy_rng, theano_rng = None, n_ins = 784, hidden_layers_sizes = [50,60], n_outs = 10, reconstruction_cost = 'cross_entropy',\ supervised_training = 'russ', tied_weights = False): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type n_layers_sizes: list of ints :param n_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.sigmoid_layers = [] self.reconstruction_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) self.tied_weights = tied_weights assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2**30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels # The DBN is an MLP, for which all weights of intermediate layers are shared with a # different RBM. We will first construct the DBN as a deep multilayer perceptron, and # when constructing each sigmoidal layer we also construct an RBM that shares weights # with that layer. During pretraining we will train these RBMs (which will lead # to chainging the weights of the MLP as well) During finetuning we will finish # training the DBN by doing stochastic gradient descent on the MLP. for i in xrange( self.n_layers ): # construct the sigmoidal layer # the size of the input is either the number of hidden units of the layer below or # the input size if we are on the first layer if i == 0 : input_size = n_ins else: input_size = hidden_layers_sizes[i-1] # the input to this layer is either the activation of the hidden layer below or the # input of the DBN if you are on the first layer if i == 0 : layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output # TODO(dumitru): this is temporary, get rid of this when done if i != self.n_layers-1: activation = T.nnet.sigmoid else: activation = None sigmoid_layer = HiddenLayer(rng = numpy_rng, input = layer_input, n_in = input_size, n_out = hidden_layers_sizes[i], activation = activation) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... but we are going to only declare that # the parameters of the sigmoid_layers are parameters of the DBN. The visible # biases in the RBM are parameters of those RBMs, but not of the DBN. 
self.params.extend(sigmoid_layer.params) # Construct an RBM that shared weights with this layer rbm_layer = RBM(numpy_rng = numpy_rng, theano_rng = theano_rng, input = layer_input, n_visible = input_size, n_hidden = hidden_layers_sizes[i], W = sigmoid_layer.W, hbias = sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # Creating the reconstruction layers for i in xrange(self.n_layers,0,-1): if i == self.n_layers: layer_input = self.sigmoid_layers[-1].output else: layer_input = self.reconstruction_layers[-1].output if self.tied_weights: W = self.sigmoid_layers[i-1].W.T b = self.rbm_layers[i-1].vbias else: W = b = None # the output size if we are on the first layer if i == 1 : output_size = n_ins activation = None else: output_size = hidden_layers_sizes[i-2] activation = T.nnet.sigmoid input_size = hidden_layers_sizes[i-1] reconstruction_layer = HiddenLayer(numpy_rng, layer_input, input_size, output_size, W, b, activation) if not self.tied_weights: self.params.extend(reconstruction_layer.params) # add the layer to our list of layers self.reconstruction_layers.append(reconstruction_layer) self.xhat = T.nnet.sigmoid( self.reconstruction_layers[-1].output ) # otherwise we'll end up with a bunch of extra params and theano will complain self.reconstruction_params = list(self.params) self.global_cross_entropy = T.mean(-T.sum( self.x*T.log(self.xhat) + (1-self.x)*T.log(1-self.xhat), axis=1 )) self.global_mse = T.mean(T.sum((self.x - self.xhat)**2.0,axis=1)) if reconstruction_cost == 'cross_entropy': self.global_pretraining_cost = self.global_cross_entropy elif reconstruction_cost == 'mse': self.global_pretraining_cost = self.global_mse else: raise NotImplementedError('Invalid reconstruction error\ specified') if supervised_training == 'standard': # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression(\ input = self.sigmoid_layers[-1].output,\ n_in = hidden_layers_sizes[-1], n_out = n_outs) self.params.extend(self.logLayer.params) # compute the cost for second phase of training, defined as the # negative log likelihood of the logistic regression (output) layer self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y) # compute prediction as class whose probability is maximal in # symbolic form self.y_pred=T.argmax(self.p_y_given_x, axis=1) elif supervised_training == 'russ': # compute vector of class-membership probabilities in symbolic form p_y_given_x = T.nnet.softmax(self.sigmoid_layers[-1].output) self.y_pred=T.argmax(p_y_given_x, axis=1) self.finetune_cost = -T.mean(T.log(p_y_given_x[T.arange(self.y.shape[0]),self.y])) self.errors = T.mean(T.neq(self.y_pred, self.y)) else: print 'Unsupport supervised training method', supervised_training raise NotImplementedError def build_pretraining_functions(self, train_set_x, batch_size,k): ''' Generates a list of functions, for performing one step of gradient descent at a given layer. The function will require as input the minibatch index, and to train an RBM you just need to iterate, calling the corresponding function on all minibatch indexes. :type train_set_x: theano.tensor.TensorType :param train_set_x: Shared var. 
that contains all datapoints used for training the RBM :type batch_size: int :param batch_size: size of a [mini]batch :param k: number of Gibbs steps to do in CD-k / PCD-k ''' # index to a [mini]batch index = T.lscalar('index') # index to a minibatch learning_rate = T.scalar('lr') # learning rate to use weight_decay = T.scalar('weight_decay') # learning rate to use # number of batches n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size # begining of a batch, given `index` batch_begin = index * batch_size # ending of a batch given `index` batch_end = batch_begin+batch_size pretrain_fns = [] for rbm in self.rbm_layers: # get the cost and the updates list # using CD-k here (persisent=None) for training each RBM. # TODO: change cost function to reconstruction error cost,updates = rbm.get_cost_updates(learning_rate, persistent=None, k = k, weight_decay = weight_decay) # compile the theano function fn = theano.function(inputs = [index, theano.Param(learning_rate, default = 0.1), theano.Param(weight_decay, default = 0.0002)], outputs = cost, updates = updates, givens = {self.x :train_set_x[batch_begin:batch_end]}) # append `fn` to the list of functions pretrain_fns.append(fn) return pretrain_fns def build_global_pretraining_functions_sgd(self, datasets, train_batch_size, learning_rate): (train_set_x, train_set_y) = datasets[0] (valid_set_x, valid_set_y) = datasets[1] (test_set_x , test_set_y ) = datasets[2] # compute number of minibatches for training, validation and testing n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / train_batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / train_batch_size index = T.lscalar('index') # index to a [mini]batch # compute the gradients with respect to the model parameters gparams = T.grad(self.global_pretraining_cost, self.reconstruction_params) # compute list of updates updates = {} for param, gparam in zip(self.params, gparams): updates[param] = param - gparam*learning_rate costs = [self.global_pretraining_cost,self.global_mse] train_fn = build_score_fn([index],index,costs,train_set_x,train_set_y,train_batch_size,updates,self) test_score_i = build_score_fn([index],index,costs,test_set_x,test_set_y,train_batch_size,[],self) valid_score_i = build_score_fn([index],index,costs,valid_set_x,valid_set_y,train_batch_size, [],self) # Create a function that scans the entire validation set def valid_fn(): return [valid_score_i(i) for i in xrange(n_valid_batches)] # Create a function that scans the entire test set def test_fn(): return [test_score_i(i) for i in xrange(n_test_batches)] return train_fn, valid_fn, test_fn def build_finetune_functions(self, datasets, train_batch_size, learning_rate): '''Generates a function `train` that implements one step of finetuning, a function `validate` that computes the error on a batch from the validation set, and a function `test` that computes the error on a batch from the testing set :type datasets: list of pairs of theano.tensor.TensorType :param datasets: It is a list that contain all the datasets; the has to contain three pairs, `train`, `valid`, `test` in this order, where each pair is formed of two Theano variables, one for the datapoints, the other for the labels :type train_batch_size: int :param train_batch_size: size of a minibatch :type learning_rate: float :param learning_rate: learning rate used during finetune stage ''' (train_set_x, train_set_y) = datasets[0] (valid_set_x, valid_set_y) = datasets[1] (test_set_x , test_set_y ) = datasets[2] # compute number of 
minibatches for training, validation and testing n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / train_batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / train_batch_size index = T.lscalar('index') # index to a [mini]batch # compute the gradients with respect to the model parameters gparams = T.grad(self.finetune_cost, self.params) # compute list of fine-tuning updates updates = {} for param, gparam in zip(self.params, gparams): updates[param] = param - gparam*learning_rate costs = self.finetune_cost train_fn = build_score_fn([index],index,costs,train_set_x,train_set_y,train_batch_size,updates,self) test_score_i = build_score_fn([index],index,costs,test_set_x,test_set_y,train_batch_size,[],self) valid_score_i = build_score_fn([index],index,costs,valid_set_x,valid_set_y,train_batch_size,[],self) # Create a function that scans the entire validation set def valid_fn(): return [valid_score_i(i) for i in xrange(n_valid_batches)] # Create a function that scans the entire test set def test_fn(): return [test_score_i(i) for i in xrange(n_test_batches)] return train_fn, valid_fn, test_fn def build_global_pretraining_functions_hf(self, datasets, train_batch_size, preconditioner, ridge, maxiterations): (train_set_x, train_set_y) = datasets[0] (valid_set_x, valid_set_y) = datasets[1] (test_set_x , test_set_y ) = datasets[2] # compute number of minibatches for training, validation and testing index = T.lscalar('index') # index to a [mini]batch valid_batch_size = valid_set_x.shape[0] test_batch_size = test_set_x.shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / valid_batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / test_batch_size n_train_batches = train_set_x.get_value(borrow=True).shape[0] / train_batch_size costs = [self.global_pretraining_cost,self.global_mse] def train_fn(): givens = { self.x:train_set_x[index*train_batch_size:(index+1)*train_batch_size], self.y:train_set_y[index*train_batch_size:(index+1)*train_batch_size]} error = truncated_newton([index], self.reconstruction_layers[-1].output, costs, self.reconstruction_params, givens, maxiterations, ridge, preconditioner, n_train_batches) return error test_score_i = build_score_fn([index],index,costs,test_set_x,test_set_y,valid_batch_size,[],self) valid_score_i = build_score_fn([index],index,costs,valid_set_x,valid_set_y,test_batch_size,[],self) def valid_fn(): return [valid_score_i(i) for i in range(n_valid_batches)] def test_fn(): return [test_score_i(i) for i in range(n_test_batches)] return train_fn, valid_fn, test_fn def initialize_reconstruction_weights(self): if not self.tied_weights: updates = [] for i in xrange(self.n_layers,0,-1): updates.append((self.reconstruction_layers[self.n_layers-i].W, self.sigmoid_layers[i-1].W.T)) updates.append((self.reconstruction_layers[self.n_layers-i].b, self.rbm_layers[i-1].vbias)) f = theano.function([],[],updates = updates) f() def save_rbm_weights(self, filename): param_list = [] for i in range(self.n_layers): params = (self.rbm_layers[i].W.get_value(), self.rbm_layers[i].vbias.get_value(), self.rbm_layers[i].hbias.get_value()) param_list.append(params) import cPickle cPickle.dump(param_list,open(filename,'w'), -1) def load_rbm_weights(self, filename): import cPickle param_list = cPickle.load(open(filename,'r')) for i in range(self.n_layers): W,vbias,hbias = param_list[i] self.rbm_layers[i].W.set_value(numpy.array(W,dtype=theano.config.floatX)) 
self.rbm_layers[i].vbias.set_value(numpy.array(vbias,dtype=theano.config.floatX)) self.rbm_layers[i].hbias.set_value(numpy.array(hbias,dtype=theano.config.floatX))
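# A toy numpy sketch of the tied-weights idea used by the reconstruction layers
# above: with tied_weights=True the decoder reuses the transposed encoder weight
# matrix W.T plus the RBM's visible bias, so no extra weight matrices are added.
# All sizes and values below are made up for illustration.
import numpy

def sigmoid(z):
    return 1.0 / (1.0 + numpy.exp(-z))

rng = numpy.random.RandomState(0)
x = rng.rand(5, 784)                        # 5 toy "images" with values in (0, 1)
W = rng.randn(784, 50) * 0.01               # encoder weights (n_ins x n_hidden)
bhid = numpy.zeros(50)                      # hidden bias (shared with the RBM)
vbias = numpy.zeros(784)                    # visible bias (the RBM's vbias)

h = sigmoid(numpy.dot(x, W) + bhid)         # encoder / sigmoid layer
xhat = sigmoid(numpy.dot(h, W.T) + vbias)   # decoder with tied weights

# same reconstruction cost as global_cross_entropy above
cross_entropy = numpy.mean(-numpy.sum(
    x * numpy.log(xhat) + (1 - x) * numpy.log(1 - xhat), axis=1))
print(cross_entropy)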
#layer1 = LeNetConvPoolLayer(rng, input=layer0.output, # image_shape=(batch_size, nkerns[0], 12, 12), # filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) ## the HiddenLayer being fully-connected, it operates on 2D matrices of ## shape (batch_size,num_pixels) (i.e matrix of rasterized images). ## This will generate a matrix of shape (20,32*4*4) = (20,512) #layer2_input = layer1.output.flatten(2) layer2_input = layer0.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[0] * vectorsize * 1, n_out=500, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function([index], layer3.errors(y), givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size]}) test_model_confidence = theano.function([index], layer3.results(y), givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size]}) # load parameters
def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset='../data/mnist.pkl.gz', nkerns=[20, 50], batch_size=500): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(23455) datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ishape = (28, 28) # this is the size of MNIST images ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input = x.reshape((batch_size, 1, 28, 28)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) # the TanhLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
# This will generate a matrix of shape (20,32*4*4) = (20,512) layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model temp_model = theano.function([index], layer3.errors(y), givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size]}) validate_model = theano.function([index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size: (index + 1) * batch_size]}) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i],grads[i]) pairs. updates = [] for param_i, grad_i in zip(params, grads): updates.append((param_i, param_i - learning_rate * grad_i)) train_model = theano.function([index], cost, updates=updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size], y: train_set_y[index * batch_size: (index + 1) * batch_size]}) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print 'training @ iter = ', iter cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % \ (epoch, minibatch_index + 1, n_train_batches, \ this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [temp_model(i) for i in xrange(n_test_batches)] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
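# The early-stopping comments in these training loops describe a patience rule:
# keep training for at least `patience` iterations, extend the budget whenever a
# significantly better validation loss is found, and stop once the budget runs
# out. A self-contained toy rendering of just that rule; the loss values and the
# small patience number are made up.
patience = 10                  # far smaller than the 10000 used above, for illustration
patience_increase = 2
improvement_threshold = 0.995
best_loss = float('inf')

toy_losses = [0.9, 0.5, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47,
              0.48, 0.49, 0.5, 0.51]

for it, loss in enumerate(toy_losses):
    if loss < best_loss:
        if loss < best_loss * improvement_threshold:
            patience = max(patience, it * patience_increase)  # extend the budget
        best_loss = loss
    if patience <= it:
        print('stopping at iteration %i, best loss %f' % (it, best_loss))
        break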
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=10, hidden_size=10, L2_weight=0.0001, para_len_limit=400, q_len_limit=40, max_EM=0.217545454546): model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/' rng = numpy.random.RandomState(23455) train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist = load_train( para_len_limit, q_len_limit) train_size = len(train_para_list) if train_size != len(train_Q_list) or train_size != len( train_label_list) or train_size != len(train_para_mask): print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)' exit(0) test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist = load_dev_or_test( word2id, para_len_limit, q_len_limit) test_size = len(test_para_list) if test_size != len(test_Q_list) or test_size != len( test_mask) or test_size != len(test_para_mask): print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)' exit(0) rand_values = random_value_normal((overall_vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) # rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) # id2word = {y:x for x,y in overall_word2id.iteritems()} # word2vec=load_word2vec() # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') labels = T.imatrix('labels') para_mask = T.fmatrix('para_mask') q_mask = T.fmatrix('q_mask') extraF = T.ftensor3('extraF') # should be in shape (batch, wordsize, 3) ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' norm_extraF = normalize_matrix(extraF) U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size) U1_b, W1_b, b1_b = create_GRU_para(rng, emb_size, hidden_size) paragraph_para = [U1, W1, b1, U1_b, W1_b, b1_b] UQ, WQ, bQ = create_GRU_para(rng, emb_size, hidden_size) UQ_b, WQ_b, bQ_b = create_GRU_para(rng, emb_size, hidden_size) Q_para = [UQ, WQ, bQ, UQ_b, WQ_b, bQ_b] W_a1 = create_ensemble_para( rng, hidden_size, hidden_size) # init_weights((2*hidden_size, hidden_size)) W_a2 = create_ensemble_para(rng, hidden_size, hidden_size) U_a = create_ensemble_para(rng, 2, hidden_size + 3) # 3 extra features LR_b = theano.shared( value=numpy.zeros((2, ), dtype=theano.config.floatX), # @UndefinedVariable name='LR_b', borrow=True) attention_paras = [W_a1, W_a2, U_a, LR_b] params = [embeddings] + paragraph_para + Q_para + attention_paras load_model_from_file(rootPath + 'Best_Paras_conv_0.217545454545', params) paragraph_input = embeddings[paragraph.flatten()].reshape( (paragraph.shape[0], paragraph.shape[1], emb_size)).transpose( (0, 2, 1)) # (batch_size, emb_size, maxparalen) concate_paragraph_input = T.concatenate( [paragraph_input, norm_extraF.dimshuffle((0, 2, 1))], axis=1) paragraph_model = Bd_GRU_Batch_Tensor_Input_with_Mask( X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size, U=U1, W=W1, b=b1, Ub=U1_b, Wb=W1_b, bb=b1_b) para_reps = paragraph_model.output_tensor #(batch, emb, para_len) # #LSTM # fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) # bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) # paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters # paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict) # para_reps=paragraph_model.output_tensor Qs_emb = embeddings[questions.flatten()].reshape( (questions.shape[0], questions.shape[1], emb_size)).transpose( (0, 2, 1)) #(#questions, emb_size, maxsenlength) questions_model = Bd_GRU_Batch_Tensor_Input_with_Mask( X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, U=UQ, W=WQ, b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b) # questions_reps=questions_model.output_sent_rep_maxpooling.reshape((batch_size, 1, hidden_size)) #(batch, 2*out_size) questions_reps_tensor = questions_model.output_tensor #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1) # #LSTM for questions # fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters # questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict) # questions_reps_tensor=questions_model.output_tensor #use CNN for question modeling # Qs_emb_tensor4=Qs_emb.dimshuffle((0,'x', 1,2)) #(batch_size, 1, emb+3, maxparalen) # conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, 5)) # Q_conv_para=[conv_W, conv_b] # conv_model = Conv_with_input_para(rng, input=Qs_emb_tensor4, # image_shape=(batch_size, 1, emb_size, q_len_limit), # filter_shape=(hidden_size, 1, emb_size, 5), W=conv_W, b=conv_b) # conv_output=conv_model.narrow_conv_out.reshape((batch_size, hidden_size, q_len_limit-5+1)) #(batch, 1, hidden_size, maxparalen-1) # gru_mask=(q_mask[:,:-4]*q_mask[:,1:-3]*q_mask[:,2:-2]*q_mask[:,3:-1]*q_mask[:,4:]).reshape((batch_size, 1, q_len_limit-5+1)) # masked_conv_output=conv_output*gru_mask # 
questions_conv_reps=T.max(masked_conv_output, axis=2).reshape((batch_size, 1, hidden_size)) # new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0) # ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2) # ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction # padding_vec = T.zeros((batch_size, 1), dtype=theano.config.floatX) # ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1) # ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1) # ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad) # def example_in_batch(para_matrix, q_matrix): #assume both are (hidden, len) transpose_para_matrix = para_matrix.T interaction_matrix = T.dot(transpose_para_matrix, q_matrix) #(para_len, q_len) norm_interaction_matrix = T.nnet.softmax(interaction_matrix) return T.dot(q_matrix, norm_interaction_matrix.T) #(len, para_len) batch_q_reps, updates = theano.scan( fn=example_in_batch, outputs_info=None, sequences=[para_reps, questions_reps_tensor ]) #batch_q_reps (batch, hidden, para_len) #attention distributions norm_W_a1 = normalize_matrix(W_a1) norm_W_a2 = normalize_matrix(W_a2) norm_U_a = normalize_matrix(U_a) transformed_para_reps = T.maximum( T.dot(para_reps.transpose((0, 2, 1)), norm_W_a2), 0.0) #relu transformed_q_reps = T.maximum( T.dot(batch_q_reps.transpose((0, 2, 1)), norm_W_a1), 0.0) #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1) add_both = transformed_para_reps + transformed_q_reps # U_c, W_c, b_c=create_GRU_para(rng, hidden_size, hidden_size) # U_c_b, W_c_b, b_c_b=create_GRU_para(rng, hidden_size, hidden_size) # accumu_para=[U_c, W_c, b_c, U_c_b, W_c_b, b_c_b] # accumu_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_both.transpose((0,2,1)), Mask=para_mask, hidden_dim=hidden_size,U=U_c,W=W_c,b=b_c,Ub=U_c_b,Wb=W_c_b,bb=b_c_b) # accu_both=accumu_model.output_tensor.transpose((0,2,1)) prior_att = T.concatenate([add_both, norm_extraF], axis=2) #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2) valid_indices = para_mask.flatten().nonzero()[0] layer3 = LogisticRegression(rng, input=prior_att.reshape( (batch_size * prior_att.shape[1], hidden_size + 3)), n_in=hidden_size + 3, n_out=2, W=norm_U_a, b=LR_b) #error =layer3.negative_log_likelihood(labels.flatten()[valid_indices]) error = -T.sum( T.log(layer3.p_y_given_x) [valid_indices, labels.flatten()[valid_indices]]) #[T.arange(y.shape[0]), y]) distributions = layer3.p_y_given_x[:, -1].reshape( (batch_size, para_mask.shape[1])) #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1])) # masked_dis=(distributions+ConvGRU_1_dis_into_unigram)*para_mask masked_dis = distributions * para_mask ''' strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1) distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions') para_mask=para_mask masked_dis=distributions*para_mask # masked_label=debug_print(labels*para_mask, 'masked_label') # error=((masked_dis-masked_label)**2).mean() label_mask=T.gt(labels,0.0) neg_label_mask=T.lt(labels,0.0) dis_masked=distributions*label_mask remain_dis_masked=distributions*neg_label_mask ans_size=T.sum(label_mask) non_ans_size=T.sum(neg_label_mask) pos_error=T.sum((dis_masked-label_mask)**2)/ans_size neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)* 
''' # def AttentionLayer(q_rep, ext_M): # theano_U_a=debug_print(norm_U_a, 'norm_U_a') # prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att') # f __name__ == '__main__': # prior_att=T.concatenate([prior_att, ext_M], axis=1) # # strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1) # return strength.transpose() #(1, #words) # distributions, updates = theano.scan( # AttentionLayer, # sequences=[questions_reps,extraF] ) # distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions') # labels=debug_print(labels, 'labels') # label_mask=T.gt(labels,0.0) # neg_label_mask=T.lt(labels,0.0) # dis_masked=distributions*label_mask # remain_dis_masked=distributions*neg_label_mask # pos_error=((dis_masked-1)**2).mean() # neg_error=((remain_dis_masked-(-1))**2).mean() # error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] L2_reg = L2norm_paraList( [embeddings, U1, W1, U1_b, W1_b, UQ, WQ, UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost = error #+ConvGRU_1.error# accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function( [paragraph, questions, labels, para_mask, q_mask, extraF], cost, updates=updates, on_unused_input='ignore') test_model = theano.function( [paragraph, questions, para_mask, q_mask, extraF], masked_dis, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
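    # Note on the batching scheme set up below: batch start offsets are the
    # multiples of batch_size plus one extra start at (train_size - batch_size).
    # That final, possibly overlapping batch guarantees that the trailing
    # examples which do not fill a whole mini-batch are still seen each epoch.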
start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches = train_size / batch_size # remain_train=train_size%batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_test_batches = test_size / batch_size # remain_test=test_size%batch_size test_batch_start = list( numpy.arange(n_test_batches) * batch_size) + [test_size - batch_size] max_F1_acc = 0.0 max_exact_acc = 0.0 cost_i = 0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) iter_accu = 0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 # haha=para_mask[para_id:para_id+batch_size] # print haha # for i in range(batch_size): # print len(haha[i]) cost_i += train_model( np.asarray([ train_para_list[id] for id in train_ids[para_id:para_id + batch_size] ], dtype='int32'), np.asarray([ train_Q_list[id] for id in train_ids[para_id:para_id + batch_size] ], dtype='int32'), np.asarray([ train_label_list[id] for id in train_ids[para_id:para_id + batch_size] ], dtype='int32'), np.asarray([ train_para_mask[id] for id in train_ids[para_id:para_id + batch_size] ], dtype=theano.config.floatX), np.asarray([ train_mask[id] for id in train_ids[para_id:para_id + batch_size] ], dtype=theano.config.floatX), np.asarray([ train_feature_matrixlist[id] for id in train_ids[para_id:para_id + batch_size] ], dtype=theano.config.floatX)) #print iter if iter % 10 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' print 'Testing...' 
past_time = time.time() exact_match = 0.0 F1_match = 0.0 q_amount = 0 for test_para_id in test_batch_start: distribution_matrix = test_model( np.asarray(test_para_list[test_para_id:test_para_id + batch_size], dtype='int32'), np.asarray(test_Q_list[test_para_id:test_para_id + batch_size], dtype='int32'), np.asarray(test_para_mask[test_para_id:test_para_id + batch_size], dtype=theano.config.floatX), np.asarray(test_mask[test_para_id:test_para_id + batch_size], dtype=theano.config.floatX), np.asarray( test_feature_matrixlist[test_para_id:test_para_id + batch_size], dtype=theano.config.floatX)) # print distribution_matrix test_para_wordlist_list = test_text_list[ test_para_id:test_para_id + batch_size] para_gold_ansset_list = q_ansSet_list[ test_para_id:test_para_id + batch_size] paralist_extra_features = test_feature_matrixlist[ test_para_id:test_para_id + batch_size] sub_para_mask = test_para_mask[test_para_id:test_para_id + batch_size] para_len = len(test_para_wordlist_list[0]) if para_len != len(distribution_matrix[0]): print 'para_len!=len(distribution_matrix[0]):', para_len, len( distribution_matrix[0]) exit(0) # q_size=len(distribution_matrix) q_amount += batch_size # print q_size # print test_para_word_list Q_list_inword = test_Q_list_word[ test_para_id:test_para_id + batch_size] for q in range(batch_size): #for each question # if len(distribution_matrix[q])!=len(test_label_matrix[q]): # print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q]) # else: # ss=len(distribution_matrix[q]) # combine_list=[] # for ii in range(ss): # combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')') # print combine_list # exit(0) # print 'distribution_matrix[q]:',distribution_matrix[q] pred_ans = extract_ansList_attentionList( test_para_wordlist_list[q], distribution_matrix[q], np.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q], Q_list_inword[q]) q_gold_ans_set = para_gold_ansset_list[q] # print test_para_wordlist_list[q] # print Q_list_inword[q] # print pred_ans.encode('utf8'), q_gold_ans_set if pred_ans in q_gold_ans_set: exact_match += 1 F1 = MacroF1(pred_ans, q_gold_ans_set) F1_match += F1 # match_amount=len(pred_ans_set & q_gold_ans_set) # # print 'q_gold_ans_set:', q_gold_ans_set # # print 'pred_ans_set:', pred_ans_set # if match_amount>0: # exact_match+=match_amount*1.0/len(pred_ans_set) F1_acc = F1_match / q_amount exact_acc = exact_match / q_amount if F1_acc > max_F1_acc: max_F1_acc = F1_acc if exact_acc > max_exact_acc: max_exact_acc = exact_acc if max_exact_acc > max_EM: store_model_to_file( rootPath + 'Best_Paras_conv_' + str(max_exact_acc), params) print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
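# ---------------------------------------------------------------------------
# The AdaGrad rule used inline in evaluate_lenet5 above can be factored into a
# small helper. This is an illustrative sketch only (the name adagrad_updates
# does not appear elsewhere in this code); it implements
#     acc   <- acc + g^2
#     param <- param - lr * g / (sqrt(acc) + eps)
# for each Theano shared parameter.
import numpy
import theano
import theano.tensor as T


def adagrad_updates(cost, params, learning_rate=0.01, eps=1e-8):
    grads = T.grad(cost, params)
    updates = []
    for param, grad in zip(params, grads):
        # one accumulator of squared gradients per parameter
        acc = theano.shared(
            numpy.zeros_like(param.get_value(borrow=True),
                             dtype=theano.config.floatX),
            borrow=True)
        new_acc = acc + T.sqr(grad)
        updates.append((acc, new_acc))
        updates.append((param,
                        param - learning_rate * grad / (T.sqrt(new_acc) + eps)))
    return updates
# Usage mirrors the training function above, e.g.
#     train_model = theano.function(inputs, cost,
#                                   updates=adagrad_updates(cost, params))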
    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 corruption_levels=[0.1, 0.1]):
        """ This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                          weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the sdA

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers' sizes, must contain
                                    at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type corruption_levels: list of float
        :param corruption_levels: amount of corruption to use for each layer
        """
        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x')   # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
                                 # [int] labels

        # The SdA is an MLP, for which all weights of intermediate layers
        # are shared with a different denoising autoencoder.
        # We will first construct the SdA as a deep multilayer perceptron,
        # and when constructing each sigmoidal layer we also construct a
        # denoising autoencoder that shares weights with that layer.
        # During pretraining we will train these autoencoders (which will
        # lead to changing the weights of the MLP as well).
        # During fine-tuning we will finish training the SdA by doing
        # stochastic gradient descent on the MLP.
        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below or the input of the SdA if you are on the first
            # layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            # it's arguably a philosophical question...
            # but we are going to only declare that the parameters of the
            # sigmoid_layers are parameters of the StackedDAA;
            # the visible biases in the dA are parameters of those
            # dA, but not of the SdA
            self.params.extend(sigmoid_layer.params)

            # Construct a denoising autoencoder that shares weights with this
            # layer
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)
            self.dA_layers.append(dA_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # construct the quantities needed for one step of fine-tuning:
        # the cost for the second phase of training is defined as the
        # negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # symbolic variables for the number of errors and the recall made on
        # the minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
        self.recall = self.logLayer.recall(self.y)
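# ---------------------------------------------------------------------------
# Illustrative sketch (not a method of the original class): given an instance
# `sda` of the stacked-autoencoder class whose constructor is defined above,
# its finetune_cost, params, x and y attributes can be wired into a plain SGD
# fine-tuning step. train_set_x / train_set_y are assumed to be Theano shared
# variables holding float features and int32 labels.
import theano
import theano.tensor as T


def build_finetune_function(sda, train_set_x, train_set_y,
                            batch_size, learning_rate):
    index = T.lscalar('index')  # mini-batch index
    gparams = T.grad(sda.finetune_cost, sda.params)
    updates = [(p, p - learning_rate * g)
               for p, g in zip(sda.params, gparams)]
    return theano.function(
        inputs=[index],
        outputs=sda.finetune_cost,
        updates=updates,
        givens={
            sda.x: train_set_x[index * batch_size:(index + 1) * batch_size],
            sda.y: train_set_y[index * batch_size:(index + 1) * batch_size],
        })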
def build_network(self): # allocate symbolic variables for the data self.x = dtensor5Float( 'x') # the data is presented as rasterized images self.y = dtensor5IntVector('y') # and again self.training_data = theano.shared(np.zeros(self.input_shape, dtype=floatX), borrow=True) self.training_data_labels = theano.shared(np.zeros( (self.input_shape[0], ), dtype=dtensor5IntVector), borrow=True) self.testing_data = theano.shared(np.zeros(self.input_shape, dtype=floatX), borrow=True) self.testing_data_labels = theano.shared(np.zeros( (self.input_shape[0], ), dtype=dtensor5IntVector), borrow=True) input_to_layer = self.x input_shape_to_layer = self.input_shape for i in range(0, len(self.convolution_layer_shapes)): print("Setting input layer shape to") print(input_shape_to_layer) layer = LeNetConvPoolLayer( self.rng, input=input_to_layer, image_shape=input_shape_to_layer, filter_shape=self.convolution_layer_shapes[i], poolsize=self.pooling_layer_shapes[i]) self.layers.append(layer) input_to_layer = layer.output input_shape_to_layer = layer.output_shape() layer2_input = input_to_layer.flatten(2) # construct a fully-connected sigmoidal layer self.layer2 = HiddenLayer(self.rng, input=layer2_input, n_in=np.prod(input_shape_to_layer[1:]), n_out=500, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer self.layer3 = LogisticRegression(input=self.layer2.output, n_in=500, n_out=self.n_classes) #self.vy = dtensor5IntVector('vy') # and again self.batch_x = dtensor5Float( 'batch_x') # the data is presented as rasterized images self.batch_y = dtensor5IntVector('batch_y') # and again self.validate_model = theano.function( inputs=[], outputs=self.layer3.errors(self.y), givens={ self.x: self.testing_data, self.y: self.testing_data_labels }) # create a list of all model parameters to be fit by gradient descent self.params = self.layer3.params + self.layer2.params for i in self.layers: self.params += i.params #self.params = self.layers[0].params # the cost we minimize during training is the NLL of the model self.cost = self.layer3.negative_log_likelihood(self.y) # create a list of gradients for all model parameters self.grads = T.grad(self.cost, self.params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. self.updates = [(param_i, param_i - self.learning_rate * grad_i) for param_i, grad_i in zip(self.params, self.grads)] self.train_model = theano.function(inputs=[], outputs=self.cost, updates=self.updates, givens={ self.x: self.training_data, self.y: self.training_data_labels })
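# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original class): once
# build_network() has been called on an instance `net`, one epoch of training
# consists of copying each mini-batch into the shared variables and calling
# the compiled functions. Each batch_x is assumed to match net.input_shape and
# each batch_y to be an integer (int32) label vector of length input_shape[0].
import numpy as np


def run_epoch(net, batches_x, batches_y, test_x, test_y):
    costs = []
    for batch_x, batch_y in zip(batches_x, batches_y):
        # load the current mini-batch into the shared variables used by
        # the givens of net.train_model, then take one SGD step
        net.training_data.set_value(batch_x, borrow=True)
        net.training_data_labels.set_value(batch_y, borrow=True)
        costs.append(net.train_model())
    # evaluate on the held-out set once per epoch
    net.testing_data.set_value(test_x, borrow=True)
    net.testing_data_labels.set_value(test_y, borrow=True)
    return np.mean(costs), net.validate_model()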