class MixtureOfExperts:
    """Mixture-of-experts model: one gating model plus ``nb_experts`` experts.

    The gate and every expert are ``LogisticRegression`` instances; the only
    method this class calls on them is ``py_given_x(x)``.
    """

    def __init__(self, input_dim, nb_experts, output_dim):
        """Build the gate (``nb_experts`` outputs) and the experts
        (``output_dim`` outputs each), all taking ``input_dim`` inputs."""
        self.nb_experts = nb_experts
        self.output_dim = output_dim
        self.gates = LogisticRegression(input_dim, nb_experts)
        self.experts = [LogisticRegression(input_dim, output_dim)
                        for _ in range(nb_experts)]

    def pz_given_x(self, x):
        """Return the gate output p(z | x), one column per expert."""
        return self.gates.py_given_x(x)

    def py_given_x(self, x):
        """Return p(y | x): expert outputs weighted by the gate, summed over z."""
        pz = self.gates.py_given_x(x)
        py = np.zeros((x.shape[0], self.output_dim))
        for z in range(self.nb_experts):
            # pz[:, z:z + 1] keeps a trailing axis so the gate weight of
            # expert z broadcasts across all output columns.
            py += pz[:, z:z + 1] * self.experts[z].py_given_x(x)
        return py

    def py_given_xz(self, x, z):
        """Return the output of expert ``z`` alone, p(y | x, z)."""
        return self.experts[z].py_given_x(x)

    def lik_y_for_every_z(self, x, y):
        """Return an ``(x.shape[0], nb_experts)`` array whose column ``z`` is
        ``sum(y * p(y | x, z), axis=1)``."""
        py = np.zeros((x.shape[0], self.nb_experts))
        for z in range(self.nb_experts):
            py[:, z] = np.sum(y * self.py_given_xz(x, z), axis=1)
        return py

    def pz_given_xy(self, x, y):
        """Return p(z | x, y): gate output times per-expert likelihood,
        renormalised so that every row sums to one."""
        joint = self.lik_y_for_every_z(x, y) * self.pz_given_x(x)
        return joint / np.sum(joint, axis=1, keepdims=True)

    def sample_y_given_x(self, x):
        """Draw one multinomial sample per row of ``p(y | x)``."""
        py = self.py_given_x(x)
        return np.array([np.random.multinomial(1, py[i, :])
                         for i in range(x.shape[0])])

    def log_likelihood(self, x, y):
        """Return ``sum_n log sum_z p(z | x_n) * lik(y_n | x_n, z)``."""
        lik_y = self.lik_y_for_every_z(x, y)
        pz_given_x = self.pz_given_x(x)
        return np.sum(np.log(np.sum(pz_given_x * lik_y, 1)))

    def fit(self, x, y, method='CG', max_iter=15):
        """Train the model by delegating to ``expectation_maximization2``.

        A plain ``list`` of integer labels is converted to one-hot rows of
        width ``output_dim`` first; any other ``y`` is passed through as is.

        NOTE(review): ``method`` is accepted for interface compatibility but
        is not forwarded to ``expectation_maximization2`` -- confirm whether
        the optimiser choice is meant to be configurable.

        Returns the ``(ll, Q1, Q2)`` triple produced by
        ``expectation_maximization2``.
        """
        if isinstance(y, list):
            y = np.eye(self.output_dim)[y]
        ll, Q1, Q2 = expectation_maximization2(self, x, y, max_iter=max_iter)
        return ll, Q1, Q2
def __init__(self, input_dim, nb_experts, output_dim):
    """Record the sizes and create one gating model plus ``nb_experts`` experts."""
    self.nb_experts = nb_experts
    self.output_dim = output_dim

    # The gate has one output per expert.
    self.gates = LogisticRegression(input_dim, nb_experts)

    # Every expert maps the same input to ``output_dim`` outputs.
    expert_models = []
    for _ in range(nb_experts):
        expert_models.append(LogisticRegression(input_dim, output_dim))
    self.experts = expert_models
def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
             hidden_layers_sizes=[500, 500], n_outs=10):
    """Build a stack of sigmoid layers, one weight-sharing RBM per layer,
    and a logistic-regression layer on top.

    :param numpy_rng: random generator handed to every HiddenLayer and RBM;
        also used to seed ``theano_rng`` when none is supplied
    :param theano_rng: random stream handed to the RBMs; created from
        ``numpy_rng`` when falsy
    :param n_ins: input size of the first layer
    :param hidden_layers_sizes: output size of each hidden layer; must
        contain at least one value (the default list is only read here)
    :param n_outs: output size of the logistic-regression layer
    """
    self.sigmoid_layers = []
    self.rbm_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2**30))

    # Symbolic inputs: data matrix and integer label vector.
    self.x = T.matrix('x')
    self.y = T.ivector('y')

    for i in range(self.n_layers):
        # The first layer reads the network input; every later layer reads
        # the output of the layer below it.
        if i == 0:
            input_size = n_ins
            layer_input = self.x
        else:
            input_size = hidden_layers_sizes[i - 1]
            layer_input = self.sigmoid_layers[-1].output

        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    activation=T.nnet.sigmoid)
        self.sigmoid_layers.append(sigmoid_layer)
        self.params.extend(sigmoid_layer.params)

        # The RBM reuses the sigmoid layer's W and b (as its hidden bias),
        # so it is not added to ``self.params`` separately.
        rbm_layer = RBM(numpy_rng=numpy_rng,
                        theano_rng=theano_rng,
                        input=layer_input,
                        n_visible=input_size,
                        n_hidden=hidden_layers_sizes[i],
                        W=sigmoid_layer.W,
                        hbias=sigmoid_layer.b)
        self.rbm_layers.append(rbm_layer)

    self.logLayer = LogisticRegression(
        input=self.sigmoid_layers[-1].output,
        n_in=hidden_layers_sizes[-1],
        n_out=n_outs)
    self.params.extend(self.logLayer.params)

    # NOTE(review): the method name is spelled ``negative_log_likehood``;
    # kept as is because LogisticRegression is defined elsewhere -- confirm
    # it matches that class.
    self.finetune_cost = self.logLayer.negative_log_likehood(self.y)
    self.errors = self.logLayer.errors(self.y)
def main():
    """Fit a logistic regression on the income data and print a
    classification report for the training split, then the test split."""
    X_train, y_train, X_test, y_test = load_income('./income.csv')
    clf = LogisticRegression(C=1000, lr_decay='step').fit(X_train, y_train)

    splits = (
        ('\n==> train:\n', X_train, y_train),
        ('\n==> test:\n', X_test, y_test),
    )
    for header, features, labels in splits:
        predicted = clf.predict(features)
        print(header, classification_report(labels, predicted))
def test_lr_newton_method():
    """Fit with the "newton_method" solver and compare the learned intercept
    and first two coefficients against reference values (tolerance 0.01)."""
    tolerance = 0.01
    X, y = read_data()

    model = LogisticRegression(solver="newton_method")
    model.fit(X, y)

    # intercept
    bias = model.intercept_
    assert abs(bias - -2.618) < tolerance

    # coefficients
    weights = model.coef_
    assert abs(weights[0] - 0.76) < tolerance
    assert abs(weights[1] - 1.17) < tolerance
def __init__(self, rng, input, n_in, n_hidden, n_out):
    """Connect a tanh hidden layer to a logistic-regression output layer and
    expose the regularisation terms, loss, error function and parameters."""
    hidden = HiddenLayer(rng=rng, input=input, n_in=n_in,
                         n_out=n_hidden, activation=T.tanh)
    self.hiddenLayer = hidden

    classifier = LogisticRegression(input=hidden.output,
                                    n_in=n_hidden, n_out=n_out)
    self.logRegressionLayer = classifier

    # Sum of absolute weights / sum of squared weights over both layers.
    self.L1 = abs(hidden.W).sum() + abs(classifier.W).sum()
    self.L2_sqr = (hidden.W ** 2).sum() + (classifier.W ** 2).sum()

    # Loss and error count are delegated to the output layer.
    self.negative_log_likelihood = classifier.negative_log_likehood
    self.errors = classifier.errors

    self.params = hidden.params + classifier.params
    self.input = input
def __init__(self, rng, input, n_hidden_out, n_out, nkerns, batch_size):
    """Assemble two conv/pool stages, a tanh hidden layer and a
    logistic-regression output layer.

    ``input`` is reshaped to ``(batch_size, 1, 28, 28)`` before the first
    conv/pool stage.
    """
    pool = (2, 2)
    first_shape = (batch_size, 1, 28, 28)

    self.layer0 = LeNetConvPoolLayer(
        rng,
        input=input.reshape(first_shape),
        image_shape=first_shape,
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=pool)

    self.layer1 = LeNetConvPoolLayer(
        rng,
        input=self.layer0.output,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=pool)

    # The hidden layer consumes the flattened output of the second stage.
    flat_features = self.layer1.output.flatten(2)
    self.layer2 = HiddenLayer(
        rng,
        input=flat_features,
        n_in=nkerns[1] * 4 * 4,
        n_out=n_hidden_out,
        activation=T.tanh)

    self.logRegressionLayer = LogisticRegression(
        input=self.layer2.output, n_in=n_hidden_out, n_out=n_out)

    # Loss and error count are delegated to the output layer.
    self.negative_log_likelihood = (
        self.logRegressionLayer.negative_log_likehood)
    self.errors = self.logRegressionLayer.errors

    self.params = (self.layer0.params + self.layer1.params
                   + self.layer2.params + self.logRegressionLayer.params)
    self.input = input
def test_lr_stochastic_gradient_descent():
    """Fit with the "stochastic_gradient_descent" solver and compare the
    learned intercept and first two coefficients against reference values
    (tolerance 0.01)."""
    tolerance = 0.01
    X, y = read_data()

    model = LogisticRegression(learning_rate=0.001,
                               max_iter=10000,
                               solver="stochastic_gradient_descent")
    model.fit(X, y)

    # intercept
    bias = model.intercept_
    assert abs(bias - -2.618) < tolerance

    # coefficients
    weights = model.coef_
    assert abs(weights[0] - 0.76) < tolerance
    assert abs(weights[1] - 1.17) < tolerance
def __init__(self, input=None, y=None, Cparams=None, Mparams=None):
    """Chain four conv/pool layers, a hidden layer and a logistic-regression
    layer from pre-built parameters and store the resulting cost.

    ``Cparams`` must unpack into eight values (W and b for each conv layer)
    and ``Mparams`` into four (W and b for the hidden and output layers).
    Filter shapes, image shapes and pool sizes come from module-level names.
    """
    c_w0, c_b0, c_w1, c_b1, c_w2, c_b2, c_w3, c_b3 = Cparams
    m_w1, m_b1, o_w1, o_b1 = Mparams

    conv_specs = (
        (filter_shape0, image_shape0, c_w0, c_b0, poolsize0),
        (filter_shape1, image_shape1, c_w1, c_b1, poolsize1),
        (filter_shape2, image_shape2, c_w2, c_b2, poolsize2),
        (filter_shape3, image_shape3, c_w3, c_b3, poolsize3),
    )

    # Each conv/pool layer reads the output of the previous one.
    features = input
    for f_shape, i_shape, weights, bias, pool in conv_specs:
        conv_layer = LeNetConvPoolLayer(input=features,
                                        filter_shape=f_shape,
                                        image_shape=i_shape,
                                        W=weights, b=bias,
                                        poolsize=pool)
        features = conv_layer.output

    hidden = HiddenLayer(features.flatten(2), W=m_w1, b=m_b1)
    classifier = LogisticRegression(hidden.output, W=o_w1, b=o_b1)
    self.cost = classifier.negative_log_likelihood(y)
def get_by_name(name: str, dataset: AbstractDataset) -> nn.Module:
    """Instantiate the model whose ``ModelType`` member name matches ``name``.

    The comparison is case-insensitive.

    :param name: model name; compared against the lower-cased names of
        ``ModelType.LOGISTIC``, ``ModelType.MLP`` and ``ModelType.VGG``
    :param dataset: passed unchanged to the model constructor
    :return: a new ``LogisticRegression``, ``MLP`` or ``Vgg`` instance
    :raises ValueError: if ``name`` matches none of the known model types
        (previously the function fell through and silently returned ``None``,
        contradicting the declared return type)
    """
    key = name.lower()
    if key == ModelType.LOGISTIC.name.lower():
        return LogisticRegression(dataset)
    if key == ModelType.MLP.name.lower():
        return MLP(dataset)
    if key == ModelType.VGG.name.lower():
        return Vgg(dataset)

    known = ", ".join(
        member.name.lower()
        for member in (ModelType.LOGISTIC, ModelType.MLP, ModelType.VGG)
    )
    raise ValueError(f"Unknown model name {name!r}; expected one of: {known}")
def __init__(self, input, n_in, n_hidden, n_out, n_layers, n_total, batch,
             mask):
    """Stack ``n_layers`` hidden layers fed with the input in both orders,
    add a logistic-regression output layer, and expose parameters,
    velocities, regularisation terms, loss, errors and predictions."""
    # adjust the input: swap the first two axes
    input = input.dimshuffle(1, 0, 2)

    self.params = []
    self.hiddenLayers = []
    self.velo = []

    def _register(layer):
        # Track the layer together with its parameters and velocities.
        self.hiddenLayers.append(layer)
        self.params.extend(layer.params)
        self.velo.extend(layer.velo)

    # First hidden layer: the input and the input reversed along axis 0.
    _register(HiddenLayer(input_list=[input, input[::-1]],
                          n_in=n_in, n_out=n_hidden, BATCH=batch))

    # Remaining hidden layers read the output list of the layer below.
    for depth in range(1, n_layers):
        below = self.hiddenLayers[depth - 1]
        _register(HiddenLayer(input_list=below.output_list,
                              n_in=n_hidden, n_out=n_hidden, BATCH=batch))

    # output layer
    top = self.hiddenLayers[n_layers - 1]
    self.logRegressionLayer = LogisticRegression(
        input_list=top.output_list,
        n_in=n_hidden,
        n_out=n_out,
        n_total=n_total,
        mask=mask,
        batch=batch)
    self.params.extend(self.logRegressionLayer.params)
    self.velo.extend(self.logRegressionLayer.velo)

    # L1 regularization: absolute weights of every hidden layer + output W
    hidden_l1 = sum(
        abs(h.W2).sum() + abs(h.W1).sum() + abs(h.U1).sum() + abs(h.U2).sum()
        for h in self.hiddenLayers)
    self.L1 = hidden_l1 + abs(self.logRegressionLayer.W).sum()

    # L2 squared regularization: squared weights of every hidden layer,
    # plus output W and M
    hidden_l2 = sum(
        abs(h.W2**2).sum() + abs(h.W1**2).sum()
        + abs(h.U1**2).sum() + abs(h.U2**2).sum()
        for h in self.hiddenLayers)
    self.L2_sqr = (hidden_l2
                   + (self.logRegressionLayer.W**2).sum()
                   + (self.logRegressionLayer.M**2).sum())

    # Loss, error count and predictions are delegated to the output layer.
    out = self.logRegressionLayer
    self.negative_log_likelihood = out.negative_log_likelihood
    self.errors = out.errors
    self.y_pred = out.y_pred
def __init__(self, rng, _input, n_in, n_hidden, n_out):
    """Initialize the parameters for the multilayer perceptron

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type _input: theano.tensor.TensorType
    :param _input: symbolic variable that describes the input of the
        architecture (one minibatch)

    :type n_in: int
    :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

    :type n_hidden: int
    :param n_hidden: number of hidden units

    :type n_out: int
    :param n_out: number of output units, the dimension of the space in
        which the labels lie
    """
    # One hidden tanh layer connected to the LogisticRegression layer.
    # Pass the ``_input`` parameter: the bare name ``input`` would resolve
    # to the Python builtin function, not to the symbolic variable.
    self.hiddenLayer = HiddenLayer(rng=rng, input=_input,
                                   n_in=n_in, n_out=n_hidden,
                                   activation=T.tanh)

    # The logistic regression layer gets as input the hidden units
    # of the hidden layer
    self.logRegressionLayer = LogisticRegression(
        _input=self.hiddenLayer.output,
        n_in=n_hidden,
        n_out=n_out)

    # L1 norm; one regularization option is to enforce L1 norm to
    # be small
    self.L1 = abs(self.hiddenLayer.W).sum() \
        + abs(self.logRegressionLayer.W).sum()

    # square of L2 norm; one regularization option is to enforce
    # square of L2 norm to be small
    self.L2_sqr = (self.hiddenLayer.W ** 2).sum() \
        + (self.logRegressionLayer.W ** 2).sum()

    # negative log likelihood of the MLP is the one computed in the
    # logistic regression layer
    self.negative_log_likelihood = \
        self.logRegressionLayer.negative_log_likelihood
    # same holds for the function computing the number of errors
    self.errors = self.logRegressionLayer.errors

    # the parameters of the model are the parameters of the two layers it
    # is made out of
    self.params = self.hiddenLayer.params + self.logRegressionLayer.params
def __init__(self, input, rng, n_in, n_out, n_hidden):
    """Create a hidden layer and a logistic-regression layer that reads the
    hidden layer's output."""
    hidden_layer = HiddenLayer(input=input, rng=rng,
                               n_in=n_in, n_out=n_hidden)
    self.hidden = hidden_layer

    self.logistic_reg = LogisticRegression(input=hidden_layer.output,
                                           n_in=n_hidden,
                                           n_out=n_out)
def __init__(self, rng, input, n_in, n_h, n_out):
    """Connect a hidden layer (``n_h`` units) to a logistic-regression output
    layer and expose regularisation terms, loss, errors and parameters."""
    hidden = HiddenLayer(rng, input=input, n_in=n_in, n_out=n_h)
    output = LogisticRegression(input=hidden.output, n_in=n_h, n_out=n_out)
    self.hidden_layer = hidden
    self.output_layer = output

    # Regularization: absolute and squared weights of both layers.
    self.L1 = abs(hidden.w).sum() + abs(output.w).sum()
    self.L2 = (hidden.w**2).sum() + (output.w**2).sum()

    # Loss and error function are delegated to the output layer.
    self.neg_log_likelihood = output.neg_log_likelihood
    self.errors = output.errors

    self.params = hidden.params + output.params
    self.input = input
def __init__(self, input=None, y=None, Cparams=None, Mparams=None):
    """Chain four conv/pool layers, a hidden layer and a logistic-regression
    layer from pre-built parameters and store the resulting cost.

    ``Cparams`` must unpack into eight values (W and b for each conv layer)
    and ``Mparams`` into four (W and b for the hidden and output layers).
    Filter shapes, image shapes and pool sizes come from module-level names.
    """
    c_w0, c_b0, c_w1, c_b1, c_w2, c_b2, c_w3, c_b3 = Cparams
    m_w1, m_b1, o_w1, o_b1 = Mparams

    stages = (
        (filter_shape0, image_shape0, c_w0, c_b0, poolsize0),
        (filter_shape1, image_shape1, c_w1, c_b1, poolsize1),
        (filter_shape2, image_shape2, c_w2, c_b2, poolsize2),
        (filter_shape3, image_shape3, c_w3, c_b3, poolsize3),
    )

    # Each conv/pool layer reads the output of the previous one.
    signal = input
    for kernel_shape, img_shape, w, b, pool in stages:
        signal = LeNetConvPoolLayer(input=signal,
                                    filter_shape=kernel_shape,
                                    image_shape=img_shape,
                                    W=w, b=b,
                                    poolsize=pool).output

    m_layer1 = HiddenLayer(signal.flatten(2), W=m_w1, b=m_b1)
    s_layer = LogisticRegression(m_layer1.output, W=o_w1, b=o_b1)
    self.cost = s_layer.negative_log_likelihood(y)
def __init__(self, input, n_in, n_hidden, n_out, n_layers):
    """Stack ``n_layers`` ReLU hidden layers under a logistic-regression
    output layer and expose parameters, velocities, regularisation terms,
    loss, errors, predictions and output."""
    self.params = []
    self.hiddenLayers = []
    self.velo = []

    def _register(layer):
        # Track the layer together with its parameters and velocities.
        self.hiddenLayers.append(layer)
        self.params.extend(layer.params)
        self.velo.extend(layer.velo)

    # First hidden layer reads the network input.
    _register(HiddenLayer(input=input, n_in=n_in, n_out=n_hidden,
                          activation=a.relu))

    # Every further hidden layer reads the output of the layer below.
    for depth in range(1, n_layers):
        below = self.hiddenLayers[depth - 1]
        _register(HiddenLayer(input=below.output, n_in=n_hidden,
                              n_out=n_hidden, activation=a.relu))

    # output layer
    top = self.hiddenLayers[n_layers - 1]
    self.logRegressionLayer = LogisticRegression(
        input=top.output, n_in=n_hidden, n_out=n_out)
    self.params.extend(self.logRegressionLayer.params)
    self.velo.extend(self.logRegressionLayer.velo)

    out = self.logRegressionLayer

    # L1 regularization: absolute weights of all layers
    self.L1 = (sum(abs(h.W).sum() for h in self.hiddenLayers)
               + abs(out.W).sum())

    # L2 squared regularization: squared weights of all layers
    self.L2_sqr = (sum((h.W**2).sum() for h in self.hiddenLayers)
                   + (out.W**2).sum())

    # Loss, error count, predictions and output come from the output layer.
    self.negative_log_likelihood = out.negative_log_likelihood
    self.errors = out.errors
    self.y_pred = out.y_pred
    self.output = out.y.T
def __init__(self, input=None, Cparams=None, Mparams=None):
    """Chain four conv/pool layers, a hidden layer and a logistic-regression
    layer from pre-built parameters.

    ``Cparams`` must unpack into eight values and ``Mparams`` into four.
    The hidden and logistic layers are constructed, but ``y_pred`` is taken
    from the flattened output of the last conv/pool layer, not from them.
    """
    c_w0, c_b0, c_w1, c_b1, c_w2, c_b2, c_w3, c_b3 = Cparams
    m_w1, m_b1, o_w1, o_b1 = Mparams

    conv_specs = (
        (filter_shape0, image_shape0, c_w0, c_b0, poolsize0),
        (filter_shape1, image_shape1, c_w1, c_b1, poolsize1),
        (filter_shape2, image_shape2, c_w2, c_b2, poolsize2),
        (filter_shape3, image_shape3, c_w3, c_b3, poolsize3),
    )

    # Each conv/pool layer reads the output of the previous one.
    conv_layer = None
    features = input
    for f_shape, i_shape, weights, bias, pool in conv_specs:
        conv_layer = LeNetConvPoolLayer(input=features,
                                        filter_shape=f_shape,
                                        image_shape=i_shape,
                                        W=weights, b=bias,
                                        poolsize=pool)
        features = conv_layer.output

    m_layer1 = HiddenLayer(features.flatten(2), W=m_w1, b=m_b1)
    s_layer = LogisticRegression(m_layer1.output, W=o_w1, b=o_b1)

    self.y_pred = conv_layer.output.flatten(1)
def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
                    dataset='../data/mnist.pkl.gz',
                    nkerns=[20, 50], batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer (only read, never mutated)

    :type batch_size: int
    :param batch_size: number of examples per minibatch; also fixes the
                       first axis of the reshaped network input
    """
    # Local import so the block does not depend on a file-level import.
    # time.clock() was removed in Python 3.8; default_timer is the portable
    # replacement (note: it measures wall-clock time, not CPU time).
    import timeit

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing;
    # floor division keeps the counts integral so they can be fed to range()
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as 1D vector of
                         # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
    # maxpooling reduces this further to (24/2,24/2) = (12,12)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (batch_size,nkerns[1],4,4)
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the tanh layer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size,nkerns[1]*4*4)
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected tanh layer
    layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4,
                         n_out=500, activation=T.tanh)

    # classify the values of the fully-connected layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index], layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]})

    validate_model = theano.function(
        [index], layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by SGD.
    # Since this model has many parameters, the updates list is built by
    # looping over all (param, grad) pairs.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index], cost, updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            # global minibatch counter across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print('training @ iter = ', iter)
            train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print((' epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(('The code for file ' +
           os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)),
          file=sys.stderr)
def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
             hidden_layers_sizes=[500, 500], n_outs=10, nkerns=[20, 50],
             batch_size=500):
    """Build two conv/pool layers and a tanh layer, followed by a variable
    number of sigmoid layers (each sharing weights with an RBM) and a
    logistic-regression output layer.

    :type numpy_rng: numpy.random.RandomState
    :param numpy_rng: numpy random number generator used to draw initial
                weights of the sigmoid layers and RBMs

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; if None is given one is
                       generated based on a seed drawn from `numpy_rng`

    :type n_ins: int
    :param n_ins: not used by this constructor; the input is reshaped to
                  (batch_size, 1, 28, 28) and the first sigmoid layer reads
                  the 500 outputs of the tanh layer

    :type hidden_layers_sizes: list of ints
    :param hidden_layers_sizes: sizes of the sigmoid layers, must contain
                           at least one value (only read, never mutated)

    :type n_outs: int
    :param n_outs: dimension of the output of the network

    :type nkerns: list of ints
    :param nkerns: number of kernels of the two conv/pool layers

    :type batch_size: int
    :param batch_size: minibatch size baked into the reshape of the input
                       (previously hard-coded to 500); functions compiled
                       on this graph must feed batches of the same size
    """

    self.sigmoid_layers = []
    self.rbm_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # allocate symbolic variables for the data
    self.x = T.matrix('x')   # the data is presented as rasterized images
    self.y = T.ivector('y')  # the labels are presented as 1D vector
                             # of [int] labels

    # The conv/pool and tanh layers use their own fixed-seed generator.
    rng = numpy.random.RandomState(23455)

    self.layer0_input = self.x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
    # maxpooling reduces this further to (24/2,24/2) = (12,12)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
    self.layer0 = LeNetConvPoolLayer(rng, input=self.layer0_input,
                                     image_shape=(batch_size, 1, 28, 28),
                                     filter_shape=(nkerns[0], 1, 5, 5),
                                     poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (batch_size,nkerns[1],4,4)
    self.layer1 = LeNetConvPoolLayer(rng, input=self.layer0.output,
                                     image_shape=(batch_size, nkerns[0],
                                                  12, 12),
                                     filter_shape=(nkerns[1], nkerns[0],
                                                   5, 5),
                                     poolsize=(2, 2))

    # the tanh layer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,nkerns[1]*4*4)
    self.layer2_input = self.layer1.output.flatten(2)
    self.layer2 = HiddenLayer(rng, input=self.layer2_input,
                              n_in=nkerns[1] * 4 * 4, n_out=500,
                              activation=T.tanh)
    # NOTE: layer0/layer1/layer2 parameters are not added to self.params.

    for i in range(self.n_layers):
        # The first sigmoid layer reads the 500 outputs of the tanh layer;
        # every later one reads the output of the sigmoid layer below it.
        if i == 0:
            input_size = 500
            layer_input = self.layer2.output
        else:
            input_size = hidden_layers_sizes[i - 1]
            layer_input = self.sigmoid_layers[-1].output

        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    activation=T.nnet.sigmoid)

        # add the layer to our list of layers
        self.sigmoid_layers.append(sigmoid_layer)

        # Only the parameters of the sigmoid layers are declared as
        # parameters of the network. The visible biases of the RBMs are
        # parameters of those RBMs, but not of the network.
        self.params.extend(sigmoid_layer.params)

        # Construct an RBM that shares weights with this layer
        rbm_layer = RBM(numpy_rng=numpy_rng,
                        theano_rng=theano_rng,
                        input=layer_input,
                        n_visible=input_size,
                        n_hidden=hidden_layers_sizes[i],
                        W=sigmoid_layer.W,
                        hbias=sigmoid_layer.b)
        self.rbm_layers.append(rbm_layer)

    # We now need to add a logistic layer on top of the sigmoid layers
    self.logLayer = LogisticRegression(
        input=self.sigmoid_layers[-1].output,
        n_in=hidden_layers_sizes[-1],
        n_out=n_outs)
    self.params.extend(self.logLayer.params)

    # cost for the second phase of training: the negative log likelihood
    # of the logistic regression (output) layer
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

    # symbolic variable that points to the number of errors made on the
    # minibatch given by self.x and self.y
    self.errors = self.logLayer.errors(self.y)
class DBN(object):
    """Deep Belief Network on top of a convolutional front-end

    The network is two conv/pool layers and a tanh layer, followed by a
    stack of sigmoid layers. Each sigmoid layer shares its weights and
    hidden bias with an RBM; the first RBM gets as input the output of the
    tanh layer, and the RBM at layer `i+1` gets the output of sigmoid
    layer `i`. For classification a logistic regression layer is added on
    top.
    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10, nkerns=[20, 50],
                 batch_size=500):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights of the sigmoid layers and RBMs

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `numpy_rng`

        :type n_ins: int
        :param n_ins: not used by this constructor; the input is reshaped
                      to (batch_size, 1, 28, 28) and the first sigmoid layer
                      reads the 500 outputs of the tanh layer

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: sizes of the sigmoid layers, must
                               contain at least one value (only read)

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type nkerns: list of ints
        :param nkerns: number of kernels of the two conv/pool layers

        :type batch_size: int
        :param batch_size: minibatch size baked into the reshape of the
                           input (previously hard-coded to 500); the
                           `batch_size` passed to `pretraining_functions`
                           and `build_finetune_functions` must match it
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')   # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
                                 # of [int] labels

        # The sigmoid layers share their weights with one RBM each. During
        # pretraining the RBMs are trained (which changes the weights of
        # the sigmoid layers as well); during finetuning the stack is
        # trained by stochastic gradient descent.

        # The conv/pool and tanh layers use their own fixed-seed generator.
        rng = numpy.random.RandomState(23455)

        self.layer0_input = self.x.reshape((batch_size, 1, 28, 28))

        # Construct the first convolutional pooling layer:
        # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
        # maxpooling reduces this further to (24/2,24/2) = (12,12)
        # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
        self.layer0 = LeNetConvPoolLayer(rng, input=self.layer0_input,
                                         image_shape=(batch_size, 1, 28, 28),
                                         filter_shape=(nkerns[0], 1, 5, 5),
                                         poolsize=(2, 2))

        # Construct the second convolutional pooling layer
        # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
        # maxpooling reduces this further to (8/2,8/2) = (4,4)
        # 4D output tensor is thus of shape (batch_size,nkerns[1],4,4)
        self.layer1 = LeNetConvPoolLayer(rng, input=self.layer0.output,
                                         image_shape=(batch_size, nkerns[0],
                                                      12, 12),
                                         filter_shape=(nkerns[1], nkerns[0],
                                                       5, 5),
                                         poolsize=(2, 2))

        # the tanh layer being fully-connected, it operates on 2D matrices
        # of shape (batch_size,nkerns[1]*4*4)
        self.layer2_input = self.layer1.output.flatten(2)
        self.layer2 = HiddenLayer(rng, input=self.layer2_input,
                                  n_in=nkerns[1] * 4 * 4, n_out=500,
                                  activation=T.tanh)
        # NOTE: layer0/layer1/layer2 parameters are not added to
        # self.params, so finetuning does not update them.

        for i in range(self.n_layers):
            # The first sigmoid layer reads the 500 outputs of the tanh
            # layer; every later one reads the sigmoid layer below it.
            if i == 0:
                input_size = 500
                layer_input = self.layer2.output
            else:
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # Only the parameters of the sigmoid layers are declared as
            # parameters of the DBN. The visible biases of the RBMs are
            # parameters of those RBMs, but not of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shares weights with this layer
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the sigmoid layers
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # cost for the second phase of training: the negative log
        # likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, train_set_x, batch_size, k):
        '''Generates a list of functions, one per RBM, each performing one
        step of gradient descent on that RBM. Every function takes the
        minibatch index (and optionally the learning rate, default 0.1);
        to train an RBM, call its function on all minibatch indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared var. that contains all datapoints used
                            for training the RBM
        :type batch_size: int
        :param batch_size: size of a [mini]batch; must equal the batch size
                           the network input was reshaped with
        :param k: number of Gibbs steps to do in CD-k / PCD-k
        '''

        index = T.lscalar('index')       # index to a minibatch
        learning_rate = T.scalar('lr')   # learning rate to use

        # beginning and ending of a batch, given `index`
        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for rbm in self.rbm_layers:

            # get the cost and the updates list
            # using CD-k here (persistent=None) for training each RBM.
            # TODO: change cost function to reconstruction error
            cost, updates = rbm.get_cost_updates(learning_rate,
                                                 persistent=None, k=k)

            # compile the theano function
            # NOTE(review): theano.Param is kept from the original; newer
            # Theano releases spell this theano.In(..., value=0.1) --
            # confirm against the installed version.
            fn = theano.function(
                inputs=[index, theano.Param(learning_rate, default=0.1)],
                outputs=cost,
                updates=updates,
                givens={self.x: train_set_x[batch_begin:batch_end]})
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Generates a function `train` that implements one step of
        finetuning, a function `validate` that computes the error on every
        batch of the validation set, and a function `test` that computes
        the error on every batch of the testing set

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: It is a list that contains all the datasets;
                        it has to contain three pairs, `train`,
                        `valid`, `test` in this order, where each pair
                        is formed of two Theano variables, one for the
                        datapoints, the other for the labels
        :type batch_size: int
        :param batch_size: size of a minibatch; must equal the batch size
                           the network input was reshaped with
        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage
        '''

        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # number of whole minibatches for validation and testing; floor
        # division keeps the counts integral so they can be fed to range()
        n_valid_batches = (valid_set_x.get_value(borrow=True).shape[0]
                           // batch_size)
        n_test_batches = (test_set_x.get_value(borrow=True).shape[0]
                          // batch_size)

        index = T.lscalar('index')  # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = [(param, param - gparam * learning_rate)
                   for param, gparam in zip(self.params, gparams)]

        batch = slice(index * batch_size, (index + 1) * batch_size)

        train_fn = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={self.x: train_set_x[batch],
                    self.y: train_set_y[batch]})

        test_score_i = theano.function(
            [index], self.errors,
            givens={self.x: test_set_x[batch],
                    self.y: test_set_y[batch]})

        valid_score_i = theano.function(
            [index], self.errors,
            givens={self.x: valid_set_x[batch],
                    self.y: valid_set_y[batch]})

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in range(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in range(n_test_batches)]

        return train_fn, valid_score, test_score
# Store predictions at the right positions in the result vector. predictions[cat_indices_te] = predictions_cat.reshape( predictions[cat_indices_te].shape) return predictions if __name__ == "__main__": # Import data y_train, x_train, ids_train = helper.load_csv_data('train.csv') y_test, x_test, ids_test = helper.load_csv_data('test.csv') y_train[y_train < 0] = 0 # Define 1 model per category models = [ LogisticRegression(degree=3, gamma=0.1), LogisticRegression(degree=6, gamma=0.1), LogisticRegression(degree=6, gamma=0.1), LogisticRegression(degree=6, gamma=0.1) ] # Train and predict predictions = train_predict_categories(y_train, x_train, x_test, *models) # Prepare for export predictions[predictions == 0] = -1 # Export results helper.create_csv_submission(ids_test, predictions, 'predictions.csv')
# In[ ]:

## Layer 2 : fully-connected hidden layer ##
# layer1.output is flattened to two dimensions so that every example
# becomes one row; n_in below (50*4*4) is the resulting row length and
# the layer has 500 hidden units.
layer2_h_input = layer1.output.flatten(2)
layer2_h = HiddenLayer(
    rng=rng,
    input=layer2_h_input,
    n_in=50 * 4 * 4,
    n_out=500,
)


# In[ ]:

## Layer 3 : output layer (logistic regression, 500 inputs -> 10 outputs) ##
layer3_o = LogisticRegression(
    input=layer2_h.output,
    n_in=500,
    n_out=10,
)


# In[ ]:

## Cost and gradients ##
cost = layer3_o.neg_log_likelihood(y)

# All trainable parameters, output layer first, then the gradient of the
# cost with respect to each of them.
params = (layer3_o.params
          + layer2_h.params
          + layer1.params
          + layer0.params)
gparams = T.grad(cost=cost, wrt=params)


# In[ ]:

## Updates ##
def test_cnn(trainpath, trainlist, validset, dumppath, learning_rate=0.01,
             n_epochs=200, batch_size=100, earlystop=True):
    """Train a two-stage conv/pool network with SGD and checkpoint it.

    The model is ConvPoolLayer -> ConvPoolLayer -> HiddenLayer (ReLU) ->
    LogisticRegression on 12x12 inputs.  The validation error is measured
    every `validation_frequency` minibatches.  The parameters, the
    validation history and the best validation loss are written to
    `dumppath` with ``numpy.savez`` whenever a new best validation loss is
    found, and once more when training ends.

    :param trainpath: forwarded unchanged to ``loadmat``
    :param trainlist: forwarded unchanged to ``loadmat``
    :param validset: forwarded unchanged to ``loadmat``
    :param dumppath: file target handed to ``numpy.savez``

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type batch_size: int
    :param batch_size: number of examples per minibatch

    :type earlystop: bool
    :param earlystop: if True, stop as soon as the patience is exhausted;
                      if False, keep training for all `n_epochs`
    """
    rng = numpy.random.RandomState(123)

    # NOTE(review): `shuffle`, `datasel`, `scaling` and `robust` are not
    # defined in this function; presumably module-level settings -- confirm.
    datasets = loadmat(trainpath=trainpath, trainlist=trainlist,
                       validset=validset, shuffle=shuffle, datasel=datasel,
                       scaling=scaling, robust=robust)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]

    # Number of whole minibatches.  Floor division keeps these integers
    # even under true division; they are used as loop bounds below.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as 1D vector of
                         # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # H - height; W - width.
    # Input (idim), filter (fdim) and pooling (pdim) sizes for a chromagram
    # input.  For a note salience matrix the original author used
    # idim0 = (42, 36) and fdim0 = (6, 6) instead.
    idim0_H, idim0_W = 12, 12
    fdim0_H, fdim0_W = 2, 2
    pdim0_H, pdim0_W = 2, 2
    # "valid" convolution followed by pooling: (12 - 2 + 1) // 2 = 5
    idim1_H = (idim0_H - fdim0_H + 1) // pdim0_H
    idim1_W = (idim0_W - fdim0_W + 1) // pdim0_W
    fdim1_H, fdim1_W = 2, 2
    pdim1_H, pdim1_W = 2, 2
    # (5 - 2 + 1) // 2 = 2
    idim2_H = (idim1_H - fdim1_H + 1) // pdim1_H
    idim2_W = (idim1_W - fdim1_W + 1) // pdim1_W
    fdim2 = 800          # width of the fully-connected layer
    nkerns = [20, 20]    # number of kernels in each conv/pool layer

    # Reshape the matrix of rasterized inputs of shape
    # (batch_size, 12 * 12) to a 4D tensor, compatible with ConvPoolLayer.
    layer0_input = x.reshape((batch_size, 1, idim0_H, idim0_W))

    # First convolutional pooling layer:
    # filtering reduces the size to (12-2+1, 12-2+1) = (11, 11),
    # maxpooling reduces this further to (5, 5), so the 4D output tensor
    # has shape (batch_size, nkerns[0], 5, 5).
    layer0 = ConvPoolLayer(rng,
                           input=layer0_input,
                           input_shape=(batch_size, 1, idim0_H, idim0_W),
                           filter_shape=(nkerns[0], 1, fdim0_H, fdim0_W),
                           poolsize=(pdim0_H, pdim0_W))

    # Second convolutional pooling layer:
    # filtering reduces the size to (5-2+1, 5-2+1) = (4, 4),
    # maxpooling reduces this further to (2, 2), so the 4D output tensor
    # has shape (batch_size, nkerns[1], 2, 2).
    layer1 = ConvPoolLayer(rng,
                           input=layer0.output,
                           input_shape=(batch_size, nkerns[0],
                                        idim1_H, idim1_W),
                           filter_shape=(nkerns[1], nkerns[0],
                                         fdim1_H, fdim1_W),
                           poolsize=(pdim1_H, pdim1_W))

    # The HiddenLayer is fully connected and operates on 2D matrices of
    # shape (batch_size, num_pixels); flattening gives
    # (batch_size, nkerns[1] * 2 * 2).
    layer2_input = layer1.output.flatten(2)

    # fully-connected layer with ReLU activation
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * idim2_H * idim2_W,
                         n_out=fdim2,
                         activation=T.nnet.relu)

    # The number of classes is derived from the largest training label.
    nclass = max(train_set_y.eval()) + 1
    layer3 = LogisticRegression(input=layer2.output, n_in=fdim2, n_out=nclass)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # symbolic slice selecting minibatch number `index`
    batch = slice(index * batch_size, (index + 1) * batch_size)

    # function computing the mistakes made by the model on a validation
    # minibatch
    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={x: valid_set_x[batch], y: valid_set_y[batch]})

    # all model parameters to be fit by gradient descent, and their gradients
    params = layer3.params + layer2.params + layer1.params + layer0.params
    grads = T.grad(cost, params)

    # Plain SGD: one (param, new value) pair per parameter.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={x: train_set_x[batch], y: train_set_y[batch]})

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters
    patience = 10 * n_train_batches  # look at this many minibatches regardless
    patience_increase = 2            # wait this much longer when a new best
                                     # is found
    improvement_threshold = 0.996    # a relative improvement of this much is
                                     # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    training_history = []
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while epoch < n_epochs:
        if earlystop and done_looping:
            print('early-stopping')
            break

        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            iteration = (epoch - 1) * n_train_batches + minibatch_index
            train_model(minibatch_index)

            if (iteration + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                training_history.append([iteration, this_validation_loss])

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))
                print('iter = %d' % iteration)
                print('patience = %d' % patience)

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience,
                                       iteration * patience_increase)

                    # Record the new best *before* checkpointing so the
                    # saved loss matches the parameters being saved.
                    best_validation_loss = this_validation_loss
                    best_iter = iteration
                    numpy.savez(dumppath,
                                model=params,
                                training_history=training_history,
                                best_validation_loss=best_validation_loss)
                    print('best_validation_loss %f' % best_validation_loss)

            if patience <= iteration:
                done_looping = True
                if earlystop:
                    break

    end_time = timeit.default_timer()

    # final save
    numpy.savez(dumppath,
                model=params,
                training_history=training_history,
                best_validation_loss=best_validation_loss)

    print(('Optimization complete with best validation score of %f %%, '
           'obtained at iteration %i, ') %
          (best_validation_loss * 100., best_iter + 1))
    sys.stderr.write('The fine tuning code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')
def evaluate_lenet5(learning_rate=0.01, n_epochs=200,
                    dataset='../testnn.mat',
                    nkerns=[20, 20], batch_size=100):
    """Train a LeNet-style network with SGD and report its error rates.

    The model is LeNetConvPoolLayer -> LeNetConvPoolLayer -> HiddenLayer
    (tanh) -> LogisticRegression.  Training and validation error are
    printed every `validation_frequency` minibatches; whenever the
    validation error reaches a new best, the test error is evaluated too.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset, forwarded to ``loadmat``

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer

    :type batch_size: int
    :param batch_size: number of examples per minibatch
    """
    rng = numpy.random.RandomState(123)

    # NOTE(review): `shuffle`, `datasel`, `scaling`, `robust`, `earlystop`
    # and all `idim*` / `fdim*` / `pdim*` names used below are not defined
    # in this function; presumably module-level settings -- confirm.
    datasets = loadmat(dataset=dataset, shuffle=shuffle, datasel=datasel,
                       scaling=scaling, robust=robust)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # Number of whole minibatches.  Floor division keeps these integers
    # even under true division; they are used as loop bounds below.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as 1D vector of
                         # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Reshape the matrix of rasterized images of shape
    # (batch_size, idim0_H * idim0_W) to a 4D tensor, compatible with
    # LeNetConvPoolLayer.
    layer0_input = x.reshape((batch_size, 1, idim0_H, idim0_W))

    # First convolutional pooling layer.  Example with 28x28 MNIST images,
    # 5x5 filters and 2x2 pooling: filtering gives (24, 24), pooling gives
    # (12, 12), so the output is (batch_size, nkerns[0], 12, 12).
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, idim0_H, idim0_W),
        filter_shape=(nkerns[0], 1, fdim0_H, fdim0_W),
        poolsize=(pdim0_H, pdim0_W)
    )

    # Second convolutional pooling layer.  Continuing the MNIST example:
    # filtering gives (8, 8), pooling gives (4, 4), so the output is
    # (batch_size, nkerns[1], 4, 4).
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], idim1_H, idim1_W),
        filter_shape=(nkerns[1], nkerns[0], fdim1_H, fdim1_W),
        poolsize=(pdim1_H, pdim1_W)
    )

    # The HiddenLayer is fully connected and operates on 2D matrices of
    # shape (batch_size, num_pixels), hence the flatten.
    layer2_input = layer1.output.flatten(2)

    # fully-connected layer with tanh activation
    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=nkerns[1] * idim2_H * idim2_W,
        n_out=fdim2,
        activation=T.tanh
    )

    # The number of classes is derived from the largest training label.
    nclass = max(train_set_y.eval()) + 1
    layer3 = LogisticRegression(input=layer2.output, n_in=fdim2, n_out=nclass)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # symbolic slice selecting minibatch number `index`
    batch = slice(index * batch_size, (index + 1) * batch_size)

    def error_function(set_x, set_y):
        # Compile a function returning the model's error on one minibatch
        # of the given data set.
        return theano.function(
            [index],
            layer3.errors(y),
            givens={x: set_x[batch], y: set_y[batch]}
        )

    test_model = error_function(test_set_x, test_set_y)
    validate_model = error_function(valid_set_x, valid_set_y)
    train_score = error_function(train_set_x, train_set_y)

    # all model parameters to be fit by gradient descent, and their gradients
    params = layer3.params + layer2.params + layer1.params + layer0.params
    grads = T.grad(cost, params)

    # Plain SGD: one (param, new value) pair per parameter.
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={x: train_set_x[batch], y: train_set_y[batch]}
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters
    patience = 10000               # look at this many minibatches regardless
    patience_increase = 2          # wait this much longer when a new best is
                                   # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while epoch < n_epochs:
        if earlystop and done_looping:
            print('early-stopping')
            break

        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if iteration % 100 == 0:
                print('training @ iter =  %d' % iteration)
            train_model(minibatch_index)

            if (iteration + 1) % validation_frequency == 0:
                # compute zero-one loss on the validation set ...
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                # ... and on the training set.  The training set has
                # n_train_batches minibatches, so that is the bound here.
                training_losses = [train_score(i)
                                   for i in range(n_train_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                this_training_loss = numpy.mean(training_losses)

                print('epoch %i, minibatch %i/%i, training error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_training_loss * 100.))
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience,
                                       iteration * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iteration:
                done_looping = True
                if earlystop:
                    break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    sys.stderr.write('The code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')
def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
             hidden_layers_sizes=[500, 500], n_outs=10, L1_reg=0, L2_reg=0,
             first_layer='grbm', model=None):
    """Build a DBN with a variable number of layers.

    The DBN is an MLP whose intermediate layers each share their weights
    and hidden bias with an RBM.  The RBMs are trained during
    pretraining (which changes the MLP weights as well); fine-tuning
    then runs gradient descent on the MLP cost.

    :type numpy_rng: numpy.random.RandomState
    :param numpy_rng: numpy random number generator used to draw initial
                      weights

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; if None is given one is
                       generated based on a seed drawn from `numpy_rng`

    :type n_ins: int
    :param n_ins: dimension of the input to the DBN

    :type hidden_layers_sizes: list of ints
    :param hidden_layers_sizes: intermediate layers size, must contain
                                at least one value

    :type n_outs: int
    :param n_outs: dimension of the output of the network

    :param L1_reg: weight of the L1 term (sum of absolute weights) added
                   to the fine-tuning cost
    :param L2_reg: weight of the L2 term (sum of squared weights) added
                   to the fine-tuning cost

    :type first_layer: string
    :param first_layer: RBM type paired with the first layer: 'grbm'
                        (GRBM) or 'rbm' (RBM); later layers always use RBM

    :param model: optional sequence of initial parameters laid out as
                  ``[W0, b0, W1, b1, ..., W_out, b_out]``; if None the
                  layers create their own parameters

    :raises ValueError: if `first_layer` is neither 'grbm' nor 'rbm'
    """
    # Fail early with a clear message; an unknown value would otherwise
    # leave the first RBM undefined.
    if first_layer not in ('grbm', 'rbm'):
        raise ValueError(
            "first_layer must be 'grbm' or 'rbm', got %r" % (first_layer,))

    self.sigmoid_layers = []
    self.rbm_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)
    self.L1 = 0
    self.L2_sqr = 0

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 30))

    # allocate symbolic variables for the data
    self.x = T.matrix('x')   # the data is presented as rasterized images
    self.y = T.ivector('y')  # the labels are presented as 1D vector
                             # of [int] labels

    for i in range(self.n_layers):
        # The first layer reads the DBN input; every other layer reads the
        # activation of the hidden layer below it.
        if i == 0:
            input_size = n_ins
            layer_input = self.x
        else:
            input_size = hidden_layers_sizes[i - 1]
            layer_input = self.sigmoid_layers[i - 1].output

        # Optional pre-set parameters: layer i owns entries 2i and 2i + 1.
        if model is None:
            W = None
            b = None
        else:
            W = model[i * 2]
            b = model[i * 2 + 1]

        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    W=W,
                                    b=b,
                                    activation=T.nnet.sigmoid)
        self.sigmoid_layers.append(sigmoid_layer)

        # accumulate the regularization terms over the layer weights
        self.L1 += abs(sigmoid_layer.W).sum()
        self.L2_sqr += (sigmoid_layer.W ** 2).sum()

        # Only the sigmoid layers' parameters are declared parameters of
        # the DBN.  The visible biases of the RBMs belong to those RBMs,
        # not to the DBN.
        self.params.extend(sigmoid_layer.params)

        # RBM sharing weights with this layer: a GRBM (continuous-valued
        # input) for the first layer when requested, a binary RBM
        # otherwise.
        if i == 0 and first_layer == 'grbm':
            rbm_class = GRBM
        else:
            rbm_class = RBM
        rbm_layer = rbm_class(numpy_rng=numpy_rng,
                              theano_rng=theano_rng,
                              input=layer_input,
                              n_visible=input_size,
                              n_hidden=hidden_layers_sizes[i],
                              W=sigmoid_layer.W,
                              hbias=sigmoid_layer.b)
        self.rbm_layers.append(rbm_layer)

    # Logistic layer on top of the MLP; its optional pre-set parameters are
    # the last two entries of `model`.
    if model is None:
        W = None
        b = None
    else:
        W = model[-2]
        b = model[-1]
    self.logLayer = LogisticRegression(
        input=self.sigmoid_layers[-1].output,
        n_in=hidden_layers_sizes[-1],
        W=W,
        b=b,
        n_out=n_outs)
    self.params.extend(self.logLayer.params)
    self.L1 += abs(self.logLayer.W).sum()
    self.L2_sqr += (self.logLayer.W ** 2).sum()

    # Cost for the second phase of training: negative log likelihood of
    # the output layer plus the weighted regularization terms.
    self.finetune_cost = (self.logLayer.negative_log_likelihood(self.y)
                          + L1_reg * self.L1
                          + L2_reg * self.L2_sqr)

    # symbolic variable that points to the number of errors made on the
    # minibatch given by self.x and self.y
    self.errors = self.logLayer.errors(self.y)
    self.predprobs = self.logLayer.p_y_given_x
    self.preds = self.logLayer.y_pred
class SdA(object):
    """Stacked denoising autoencoder class (sdA)

    A stacked denoising autoencoder model is obtained by stacking several
    dAs. The hidden layer of the dA at layer `i` becomes the input of the
    dA at layer `i+1`. The first layer dA gets as input the input of the
    sdA, and the hidden layer of the last dA represents the output. Note
    that after pretraining, the sdA is dealt with as a normal MLP; the dAs
    are only used to initialize the weights.
    """

    def __init__(self, numpy_rng, theano_rng=None, n_in=784,
                 hidden_layers_sizes=[500, 500], n_out=10,
                 corruption_levels=[0., 0.1]):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator, passed to the
                          layers and used to seed `theano_rng` when that
                          is not given

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one
                           is generated based on a seed drawn from
                           `numpy_rng`

        :type n_in: int
        :param n_in: dimension of the input to the sdA

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                                    at least one value

        :type n_out: int
        :param n_out: dimension of the output of the network

        :type corruption_levels: list of float
        :param corruption_levels: amount of corruption to use for each
                                  layer.  Accepted but not used by this
                                  constructor.
        """
        # self.dA_layers stores the denoising autoencoder associated with
        # each layer of the MLP
        self.dA_layers = []
        # self.sigmoid_layers stores the sigmoid layers of the MLP facade
        self.sigmoid_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')   # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
                                 # [int] labels

        #
        # Construct self.n_layers sigmoid layers and self.n_layers denoising
        # layers, where self.n_layers is the depth of our model
        #
        for i in range(self.n_layers):
            # The first layer reads the sdA input; every other layer reads
            # the output of the sigmoid layer below it.
            if i == 0:
                input_size = n_in
                layer_input = self.x
            else:
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            self.sigmoid_layers.append(sigmoid_layer)

            # The parameters of the sigmoid layers are parameters of the
            # sdA.  The visible bias of each dA belongs to that dA only, so
            # it is not added to self.params.
            self.params.extend(sigmoid_layer.params)

            # denoising autoencoder that shares weights and hidden bias
            # with this sigmoid layer
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)
            self.dA_layers.append(dA_layer)

        #
        # Construct a logistic layer on top of the MLP
        #
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_out
        )
        self.params.extend(self.logLayer.params)

        # cost for the second phase of training, defined as the negative
        # log likelihood of the output layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def preTraining_functions(self, trainSetX, batch_size):
        '''Generate one compiled pretraining function per dA layer.

        Each function implements one training step of the dA with the
        same index.  It takes the minibatch index and, optionally, the
        corruption level (default 0.2) and the learning rate (default
        0.1), and returns the dA cost.  To train a dA, call its function
        on every minibatch index.

        :type trainSetX: theano.tensor.TensorType
        :param trainSetX: Shared variable that contains all datapoints
                          used for training the dA

        :type batch_size: int
        :param batch_size: size of a minibatch

        :return: list of compiled functions, in the order of
                 ``self.dA_layers``
        '''
        index = T.lscalar('index')
        corruption_level = T.scalar('corruption')
        learning_rate = T.scalar('lr')

        # symbolic bounds of the minibatch selected by `index`
        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        # The loop variable is deliberately not called `dA`, so the dA
        # class is not shadowed inside this method.
        for dA_layer in self.dA_layers:
            cost, updates = dA_layer.get_cost_updates(corruption_level,
                                                      learning_rate)
            fn = theano.function(
                inputs=[index,
                        theano.Param(corruption_level, default=0.2),
                        theano.Param(learning_rate, default=0.1)],
                outputs=cost,
                updates=updates,
                givens={
                    self.x: trainSetX[batch_begin:batch_end]
                }
            )
            pretrain_fns.append(fn)

        return pretrain_fns

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Build the fine-tuning step and the validation/test scorers.

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: three pairs, `train`, `valid`, `test` in this
                         order; each pair is formed of two theano
                         variables, one for the datapoints, the other for
                         the labels

        :type batch_size: int
        :param batch_size: size of a minibatch

        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage

        :return: ``(train_fn, valid_score, test_score)``.  ``train_fn(i)``
                 applies one SGD update on minibatch ``i`` and returns the
                 fine-tuning cost; ``valid_score()`` / ``test_score()``
                 return the list of per-minibatch errors over the whole
                 validation / test set.
        '''
        trainSetX, trainSetY = datasets[0]
        validSetX, validSetY = datasets[1]
        testSetX, testSetY = datasets[2]

        n_valid_batches = (
            validSetX.get_value(borrow=True).shape[0] // batch_size)
        n_test_batches = (
            testSetX.get_value(borrow=True).shape[0] // batch_size)

        index = T.lscalar('index')
        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size

        # Plain SGD: param <- param - learning_rate * d(cost)/d(param)
        gparams = T.grad(self.finetune_cost, self.params)
        updates = [(param, param - learning_rate * gparam)
                   for param, gparam in zip(self.params, gparams)]

        # The label variable is `self.y`, the only one created in
        # __init__, so that is what the givens substitute.
        train_fn = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x: trainSetX[batch_begin:batch_end],
                self.y: trainSetY[batch_begin:batch_end]
            },
            name='train'
        )

        valid_score_i = theano.function(
            inputs=[index],
            outputs=self.errors,
            givens={
                self.x: validSetX[batch_begin:batch_end],
                self.y: validSetY[batch_begin:batch_end]
            },
            name='valid'
        )

        test_score_i = theano.function(
            inputs=[index],
            outputs=self.errors,
            givens={
                self.x: testSetX[batch_begin:batch_end],
                self.y: testSetY[batch_begin:batch_end]
            },
            name='test'
        )

        def valid_score():
            # scan the entire validation set
            return [valid_score_i(i) for i in range(n_valid_batches)]

        def test_score():
            # scan the entire test set
            return [test_score_i(i) for i in range(n_test_batches)]

        return train_fn, valid_score, test_score
def solve_CNN(datapath, batch=500, n_hidden=5, n_out=10, n_epoch=3,
              learning_rate=0.54):
    """Train a small CNN with SGD and print its validation error.

    The model is ConvPoolLayer -> ConvPoolLayer -> HiddenLayer ->
    LogisticRegression on inputs reshaped to (batch, 3, 32, 32).  The
    second data split returned by ``upload()`` is used for validation.

    :param datapath: accepted but not used by this function; the data
                     comes from ``upload()``
    :param batch: number of examples per minibatch
    :param n_hidden: number of units in the fully-connected layer
    :param n_out: number of output classes
    :param n_epoch: maximal number of epochs to run
    :param learning_rate: SGD step size
    """
    x = T.dmatrix('x')
    y = T.ivector('y')
    index = T.iscalar('index')
    kernal = (50, 30)  # number of filters in the two conv/pool layers

    cifar_data = upload()
    train, test = cifar_data

    print('data being converted to theano-shared............ ')
    train_x, train_y = to_shared(train)
    test_x, test_y = to_shared(test)

    # number of whole minibatches in each split
    n_train_batch = train[0].shape[0] // batch
    n_valid_batch = test[0].shape[0] // batch

    rng = np.random.RandomState(123)

    layer0_input = x.reshape((batch, 3, 32, 32))
    layer0 = ConvPoolLayer(
        input=layer0_input,
        rng=rng,
        filter_shape=(kernal[0], 3, 5, 5),
    )
    layer1 = ConvPoolLayer(
        input=layer0.output,
        rng=rng,
        filter_shape=(kernal[1], kernal[0], 5, 5),
    )

    layer2_input = layer1.output.flatten(2)
    layer2 = HiddenLayer(
        input=layer2_input,
        rng=rng,
        n_out=n_hidden,
        n_in=kernal[1] * 5 * 5,
    )
    layer3 = LogisticRegression(input=layer2.output,
                                n_in=n_hidden,
                                n_out=n_out)

    # error on one validation minibatch
    fun_valid = theano.function(
        inputs=[index],
        outputs=layer3.error(y),
        givens=[(x, test_x[index * batch:(index + 1) * batch, :]),
                (y, test_y[index * batch:(index + 1) * batch])])

    cost = layer3.negative_log_likelihood(y)
    params = layer0.params + layer1.params + layer2.params + layer3.params
    grad_all = T.grad(cost=cost, wrt=params)

    # Plain SGD: one (param, new value) pair per parameter.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grad_all)]

    # one training step on one minibatch; returns nothing
    fun_train = theano.function(
        inputs=[index],
        outputs=[],
        updates=updates,
        givens=[(x, train_x[index * batch:(index + 1) * batch, :]),
                (y, train_y[index * batch:(index + 1) * batch])])

    ################
    #TRAINING MODEL#
    ################
    print('training starts now -->')
    patience = 5000
    patience_increase = 2
    improvement = 0.995
    validation_frequency = min(n_train_batch, patience // 2)
    least_error = np.inf  # `np.Inf` was removed in NumPy 2.0
    epoch = 0
    done_looping = False
    this_error = 0
    # Defined up front so the checks after the batch loop also work when
    # there are no training batches.
    total_batches = 0

    start_time = timeit.default_timer()
    print('EPOCH counting .....')
    while epoch < n_epoch and (not done_looping):
        for current_batch in range(n_train_batch):
            total_batches = epoch * n_train_batch + current_batch
            fun_train(current_batch)

            if (total_batches + 1) % validation_frequency == 0:
                # mean error over all validation minibatches
                this_error = np.mean([fun_valid(n)
                                      for n in range(n_valid_batch)])
                print(this_error)
                if this_error < least_error * improvement:
                    least_error = this_error
                    patience = max(patience,
                                   total_batches * patience_increase)

            # The flag ends training after the current epoch; the batch
            # loop itself is not interrupted.
            if total_batches > patience:
                done_looping = True

        epoch += 1
        if total_batches != 0:
            print(this_error)
            print(epoch)
            # NOTE(review): `save` is not defined in this function;
            # presumably a module-level mapping of epoch -> error -- confirm.
            save[epoch] = this_error

    print('the error is %f' % least_error)
    print('the total number of epoch %d' % epoch)
    end_time = timeit.default_timer()
    t = end_time - start_time
    print('total time = %f sec' % t)
    # No per-epoch time can be reported when no epoch was run.
    if epoch:
        print('time per epoch = %f sec/epoch' % (t / epoch))
class SdA(object):
    """Stacked denoising auto-encoder class (SdA)

    A stacked denoising autoencoder model is obtained by stacking several
    dAs. The hidden layer of the dA at layer `i` becomes the input of
    the dA at layer `i+1`. The first layer dA gets as input the input of
    the SdA, and the hidden layer of the last dA represents the output.
    Note that after pretraining, the SdA is dealt with as a normal MLP,
    the dAs are only used to initialize the weights.
    """

    def __init__(
        self,
        numpy_rng,
        theano_rng=None,
        n_ins=784,
        hidden_layers_sizes=[500, 500],
        n_outs=10,
        corruption_levels=[0.1, 0.1]
    ):
        """ This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `numpy_rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the sdA

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type corruption_levels: list of float
        :param corruption_levels: amount of corruption to use for each
                                  layer; accepted but not read by this
                                  constructor
        """
        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')   # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
                                 # [int] labels

        # The SdA is an MLP whose intermediate layers each share their
        # weights with a denoising autoencoder.  The MLP is built layer by
        # layer and, for every sigmoid layer, a dA tied to the same W and
        # hidden bias is created.  Pretraining the dAs therefore changes the
        # MLP weights; fine-tuning then runs SGD on the MLP itself.
        for i, layer_size in enumerate(hidden_layers_sizes):
            # the first layer is fed by the SdA input, every other layer by
            # the activation of the layer below it
            if i == 0:
                input_size = n_ins
                layer_input = self.x
            else:
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=layer_size,
                                        activation=T.nnet.sigmoid)
            self.sigmoid_layers.append(sigmoid_layer)
            # only the sigmoid layers' parameters count as parameters of the
            # SdA; the visible biases of the dAs belong to the dAs alone
            self.params.extend(sigmoid_layer.params)

            # denoising autoencoder sharing weights with this layer
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=layer_size,
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)
            self.dA_layers.append(dA_layer)

        # logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs
        )
        self.params.extend(self.logLayer.params)

        # cost of the second (fine-tuning) phase: negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # symbolic expression for the errors made on the minibatch given by
        # self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, train_set_x, batch_size):
        ''' Generates a list of functions, each of them implementing one
        step in training the dA corresponding to the layer with same index.
        Each function takes the minibatch index and, optionally, the
        corruption level (default 0.2) and the learning rate (default 0.1);
        to train a dA, call its function on all minibatch indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared variable that contains all datapoints used
                            for training the dA

        :type batch_size: int
        :param batch_size: size of a [mini]batch
        '''
        index = T.lscalar('index')  # index to a minibatch
        corruption_level = T.scalar('corruption')  # % of corruption to use
        learning_rate = T.scalar('lr')  # learning rate to use
        # beginning and end of the batch selected by `index`
        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        # loop variable deliberately not called `dA`, which would shadow the
        # dA class inside this method
        for da_layer in self.dA_layers:
            cost, updates = da_layer.get_cost_updates(corruption_level,
                                                      learning_rate)
            # NOTE(review): theano.Param may be unavailable in newer Theano
            # releases (theano.In is the usual replacement) -- confirm
            # against the installed version before changing
            fn = theano.function(
                inputs=[
                    index,
                    theano.Param(corruption_level, default=0.2),
                    theano.Param(learning_rate, default=0.1)
                ],
                outputs=cost,
                updates=updates,
                givens={
                    self.x: train_set_x[batch_begin: batch_end]
                }
            )
            pretrain_fns.append(fn)

        return pretrain_fns

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Generates a function `train` that implements one step of
        finetuning, a function `validate` that computes the error on
        a batch from the validation set, and a function `test` that
        computes the error on a batch from the testing set

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: a list that contains all the datasets;
                         it has to contain three pairs, `train`,
                         `valid`, `test` in this order, where each pair
                         is formed of two Theano variables, one for the
                         datapoints, the other for the labels

        :type batch_size: int
        :param batch_size: size of a minibatch

        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage

        :returns: ``(train_fn, valid_score, test_score)``; the last two take
                  no argument and return the list of per-batch errors
        '''
        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # number of full minibatches for validation and testing; floor
        # division keeps these integers under true division as well
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches //= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches //= batch_size

        index = T.lscalar('index')  # index to a [mini]batch

        # gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # list of fine-tuning updates (plain gradient descent)
        updates = [
            (param, param - gparam * learning_rate)
            for param, gparam in zip(self.params, gparams)
        ]

        train_fn = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x: train_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: train_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            },
            name='train'
        )

        test_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: test_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: test_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            },
            name='test'
        )

        valid_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: valid_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: valid_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            },
            name='valid'
        )

        # scan the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in range(n_valid_batches)]

        # scan the entire test set
        def test_score():
            return [test_score_i(i) for i in range(n_test_batches)]

        return train_fn, valid_score, test_score
# The X_test / Y_test names are reset here; evaluation below reads X2 / Y2.
X_test = Y_test = None
print("Non 0-1 labels removed from testing dataset!")

# --- training: fit the three models on the same (X, Y) ---
print("\nTraining SVM on MNIST dataset...")
svm = SupportVectorMachine()
svm.train(X, Y, 1)
print("SVM trained!")

print("\nTraining Linear Regression on MNIST dataset...")
linear = LinearRegression()
linear.train(X, Y)
print("Linear regression trained!")

print("\nTraining Logistic Regression on MNIST dataset...")
logistic = LogisticRegression()
logistic.train(X, Y)
print("Logistic regression trained!")

# --- evaluation: SVM ---
print("\nRunning SVM on test data...")
misclassified = svm.test(X2, Y2)
print("Generalization Error:", round(misclassified / Y2.size, 3))
print("Misclassified:", misclassified, "/", Y2.size)
print(
    "Accuracy (on test data):",
    round((1 - (misclassified / Y2.size)) * 100, 3),
    "%",
)

# --- evaluation: linear regression ---
print("\nRunning Linear Regression on test data...")
misclassified = linear.test(X2, Y2)
print("Generalization Error:", round(misclassified / Y2.size, 3))
print("Misclassified:", misclassified, "/", Y2.size)
def solve_CNN(datapath, batch=500, n_hidden=5, n_out=10, n_epoch=3, learning_rate=0.54):
    """Build a two-stage convolutional network in Theano and train it with SGD.

    The graph is ``ConvPoolLayer`` -> ``ConvPoolLayer`` -> ``HiddenLayer`` ->
    ``LogisticRegression``.  Data is obtained from ``upload()`` and wrapped
    with ``to_shared``; the second split returned by ``upload()`` is the one
    the validation function is compiled against.  Nothing is returned:
    progress is printed and the most recent validation error of every epoch
    is written to ``save[epoch]``.

    :param datapath: kept for interface compatibility; it is not read here
                     (the data comes from ``upload()``)

    :type batch: int
    :param batch: number of examples per mini-batch

    :type n_hidden: int
    :param n_hidden: number of units of the fully connected hidden layer

    :type n_out: int
    :param n_out: number of outputs of the logistic-regression layer

    :type n_epoch: int
    :param n_epoch: maximum number of passes over the training batches

    :type learning_rate: float
    :param learning_rate: step size of the plain gradient-descent updates
    """
    x = T.dmatrix('x')
    y = T.ivector('y')
    index = T.iscalar('index')
    # leading entry of each conv filter_shape (presumably the number of
    # feature maps per stage -- confirm against ConvPoolLayer)
    kernal = (50, 30)

    cifar_data = upload()
    train, test = cifar_data
    print('data being converted to theano-shared............ ')
    train_x, train_y = to_shared(train)
    test_x, test_y = to_shared(test)

    # only full mini-batches are used; a trailing partial batch is dropped
    n_train_batch = train[0].shape[0] // batch
    n_valid_batch = test[0].shape[0] // batch

    rng = np.random.RandomState(123)

    layer0_input = x.reshape((batch, 3, 32, 32))
    layer0 = ConvPoolLayer(
        input=layer0_input,
        rng=rng,
        filter_shape=(kernal[0], 3, 5, 5),
    )
    layer1 = ConvPoolLayer(
        input=layer0.output,
        rng=rng,
        filter_shape=(kernal[1], kernal[0], 5, 5),
    )
    layer2_input = layer1.output.flatten(2)
    layer2 = HiddenLayer(
        input=layer2_input,
        rng=rng,
        n_out=n_hidden,
        # assumes each of the kernal[1] maps is 5x5 after the second stage
        # -- TODO confirm against ConvPoolLayer's pooling
        n_in=kernal[1] * 5 * 5,
    )
    layer3 = LogisticRegression(input=layer2.output, n_in=n_hidden, n_out=n_out)

    fun_valid = theano.function(
        inputs=[index],
        outputs=layer3.error(y),
        givens=[(x, test_x[index * batch:(index + 1) * batch, :]),
                (y, test_y[index * batch:(index + 1) * batch])])

    cost = layer3.negative_log_likelihood(y)
    params = layer0.params + layer1.params + layer2.params + layer3.params
    grad_all = T.grad(cost=cost, wrt=params)
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grad_all)]

    fun_train = theano.function(
        inputs=[index],
        outputs=[],
        updates=updates,
        givens=[(x, train_x[index * batch:(index + 1) * batch, :]),
                (y, train_y[index * batch:(index + 1) * batch])])

    ################
    #TRAINING MODEL#
    ################
    print('training starts now -->')
    patience = 5000
    patience_increase = 2
    improvement = 0.995
    validation_frequency = min(n_train_batch, patience // 2)
    least_error = np.inf
    epoch = 0
    done_looping = False
    this_error = 0
    # bound up front so the check after the batch loop cannot hit an
    # unbound name when there are no training batches at all
    total_batches = 0

    start_time = timeit.default_timer()
    print('EPOCH counting .....')
    while epoch < n_epoch and (not done_looping):
        for current_batch in range(n_train_batch):
            total_batches = epoch * n_train_batch + current_batch
            fun_train(current_batch)

            if (total_batches + 1) % validation_frequency == 0:
                this_error = [fun_valid(n) for n in range(n_valid_batch)]
                this_error = np.mean(this_error)
                print(this_error)
                if this_error < least_error * improvement:
                    least_error = this_error
                    patience = max(patience, total_batches * patience_increase)

            # the flag only stops the outer loop; the current epoch still
            # runs to its end, as before
            if total_batches > patience:
                done_looping = True

        epoch += 1
        if total_batches != 0:
            print(this_error)
            print(epoch)
            # `save` is not defined in this function; it is expected to be a
            # module-level mapping -- verify against the rest of the file
            save[epoch] = this_error

    print('the error is %f' % least_error)
    print('the total number of epoch %d' % epoch)
    end_time = timeit.default_timer()
    t = end_time - start_time
    print('total time = %f sec' % t)
    # with n_epoch <= 0 no epoch ran, so there is no per-epoch time to report
    if epoch > 0:
        print('time per epoch = %f sec/epoch' % (t / epoch))
def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000,
                           dataset='data/mnist.pkl.gz', batch_size=600):
    """Train a ``LogisticRegression`` classifier with mini-batch SGD.

    The three splits returned by ``data_loader.load(dataset)`` are used for
    training, validation and testing.  The classifier is built with
    ``n_in=28 * 28`` and ``n_out=10``.  Whenever the validation error
    improves, the test error is measured and the classifier is pickled to
    ``best_model.pkl`` in the current directory.  Nothing is returned;
    progress is printed.

    :type learning_rate: float
    :param learning_rate: step size of the gradient-descent updates

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path handed to ``data_loader.load``

    :type batch_size: int
    :param batch_size: number of examples per mini-batch
    """
    training_set, validation_set, testing_set, = data_loader.load(dataset)
    training_set_x, training_set_y = training_set
    validation_set_x, validation_set_y = validation_set
    testing_set_x, testing_set_y = testing_set

    # number of full minibatches for training, validation and testing;
    # floor division keeps them integers under true division as well
    n_train_batches = (
        training_set_x.get_value(borrow=True).shape[0] // batch_size)
    n_valid_batches = (
        validation_set_x.get_value(borrow=True).shape[0] // batch_size)
    n_test_batches = (
        testing_set_x.get_value(borrow=True).shape[0] // batch_size)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = tensor.lscalar()

    # symbolic variables for input (x and y represent a minibatch)
    x = tensor.matrix('x')
    y = tensor.ivector('y')

    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)
    cost = classifier.negative_log_likelihood(y)

    # Theano functions computing the mistakes made by the model on a
    # minibatch of the test / validation split
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: testing_set_x[index * batch_size:(index + 1) * batch_size],
            y: testing_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: validation_set_x[index * batch_size:(index + 1) * batch_size],
            y: validation_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # gradient of cost with respect to theta = (W, b)
    g_W = tensor.grad(cost=cost, wrt=classifier.W)
    g_b = tensor.grad(cost=cost, wrt=classifier.b)

    # update rules for the parameters of the model
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # `train_model` returns the cost and, at the same time, applies the
    # rules defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: training_set_x[index * batch_size:(index + 1) * batch_size],
            y: training_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')
    # early-stopping parameters
    patience = 5000  # look at this many minibatches regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # validate once every five passes over the training batches
    validation_frequency = 5 * n_train_batches

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        for minibatch_index in range(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            # number of minibatches processed before this one
            # (named `iteration` to avoid shadowing the builtin `iter`)
            iteration = epoch * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                # zero-one loss on the validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # best validation score until now
                if this_validation_loss < best_validation_loss:
                    # extend patience only if the improvement is large enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience,
                                       iteration * patience_increase)

                    best_validation_loss = this_validation_loss

                    # evaluate the new best model on the test set
                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)

                    print((' epoch %i, minibatch %i/%i, test error of'
                           ' best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

                    # save the best model; pickles are binary data, so the
                    # file is opened in binary mode
                    with open('best_model.pkl', 'wb') as f:
                        cPickle.dump(classifier, f)

            if patience <= iteration:
                done_looping = True
                break

        epoch = epoch + 1

    end_time = timeit.default_timer()
    print(('Optimization complete with best validation score of %f %%,'
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score * 100.))
    print('The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time)))
features.append((float(col_1), float(col_2), float(col_3), float(col_4), float(col_5), float(col_6))) categories.append(int(col_7)) return features, categories if __name__ == "__main__": train_x, train_y = getInput('./dataForTrainingLogistic.txt') test_x, test_y = getInput('./dataForTestingLogistic.txt') train_x = np.hstack((np.array(train_x), np.ones((len(train_x), 1)))) test_x = np.hstack((np.array(test_x), np.ones(((len(test_x), 1))))) train_y = np.array(train_y).reshape(len(train_y)) test_y = np.array(test_y).reshape(len(test_y)) lr = LogisticRegression(learning_rate=0.00015, initial_w=np.zeros(train_x.shape[1])) # batch gradient descent # history_loss, history_test_loss, history_score,_ = lr.train_gradient_descent( # epoch=150000, epoch_per_round=10000, train_x=train_x, train_y=train_y, test_x=test_x, test_y=test_y) # stochastic gradient descent history_loss, history_test_loss, history_score, _ = lr.train_stochastic_gradient_descent( iteration_num=500000, iter_per_round=100, batch_size=1, train_x=train_x, train_y=train_y, test_x=test_x, test_y=test_y) print('Coefficient:', lr.w) variable_x = range(100, 500001, 100)
def __init__(self, rng, input, n_in, hidden_layers_sizes, n_out, model=None):
    """Initialize the parameters for the multilayer perceptron

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type input: theano.tensor.TensorType
    :param input: symbolic variable that describes the input of the
                  architecture (one minibatch)

    :type n_in: int
    :param n_in: number of input units, the dimension of the space in
                 which the datapoints lie

    :type hidden_layers_sizes: int list
    :param hidden_layers_sizes: number of hidden units in each hidden layer

    :type n_out: int
    :param n_out: number of output units, the dimension of the space in
                  which the labels lie

    :param model: optional flat sequence of initial parameters, read as
                  ``[W_0, b_0, W_1, b_1, ..., W_out, b_out]``; when None
                  every layer is built with ``W=None, b=None``
    """
    self.n_layers = len(hidden_layers_sizes)
    self.hiddenlayers = []
    self.params = []
    # running sums of the L1 norm / squared L2 norm of all weight matrices
    self.L1 = 0
    self.L2_sqr = 0

    use_given_params = model is not None

    # Stack of sigmoid HiddenLayers: the first one reads the network input,
    # every later one reads the output of the layer just below it.
    for depth, width in enumerate(hidden_layers_sizes):
        if depth == 0:
            fan_in, below = n_in, input
        else:
            fan_in = hidden_layers_sizes[depth - 1]
            below = self.hiddenlayers[depth - 1].output

        # W and b of hidden layer `depth` sit at 2*depth and 2*depth + 1
        init_W = model[depth * 2] if use_given_params else None
        init_b = model[depth * 2 + 1] if use_given_params else None

        hidden = HiddenLayer(rng=rng,
                             input=below,
                             n_in=fan_in,
                             n_out=width,
                             W=init_W,
                             b=init_b,
                             activation=T.nnet.sigmoid)
        self.hiddenlayers.append(hidden)
        self.params.extend(hidden.params)
        self.L1 += abs(hidden.W).sum()
        self.L2_sqr += (hidden.W ** 2).sum()

    # The logistic regression layer reads the top hidden layer; its
    # parameters are the last two entries of `model`.
    if use_given_params:
        out_W, out_b = model[-2], model[-1]
    else:
        out_W = out_b = None
    self.logRegressionLayer = LogisticRegression(
        input=self.hiddenlayers[-1].output,
        n_in=hidden_layers_sizes[-1],
        W=out_W,
        b=out_b,
        n_out=n_out)

    # the output layer's weights take part in both regularizers too
    self.L1 += abs(self.logRegressionLayer.W).sum()
    self.L2_sqr += (self.logRegressionLayer.W ** 2).sum()

    # cost, error count and predictions are all delegated to the logistic
    # regression layer
    output_layer = self.logRegressionLayer
    self.negative_log_likelihood = output_layer.negative_log_likelihood
    self.errors = output_layer.errors
    self.predprobs = output_layer.p_y_given_x
    self.preds = output_layer.y_pred

    # model parameters: all hidden layers followed by the output layer
    self.params.extend(output_layer.params)

    # keep track of model input
    self.input = input
def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
             hidden_layers_sizes=[500, 500], n_outs=10, L1_reg=0, L2_reg=0,
             first_layer='grbm', model=None):
    """This class is made to support a variable number of layers.

    :type numpy_rng: numpy.random.RandomState
    :param numpy_rng: numpy random number generator used to draw initial
                      weights

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; if None is given one is
                       generated based on a seed drawn from `numpy_rng`

    :type n_ins: int
    :param n_ins: dimension of the input to the DBN

    :type hidden_layers_sizes: list of ints
    :param hidden_layers_sizes: intermediate layers size, must contain
                                at least one value

    :type n_outs: int
    :param n_outs: dimension of the output of the network

    :type L1_reg: float
    :param L1_reg: weight of the L1 term in the fine-tuning cost

    :type L2_reg: float
    :param L2_reg: weight of the squared-L2 term in the fine-tuning cost

    :type first_layer: string
    :param first_layer: ``'grbm'`` to pair the first layer with a ``GRBM``,
                        ``'rbm'`` to pair it with an ``RBM``; every other
                        layer is always paired with an ``RBM``

    :param model: optional flat sequence of initial parameters, read as
                  ``[W_0, b_0, W_1, b_1, ..., W_out, b_out]``; when None
                  every layer is built with ``W=None, b=None``

    :raises ValueError: if `first_layer` is neither ``'grbm'`` nor ``'rbm'``
    """
    # An unknown value used to leave `rbm_layer` unbound and fail with a
    # NameError while building the first layer; reject it explicitly.
    if first_layer not in ('grbm', 'rbm'):
        raise ValueError(
            "first_layer must be 'grbm' or 'rbm', got %r" % (first_layer,))

    self.sigmoid_layers = []
    self.rbm_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)
    # running sums of the L1 norm / squared L2 norm of all weight matrices
    self.L1 = 0
    self.L2_sqr = 0

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = MRG_RandomStreams(numpy_rng.randint(2**30))

    # allocate symbolic variables for the data
    self.x = T.matrix('x')   # the data is presented as rasterized images
    self.y = T.ivector('y')  # the labels are presented as 1D vector
                             # of [int] labels

    # The DBN is an MLP whose intermediate layers each share their weights
    # with an RBM.  The MLP is built layer by layer and, for every sigmoid
    # layer, an RBM tied to the same W and hidden bias is created.
    # Pretraining the RBMs therefore changes the MLP weights; fine-tuning
    # then runs stochastic gradient descent on the MLP.
    for i, layer_size in enumerate(hidden_layers_sizes):
        # the first layer is fed by the DBN input, every other layer by
        # the activation of the layer below it
        if i == 0:
            input_size = n_ins
            layer_input = self.x
        else:
            input_size = hidden_layers_sizes[i - 1]
            layer_input = self.sigmoid_layers[i - 1].output

        # W and b of hidden layer i sit at positions 2*i and 2*i + 1
        if model is None:
            W = None
            b = None
        else:
            W = model[i * 2]
            b = model[i * 2 + 1]

        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=layer_size,
                                    W=W,
                                    b=b,
                                    activation=T.nnet.sigmoid)
        self.sigmoid_layers.append(sigmoid_layer)
        self.L1 += abs(sigmoid_layer.W).sum()
        self.L2_sqr += (sigmoid_layer.W ** 2).sum()

        # only the sigmoid layers' parameters count as parameters of the
        # DBN; the visible biases of the RBMs belong to the RBMs alone
        self.params.extend(sigmoid_layer.params)

        # RBM sharing weights with this layer: a GRBM only for the first
        # layer when requested, a binary RBM everywhere else
        if i == 0 and first_layer == 'grbm':
            rbm_class = GRBM
        else:
            rbm_class = RBM
        rbm_layer = rbm_class(numpy_rng=numpy_rng,
                              theano_rng=theano_rng,
                              input=layer_input,
                              n_visible=input_size,
                              n_hidden=layer_size,
                              W=sigmoid_layer.W,
                              hbias=sigmoid_layer.b)
        self.rbm_layers.append(rbm_layer)

    # logistic layer on top of the MLP; its parameters are the last two
    # entries of `model`
    if model is None:
        W = None
        b = None
    else:
        W = model[-2]
        b = model[-1]
    self.logLayer = LogisticRegression(
        input=self.sigmoid_layers[-1].output,
        n_in=hidden_layers_sizes[-1],
        W=W,
        b=b,
        n_out=n_outs)
    self.params.extend(self.logLayer.params)
    self.L1 += abs(self.logLayer.W).sum()
    self.L2_sqr += (self.logLayer.W ** 2).sum()

    # cost of the second (fine-tuning) phase: negative log likelihood of
    # the output layer plus the two weighted regularization terms
    self.finetune_cost = (self.logLayer.negative_log_likelihood(self.y)
                          + L1_reg * self.L1
                          + L2_reg * self.L2_sqr)

    # symbolic expression for the errors made on the minibatch given by
    # self.x and self.y
    self.errors = self.logLayer.errors(self.y)
    self.predprobs = self.logLayer.p_y_given_x
    self.preds = self.logLayer.y_pred
def __init__(self, input_dim, nb_experts, output_dim):
    """Create the gating model and one expert model per mixture component.

    The gate is a ``LogisticRegression(input_dim, nb_experts)``; each of the
    ``nb_experts`` experts is a ``LogisticRegression(input_dim, output_dim)``.
    """
    self.nb_experts = nb_experts
    self.output_dim = output_dim
    self.gates = LogisticRegression(input_dim, nb_experts)

    # build the experts one at a time, in index order
    experts = []
    for _ in range(nb_experts):
        experts.append(LogisticRegression(input_dim, output_dim))
    self.experts = experts
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 4 17:29:50 2017

@author: heman
"""

from request_data_link import get
import numpy as np
from logistic import LogisticRegression

# Fetch the raw table; `get` returns two sizes (m, n) and a flat array.
link = 'http://data.princeton.edu/wws509/datasets/copen.raw'
m, n, parsed_data = get(link, 6)

# Drop every 6th value, starting with the first one
# (presumably a row-id column -- verify against `get`).
parsed_data = np.delete(parsed_data, list(range(0, parsed_data.size, 6)))

# Of the remaining values, every 5th one starting at position 4 is a
# target; everything else is reshaped into the (m, n - 1) feature matrix.
target_positions = list(range(4, parsed_data.size, 5))
targets = parsed_data[target_positions]
data = np.delete(parsed_data, target_positions).reshape(m, n - 1)

# keep only `data` and `targets` in the module namespace
del link, m, n, parsed_data, target_positions

# Fit on the full table and evaluate on the same data.
regr = LogisticRegression()
regr.train(data, targets, iter=1000000, step=0.001, lamda=0)
labels, predictions = regr.test(data, targets)
from mnist import MNIST

# MNIST training / test splits, read from the local ./MNIST directory
mndata = MNIST('./MNIST')
trImg, trLab = mndata.load_training()
teImg, teLab = mndata.load_testing()

# turn whatever the loader returned into numpy arrays
trImg, trLab, teImg, teLab = map(
    np.asanyarray, (trImg, trLab, teImg, teLab))

# USPS images, used as a second evaluation set
usps = LoadUSPS.LoadUSPS('proj3_images.zip')
uspsImg, uspsLab = usps.load()

#1> logistic Regression
logistic = LogisticRegression(28 * 28, 10)
logistic.train(trImg, trLab, lr=0.3)
accuracy = logistic.test(teImg, teLab)
uspsacc = logistic.test(uspsImg, uspsLab)
print('logisticregression accuracy :', accuracy, uspsacc)

# grid search for best learning rate performance (disabled)
# NOTE: as written the loop passes lr = 0.1 on every iteration instead of
# the loop variable; pass lr = lr if this is re-enabled.
#for lr in [0.5, 0.3, 0.1, 0.05, 0.01]:
#    logistic.train(trImg, trLab, lr = 0.1)
#    accuracy = logistic.test(teImg, teLab)
#    print(lr, accuracy)

#2> Multilayer perceptron implementation using tensorflow
mlp = MLP.MLP()
def __init__(
    self,
    numpy_rng,
    theano_rng=None,
    n_ins=784,
    hidden_layers_sizes=[500, 500],
    n_outs=10,
    corruption_levels=[0.1, 0.1]
):
    """ Build the stacked network; any number of hidden layers is supported.

    :type numpy_rng: numpy.random.RandomState
    :param numpy_rng: numpy random number generator used to draw initial
                weights

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; if None is given one is
                       generated based on a seed drawn from `numpy_rng`

    :type n_ins: int
    :param n_ins: dimension of the input to the sdA

    :type hidden_layers_sizes: list of ints
    :param hidden_layers_sizes: intermediate layers size, must contain
                           at least one value

    :type n_outs: int
    :param n_outs: dimension of the output of the network

    :type corruption_levels: list of float
    :param corruption_levels: amount of corruption to use for each
                              layer; accepted but not read by this
                              constructor
    """
    self.sigmoid_layers = []
    self.dA_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # symbolic inputs: rasterized images and a 1D vector of int labels
    self.x = T.matrix('x')
    self.y = T.ivector('y')

    # The SdA is an MLP whose intermediate layers each share their weights
    # with a denoising autoencoder.  Each pass of the loop adds one sigmoid
    # layer and the dA tied to it, so pretraining the dAs changes the MLP
    # weights and fine-tuning later runs SGD on the MLP itself.
    for depth, width in enumerate(hidden_layers_sizes):
        # bottom layer reads the SdA input; the others read the layer below
        if depth == 0:
            fan_in, below = n_ins, self.x
        else:
            fan_in = hidden_layers_sizes[depth - 1]
            below = self.sigmoid_layers[-1].output

        hidden = HiddenLayer(rng=numpy_rng,
                             input=below,
                             n_in=fan_in,
                             n_out=width,
                             activation=T.nnet.sigmoid)
        self.sigmoid_layers.append(hidden)
        # only the sigmoid layers' parameters count as parameters of the
        # SdA; the visible biases of the dAs belong to the dAs alone
        self.params.extend(hidden.params)

        # autoencoder tied to this layer's W and hidden bias
        autoencoder = dA(numpy_rng=numpy_rng,
                         theano_rng=theano_rng,
                         input=below,
                         n_visible=fan_in,
                         n_hidden=width,
                         W=hidden.W,
                         bhid=hidden.b)
        self.dA_layers.append(autoencoder)

    # logistic layer on top of the MLP
    top_activation = self.sigmoid_layers[-1].output
    self.logLayer = LogisticRegression(
        input=top_activation,
        n_in=hidden_layers_sizes[-1],
        n_out=n_outs
    )
    self.params.extend(self.logLayer.params)

    # cost of the second (fine-tuning) phase: negative log likelihood
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
    # symbolic expression for the errors made on the minibatch given by
    # self.x and self.y
    self.errors = self.logLayer.errors(self.y)
import numpy as np import matplotlib.pyplot as plt from logistic import LogisticRegression # read data X = np.loadtxt('logistic_x.txt') y = np.loadtxt('logistic_y.txt') # build model lr = LogisticRegression() lr.fit(X, y) y_ = lr.predict(X) # create a mesh to plot in x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 h = 0.1 # step_size xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) data = np.vstack((xx.ravel(), yy.ravel())).T labels = lr.predict(data) # plot fig, ax = plt.subplots() ax.scatter(data[:, 0], data[:, 1], c=np.where(labels == 1, 'green', 'red'), alpha=0.01) plt.title('Decision Boundary of Logistic Regression') ax.scatter(X[y == 1, 0], X[y == 1, 1], c='green',
def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000,
                           dataset='data/mnist.pkl.gz', batch_size=600):
    """Train a logistic-regression classifier with minibatch SGD.

    The dataset is loaded through ``data_loader.load``, a Theano graph is
    built around a ``LogisticRegression`` layer with 28 * 28 inputs and 10
    outputs, and the model is trained with early stopping.  Every time the
    validation error improves, the test error is measured and the classifier
    is pickled to ``best_model.pkl`` in the current working directory.

    :type learning_rate: float
    :param learning_rate: step size of the gradient-descent updates

    :type n_epochs: int
    :param n_epochs: maximal number of passes over the training set

    :type dataset: string
    :param dataset: path handed to ``data_loader.load``

    :type batch_size: int
    :param batch_size: number of examples per minibatch
    """
    training_set, validation_set, testing_set, = data_loader.load(dataset)
    training_set_x, training_set_y = training_set
    validation_set_x, validation_set_y = validation_set
    testing_set_x, testing_set_y = testing_set

    # compute number of minibatches for training, validation and testing;
    # floor division keeps these integers whatever the division semantics,
    # which xrange() below requires
    n_train_batches = \
        training_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = \
        validation_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = \
        testing_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = tensor.lscalar()  # index to a minibatch

    # generate symbolic variables for input (x and y represent a minibatch)
    x = tensor.matrix('x')
    y = tensor.ivector('y')

    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)
    cost = classifier.negative_log_likelihood(y)

    # symbolic slice selecting minibatch number `index`
    batch = slice(index * batch_size, (index + 1) * batch_size)

    # compiling Theano functions that compute the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: testing_set_x[batch],
            y: testing_set_y[batch]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: validation_set_x[batch],
            y: validation_set_y[batch]
        }
    )

    # compute the gradient of cost with respect to theta = (W, b)
    g_W = tensor.grad(cost=cost, wrt=classifier.W)
    g_b = tensor.grad(cost=cost, wrt=classifier.b)

    # update the parameters of the model
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but at
    # the same time updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: training_set_x[batch],
            y: training_set_y[batch]
        }
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')

    # early-stopping parameters
    patience = 5000  # look at this many minibatches regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # number of minibatches between two validation passes
    validation_frequency = 5 * n_train_batches

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        for minibatch_index in xrange(n_train_batches):
            train_model(minibatch_index)

            # number of minibatches processed before this one
            iteration = epoch * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < \
                            best_validation_loss * improvement_threshold:
                        patience = max(patience,
                                       iteration * patience_increase)

                    # update best_validation_loss
                    best_validation_loss = this_validation_loss

                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(
                        (
                            ' epoch %i, minibatch %i/%i, test error of'
                            ' best model %f %%'
                        ) %
                        (
                            epoch,
                            minibatch_index + 1,
                            n_train_batches,
                            test_score * 100.
                        )
                    )

                    # save the best model; pickles are byte streams, so the
                    # file has to be opened in binary mode
                    with open('best_model.pkl', 'wb') as f:
                        cPickle.dump(classifier, f)

            if patience <= iteration:
                done_looping = True
                break

        epoch = epoch + 1

    end_time = timeit.default_timer()
    print(
        (
            'Optimization complete with best validation score of %f %%,'
            ' with test performance %f %%'
        )
        % (best_validation_loss * 100., test_score * 100.)
    )
    print('The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time)))
class DBN(object):
    """Deep Belief Network

    A deep belief network is obtained by stacking several RBMs on top of each
    other. The hidden layer of the RBM at layer `i` becomes the input of the
    RBM at layer `i+1`. The first layer RBM gets as input the input of the
    network, and the hidden layer of the last RBM represents the output. When
    used for classification, the DBN is treated as a MLP, by adding a logistic
    regression layer on top.
    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 L1_reg=0, L2_reg=0, first_layer='grbm', model=None):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                          weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `numpy_rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                                    at least one value (only read, never
                                    mutated)

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :param L1_reg: weight of the L1 penalty added to the finetuning cost

        :param L2_reg: weight of the squared-L2 penalty added to the
                       finetuning cost

        :type first_layer: string
        :param first_layer: kind of RBM paired with the first hidden layer,
                            either 'grbm' or 'rbm'; all other layers always
                            use `RBM`

        :param model: optional sequence of initial parameters, read as
                      ``[W_0, b_0, W_1, b_1, ..., W_log, b_log]``: two
                      entries per hidden layer followed by the two entries
                      of the logistic layer. If None, each layer is built
                      with ``W=None, b=None``.

        :raises ValueError: if `first_layer` is not 'grbm' or 'rbm'
        """
        # Fail early with a clear message: an unknown value used to leave
        # `rbm_layer` unbound and surface as a NameError on the first layer.
        if first_layer not in ('grbm', 'rbm'):
            raise ValueError(
                "first_layer must be 'grbm' or 'rbm', got %r" % (first_layer,))

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)
        self.L1 = 0
        self.L2_sqr = 0

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
                                 # of [int] labels

        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM.  We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well). During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the
        # MLP.
        for i in xrange(self.n_layers):
            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer; likewise the input itself is either the
            # activation of the hidden layer below or the input of the DBN
            if i == 0:
                input_size = n_ins
                layer_input = self.x
            else:
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.sigmoid_layers[i - 1].output

            if model is None:
                W = None
                b = None
            else:
                W = model[i * 2]
                b = model[i * 2 + 1]

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        W=W,
                                        b=b,
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            self.L1 += abs(sigmoid_layer.W).sum()
            self.L2_sqr += (sigmoid_layer.W ** 2).sum()

            # it's arguably a philosophical question...  but we are
            # going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not
            # of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shares weights with this layer: the
            # first layer may be a GRBM (selected by `first_layer`), every
            # subsequent layer is a plain RBM.
            if i == 0 and first_layer == 'grbm':
                rbm_class = GRBM
            else:
                rbm_class = RBM
            rbm_layer = rbm_class(numpy_rng=numpy_rng,
                                  theano_rng=theano_rng,
                                  input=layer_input,
                                  n_visible=input_size,
                                  n_hidden=hidden_layers_sizes[i],
                                  W=sigmoid_layer.W,
                                  hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        if model is None:
            W = None
            b = None
        else:
            W = model[-2]
            b = model[-1]

        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            W=W,
            b=b,
            n_out=n_outs)
        self.params.extend(self.logLayer.params)
        self.L1 += abs(self.logLayer.W).sum()
        self.L2_sqr += (self.logLayer.W ** 2).sum()

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        # plus the weighted L1 and squared-L2 penalties
        self.finetune_cost = (self.logLayer.negative_log_likelihood(self.y)
                              + L1_reg * self.L1
                              + L2_reg * self.L2_sqr)

        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
        self.predprobs = self.logLayer.p_y_given_x
        self.preds = self.logLayer.y_pred

    def pretraining_functions(self, train_set_x, batch_size, cdk,
                              usepersistent):
        '''Generates a list of functions, for performing one step of
        gradient descent at a given layer. The function will require
        as input the minibatch index, and to train an RBM you just
        need to iterate, calling the corresponding function on all
        minibatch indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared var. that contains all datapoints used
                            for training the RBM

        :type batch_size: int
        :param batch_size: size of a [mini]batch

        :param cdk: number of Gibbs steps to do in CD-k / PCD-k

        :param usepersistent: if true, each RBM is trained with a persistent
                              chain (initialised to zeros), otherwise with
                              plain CD-k

        :returns: one compiled function per RBM layer; each takes the
                  minibatch index and an optional learning rate that
                  defaults to 0.1, and returns the cost
        '''
        index = T.lscalar('index')  # index to a minibatch
        learning_rate = T.scalar('lr')  # learning rate to use

        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for rbm in self.rbm_layers:
            # get the cost and the updates list
            # TODO: change cost function to reconstruction error
            if usepersistent:
                # init persistent chain
                persistent_chain = theano.shared(
                    numpy.zeros((batch_size, rbm.n_hidden),
                                dtype=theano.config.floatX),
                    borrow=True)
                cost, updates = rbm.get_cost_updates(
                    learning_rate, persistent=persistent_chain, k=cdk)
            else:
                cost, updates = rbm.get_cost_updates(
                    learning_rate, persistent=None, k=cdk)

            # compile the theano function; theano.In replaces the deprecated
            # theano.Param for declaring an input with a default value
            fn = theano.function(
                inputs=[index, theano.In(learning_rate, value=0.1)],
                outputs=cost,
                updates=updates,
                givens={
                    self.x: train_set_x[batch_begin:batch_end]
                }
            )
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Generates a function `train` that implements one step of
        finetuning, and functions that compute the per-batch errors over
        the whole training, validation and testing sets

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: It is a list that contain all the datasets;
                        the has to contain three pairs, `train`,
                        `valid`, `test` in this order, where each pair
                        is formed of two Theano variables, one for the
                        datapoints, the other for the labels

        :type batch_size: int
        :param batch_size: size of a minibatch

        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage

        :returns: ``(train_fn, train_score, valid_score, test_score)``;
                  `train_fn` takes a minibatch index, the three score
                  callables take no argument and return a list with one
                  error value per minibatch
        '''
        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute number of minibatches for training, validation and testing;
        # floor division keeps the counts integral for xrange()
        n_train_batches = train_set_x.get_value(borrow=True).shape[0]
        n_train_batches //= batch_size
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches //= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches //= batch_size

        index = T.lscalar('index')  # index to a [mini]batch
        # symbolic slice selecting minibatch number `index`
        batch = slice(index * batch_size, (index + 1) * batch_size)

        # compute the gradients with respect to the model parameters
        gparams = [T.grad(self.finetune_cost, param)
                   for param in self.params]

        # compute list of fine-tuning updates
        updates = [
            (param, param - learning_rate * gparam)
            for param, gparam in zip(self.params, gparams)
        ]

        train_fn = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x: train_set_x[batch],
                self.y: train_set_y[batch]
            }
        )

        train_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: train_set_x[batch],
                self.y: train_set_y[batch]
            }
        )

        test_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: test_set_x[batch],
                self.y: test_set_y[batch]
            }
        )

        valid_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: valid_set_x[batch],
                self.y: valid_set_y[batch]
            }
        )

        # Create a function that scans the entire training set
        def train_score():
            return [train_score_i(i) for i in xrange(n_train_batches)]

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_fn, train_score, valid_score, test_score
class MixtureOfExperts:
    """The Mixture of Experts model.

    A gating ``LogisticRegression`` produces a distribution over
    ``nb_experts`` experts for every input row, and each expert is itself a
    ``LogisticRegression`` producing a distribution over ``output_dim``
    outputs.  The model's prediction is the gate-weighted sum of the expert
    predictions.
    """

    def __init__(self, input_dim, nb_experts, output_dim):
        """Create the gating model and ``nb_experts`` expert models."""
        self.nb_experts = nb_experts
        self.output_dim = output_dim
        self.gates = LogisticRegression(input_dim, nb_experts)
        self.experts = [
            LogisticRegression(input_dim, output_dim)
            for _ in range(nb_experts)
        ]

    def pz_given_x(self, x):
        """Return the gate output p(z | x), one row per row of `x`."""
        return self.gates.py_given_x(x)

    def py_given_x(self, x):
        """Return p(y | x) = sum_z p(z | x) * p(y | x, z).

        The result has shape ``(x.shape[0], output_dim)``.
        """
        pz = self.gates.py_given_x(x)
        py = np.zeros((x.shape[0], self.output_dim))
        for z in range(self.nb_experts):
            # pz[:, z:z + 1] keeps the column axis so it broadcasts across
            # the output dimension without materialising a tiled copy
            py += pz[:, z:z + 1] * self.experts[z].py_given_x(x)
        return py

    def py_given_xz(self, x, z):
        """Return the prediction p(y | x, z) of expert number `z`."""
        return self.experts[z].py_given_x(x)

    def lik_y_for_every_z(self, x, y):
        """Return sum(y * p(y | x, z)) over outputs, for every expert.

        `y` is multiplied element-wise with each expert's prediction, so it
        has to be laid out like that prediction (e.g. one-hot rows).  The
        result has shape ``(x.shape[0], nb_experts)``.
        """
        py = np.zeros((x.shape[0], self.nb_experts))
        for z in range(self.nb_experts):
            py[:, z] = np.sum(y * self.py_given_xz(x, z), axis=1)
        return py

    def pz_given_xy(self, x, y):
        """Return the posterior p(z | x, y) over experts.

        This is the per-expert likelihood times the gate output, with every
        row renormalised to sum to one.  A row whose total is zero is
        divided by zero and is not handled specially.
        """
        joint = self.lik_y_for_every_z(x, y) * self.pz_given_x(x)
        # keepdims keeps the row totals as a column so the division
        # broadcasts over the experts
        return joint / np.sum(joint, axis=1, keepdims=True)

    def sample_y_given_x(self, x):
        """Draw one multinomial (one-hot) sample per row from p(y | x)."""
        py = self.py_given_x(x)
        return np.array(
            [np.random.multinomial(1, py[i, :]) for i in range(x.shape[0])])

    def log_likelihood(self, x, y):
        """Return sum over rows of log(sum_z p(z | x) * lik(y | x, z))."""
        lik_y = self.lik_y_for_every_z(x, y)
        pz_given_x = self.pz_given_x(x)
        return np.sum(np.log(np.sum(pz_given_x * lik_y, 1)))

    def fit(self, x, y, method='CG', max_iter=15):
        """Train the model by delegating to ``expectation_maximization2``.

        If `y` is a list it is treated as a list of class indices and
        converted to one-hot rows of width ``output_dim``; any other `y` is
        passed through unchanged.

        NOTE: `method` is kept in the signature for callers that pass it,
        but this method does not forward it to
        ``expectation_maximization2``.

        Returns the three values produced by ``expectation_maximization2``
        as the tuple ``(ll, Q1, Q2)``.
        """
        if isinstance(y, list):
            y = np.eye(self.output_dim)[y]
        ll, Q1, Q2 = expectation_maximization2(self, x, y, max_iter=max_iter)
        return ll, Q1, Q2
class DBN(object):
    """Deep Belief Network

    A deep belief network is obtained by stacking several RBMs on top of each
    other. The hidden layer of the RBM at layer `i` becomes the input of the
    RBM at layer `i+1`. The first layer RBM gets as input the input of the
    network, and the hidden layer of the last RBM represents the output. When
    used for classification, the DBN is treated as a MLP, by adding a logistic
    regression layer on top.
    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 L1_reg=0, L2_reg=0, first_layer='grbm', model=None):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                          weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `numpy_rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                                    at least one value (only read, never
                                    mutated)

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :param L1_reg: weight of the L1 penalty added to the finetuning cost

        :param L2_reg: weight of the squared-L2 penalty added to the
                       finetuning cost

        :type first_layer: string
        :param first_layer: kind of RBM paired with the first hidden layer,
                            either 'grbm' or 'rbm'; all other layers always
                            use `RBM`

        :param model: optional sequence of initial parameters, read as
                      ``[W_0, b_0, W_1, b_1, ..., W_log, b_log]``: two
                      entries per hidden layer followed by the two entries
                      of the logistic layer. If None, each layer is built
                      with ``W=None, b=None``.

        :raises ValueError: if `first_layer` is not 'grbm' or 'rbm'
        """
        # Fail early with a clear message: an unknown value used to leave
        # `rbm_layer` unbound and surface as a NameError on the first layer.
        if first_layer not in ('grbm', 'rbm'):
            raise ValueError(
                "first_layer must be 'grbm' or 'rbm', got %r" % (first_layer,))

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)
        self.L1 = 0
        self.L2_sqr = 0

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = MRG_RandomStreams(numpy_rng.randint(2**30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
                                 # of [int] labels

        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM.  We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well). During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the
        # MLP.
        for i in xrange(self.n_layers):
            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer; likewise the input itself is either the
            # activation of the hidden layer below or the input of the DBN
            if i == 0:
                input_size = n_ins
                layer_input = self.x
            else:
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.sigmoid_layers[i - 1].output

            if model is None:
                W = None
                b = None
            else:
                W = model[i * 2]
                b = model[i * 2 + 1]

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        W=W,
                                        b=b,
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            self.L1 += abs(sigmoid_layer.W).sum()
            self.L2_sqr += (sigmoid_layer.W**2).sum()

            # it's arguably a philosophical question...  but we are
            # going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not
            # of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shares weights with this layer: the
            # first layer may be a GRBM (selected by `first_layer`), every
            # subsequent layer is a plain RBM.
            if i == 0 and first_layer == 'grbm':
                rbm_class = GRBM
            else:
                rbm_class = RBM
            rbm_layer = rbm_class(numpy_rng=numpy_rng,
                                  theano_rng=theano_rng,
                                  input=layer_input,
                                  n_visible=input_size,
                                  n_hidden=hidden_layers_sizes[i],
                                  W=sigmoid_layer.W,
                                  hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        if model is None:
            W = None
            b = None
        else:
            W = model[-2]
            b = model[-1]

        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            W=W,
            b=b,
            n_out=n_outs)
        self.params.extend(self.logLayer.params)
        self.L1 += abs(self.logLayer.W).sum()
        self.L2_sqr += (self.logLayer.W**2).sum()

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        # plus the weighted L1 and squared-L2 penalties
        self.finetune_cost = (self.logLayer.negative_log_likelihood(self.y)
                              + L1_reg * self.L1
                              + L2_reg * self.L2_sqr)

        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
        self.predprobs = self.logLayer.p_y_given_x
        self.preds = self.logLayer.y_pred

    def pretraining_functions(self, train_set_x, batch_size, cdk,
                              usepersistent):
        '''Generates a list of functions, for performing one step of
        gradient descent at a given layer. Each function takes as input an
        integer vector of row indices into `train_set_x` (the minibatch) and
        an optional learning rate; to train an RBM you call the
        corresponding function once per minibatch.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared var. that contains all datapoints used
                            for training the RBM

        :type batch_size: int
        :param batch_size: size of a [mini]batch; here it only sizes the
                           persistent chain
                           (presumably callers pass exactly `batch_size`
                           row indices when `usepersistent` is set - verify
                           against the caller)

        :param cdk: number of Gibbs steps to do in CD-k / PCD-k

        :param usepersistent: if true, each RBM is trained with a persistent
                              chain (initialised to zeros), otherwise with
                              plain CD-k

        :returns: one compiled function per RBM layer; the learning rate
                  input defaults to 0.1 and the function returns the cost
        '''
        bc_idx = T.ivector('bc_idx')  # row indices forming the minibatch
        learning_rate = T.scalar('lr')  # learning rate to use

        pretrain_fns = []
        for rbm in self.rbm_layers:
            # get the cost and the updates list
            # TODO: change cost function to reconstruction error
            if usepersistent:
                # init persistent chain
                persistent_chain = theano.shared(
                    numpy.zeros((batch_size, rbm.n_hidden),
                                dtype=theano.config.floatX),
                    borrow=True)
                cost, updates = rbm.get_cost_updates(
                    learning_rate, persistent=persistent_chain, k=cdk)
            else:
                cost, updates = rbm.get_cost_updates(
                    learning_rate, persistent=None, k=cdk)

            # compile the theano function; theano.In replaces the deprecated
            # theano.Param for declaring an input with a default value
            fn = theano.function(
                inputs=[bc_idx, theano.In(learning_rate, value=0.1)],
                outputs=cost,
                updates=updates,
                givens={self.x: train_set_x[bc_idx]})
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Generates a function `train` that implements one step of
        finetuning, and functions that compute the per-batch errors over
        the whole training, validation and testing sets

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: It is a list that contain all the datasets;
                        the has to contain three pairs, `train`,
                        `valid`, `test` in this order, where each pair
                        is formed of two Theano variables, one for the
                        datapoints, the other for the labels

        :type batch_size: int
        :param batch_size: size of a minibatch used by the score functions

        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage

        :returns: ``(train_fn, train_score, valid_score, test_score)``;
                  `train_fn` takes an integer vector of training-row
                  indices, the three score callables take no argument and
                  return a list with one error value per contiguous
                  minibatch
        '''
        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute number of minibatches for training, validation and testing;
        # floor division keeps the counts integral for xrange()
        n_train_batches = train_set_x.get_value(borrow=True).shape[0]
        n_train_batches //= batch_size
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches //= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches //= batch_size

        index = T.lscalar('index')  # index to a [mini]batch (scoring)
        bc_idx = T.ivector('bc_idx')  # row indices of a minibatch (training)
        # symbolic slice selecting contiguous minibatch number `index`
        batch = slice(index * batch_size, (index + 1) * batch_size)

        # compute the gradients with respect to the model parameters
        gparams = [T.grad(self.finetune_cost, param)
                   for param in self.params]

        # compute list of fine-tuning updates
        updates = [(param, param - learning_rate * gparam)
                   for param, gparam in zip(self.params, gparams)]

        train_fn = theano.function(inputs=[bc_idx],
                                   outputs=self.finetune_cost,
                                   updates=updates,
                                   givens={
                                       self.x: train_set_x[bc_idx],
                                       self.y: train_set_y[bc_idx]
                                   })

        train_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: train_set_x[batch],
                self.y: train_set_y[batch]
            })

        test_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: test_set_x[batch],
                self.y: test_set_y[batch]
            })

        valid_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: valid_set_x[batch],
                self.y: valid_set_y[batch]
            })

        # Create a function that scans the entire training set
        def train_score():
            return [train_score_i(i) for i in xrange(n_train_batches)]

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_fn, train_score, valid_score, test_score
def __init__(self, numpy_rng, theano_rng=None, n_in=784,
             hidden_layers_sizes=[500, 500], n_out=10,
             corruption_levels=[0., 0.1]):
    """Assemble the stacked network: sigmoid layers, dAs and a classifier.

    For every entry of `hidden_layers_sizes` a sigmoid ``HiddenLayer`` is
    created together with a denoising autoencoder that shares that layer's
    weights and hidden bias. A ``LogisticRegression`` layer is then placed
    on top of the last hidden layer.

    :type numpy_rng: numpy.random.RandomState
    :param numpy_rng: numpy random number generator handed to every layer

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; when it is falsy a new one
                       is seeded from `numpy_rng`

    :type n_in: int
    :param n_in: dimension of the input to the sdA

    :type hidden_layers_sizes: list of ints
    :param hidden_layers_sizes: intermediate layers size, must contain
                                at least one value

    :type n_out: int
    :param n_out: dimension of the output of the network

    :type corruption_levels: list of float
    :param corruption_levels: amount of corruption to use for each layer
                              (not read inside this constructor)
    """
    self.params = []
    # sigmoid layers of the MLP facade
    self.sigmoid_layers = []
    # denoising autoencoders paired one-to-one with the sigmoid layers
    self.dA_layers = []

    self.n_layers = len(hidden_layers_sizes)
    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # symbolic inputs: rasterized images and a 1D vector of int labels
    self.x = T.matrix('x')
    self.y = T.ivector('y')

    # Build `self.n_layers` sigmoid layers, each with its companion dA.
    for depth in range(self.n_layers):
        is_bottom = depth == 0
        # the bottom layer reads the network input; every other layer
        # reads the output of the layer directly below it
        fan_in = n_in if is_bottom else hidden_layers_sizes[depth - 1]
        below = self.x if is_bottom else self.sigmoid_layers[-1].output
        fan_out = hidden_layers_sizes[depth]

        hidden = HiddenLayer(rng=numpy_rng,
                             input=below,
                             n_in=fan_in,
                             n_out=fan_out,
                             activation=T.nnet.sigmoid)
        self.sigmoid_layers.append(hidden)

        # Only the sigmoid layers' parameters count as parameters of the
        # sdA; the visible bias of each dA stays private to that dA and
        # is therefore not added to self.params.
        self.params.extend(hidden.params)

        # denoising autoencoder tied to this layer's W and hidden bias
        autoencoder = dA(numpy_rng=numpy_rng,
                         theano_rng=theano_rng,
                         input=below,
                         n_visible=fan_in,
                         n_hidden=fan_out,
                         W=hidden.W,
                         bhid=hidden.b)
        self.dA_layers.append(autoencoder)

    # logistic layer on top of the MLP
    top_activation = self.sigmoid_layers[-1].output
    self.logLayer = LogisticRegression(
        input=top_activation,
        n_in=hidden_layers_sizes[-1],
        n_out=n_out
    )
    self.params.extend(self.logLayer.params)

    # cost of the second (finetuning) phase: negative log likelihood
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
    # symbolic number of errors made on the minibatch given by
    # self.x and self.y
    self.errors = self.logLayer.errors(self.y)
def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
                    dataset='mnist.pkl.gz',
                    nkerns=[20, 50], batch_size=500):
    """Demonstrates LeNet on the MNIST dataset.

    Builds two convolution + max-pooling layers, a tanh hidden layer with
    500 units and a 10-way logistic-regression output, then trains the
    network with minibatch SGD and early stopping, printing progress.

    :type learning_rate: float
    :param learning_rate: step size of the gradient-descent updates

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path handed to ``load_data``

    :type nkerns: list of ints
    :param nkerns: number of kernels on each convolutional layer (only
                   read, never mutated)

    :type batch_size: int
    :param batch_size: number of examples per minibatch
    """
    rng = numpy.random.RandomState(1234)

    print('Loading Data' + '.' * 20)
    datasets = load_data(dataset)

    trainSetX, trainSetY = datasets[0]
    validSetX, validSetY = datasets[1]
    testSetX, testSetY = datasets[2]

    n_train_batches = trainSetX.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = validSetX.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = testSetX.get_value(borrow=True).shape[0] // batch_size

    print('Building Data' + '.' * 20)

    index = T.lscalar('index')
    x = T.matrix('x')
    y = T.ivector('y')

    # Reshape matrix of rasterized images of shape (batch_size, 28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # construct the first convolutional pooling layer
    # filtering reduces the image size to (28-5+1, 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(
        rng=rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 28, 28),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2)
    )

    # construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(
        rng=rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2)
    )

    # flatten the 4D feature maps into one row per example for the
    # fully-connected layer
    layer2_input = layer1.output.flatten(2)

    layer2 = HiddenLayer(
        rng=rng,
        input=layer2_input,
        n_in=nkerns[1] * 4 * 4,
        n_out=500,
        activation=T.tanh
    )

    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # symbolic slice selecting minibatch number `index`
    batch = slice(index * batch_size, (index + 1) * batch_size)

    testModel = theano.function(
        inputs=[index],
        outputs=layer3.errors(y),
        givens={
            x: testSetX[batch],
            y: testSetY[batch]
        }
    )

    validModel = theano.function(
        inputs=[index],
        outputs=layer3.errors(y),
        givens={
            x: validSetX[batch],
            y: validSetY[batch]
        }
    )

    params = layer3.params + layer2.params + layer1.params + layer0.params
    cost = layer3.negative_log_likelihood(y)
    grads = T.grad(cost, params)

    updates = [
        (param, param - learning_rate * grad)
        for param, grad in zip(params, grads)
    ]

    trainModel = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: trainSetX[batch],
            y: trainSetY[batch]
        }
    )

    print('Training' + '.' * 20)

    # early-stopping parameters
    patience = 10000  # look at this many minibatches regardless
    patience_increase = 2  # wait this much longer when a new best is found
    # a new best only counts as significant when it is below this fraction
    # of the previous best
    improvement_threshold = 0.995
    # validate at least once per epoch; floor division keeps this an int
    validation_frequence = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch += 1
        for mini_batch_index in range(n_train_batches):
            iteration = (epoch - 1) * n_train_batches + mini_batch_index

            if iteration % 100 == 0:
                print('training @ iter = ', iteration)
            trainModel(mini_batch_index)

            if (iteration + 1) % validation_frequence == 0:
                validation_losses = [validModel(i)
                                     for i in range(n_valid_batches)]
                this_validation_losses = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, mini_batch_index + 1, n_train_batches,
                       this_validation_losses * 100))

                if this_validation_losses < best_validation_loss:
                    # Compare against the PREVIOUS best before overwriting
                    # it, and extend patience relative to the current
                    # iteration so that training can actually stop early.
                    if this_validation_losses < \
                            best_validation_loss * improvement_threshold:
                        patience = max(patience,
                                       iteration * patience_increase)

                    best_validation_loss = this_validation_losses
                    best_iter = iteration

                    test_losses = [testModel(i)
                                   for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print('     epoch %i, minibatch %i/%i, test error of '
                          'best model %f %%' %
                          (epoch, mini_batch_index + 1, n_train_batches,
                           test_score * 100))

            if patience <= iteration:
                done_looping = True
                break

    endtime = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          ' with test performance %f %%' %
          (best_validation_loss * 100, best_iter + 1, test_score * 100.))
    # The elapsed minutes must be computed before formatting: `%` and `/`
    # share precedence, so without the parentheses the formatted string
    # itself would be divided by 60.
    print('The code for file ' + os.path.split(__file__)[1] +
          ' ran for %.2fm' % ((endtime - start_time) / 60.))
class DBN(object):
    """Deep Belief Network built from stacked RBMs and a logistic output layer.

    For every entry in ``hidden_layers_sizes`` the constructor creates a
    sigmoid ``HiddenLayer`` and an ``RBM`` that reuses that layer's weight
    matrix and bias (``W=sigmoid_layer.W, hbias=sigmoid_layer.b``).  A
    ``LogisticRegression`` layer is placed on top of the last hidden layer.

    Attributes set by the constructor:
        sigmoid_layers: list of the hidden sigmoid layers, bottom to top.
        rbm_layers: list of the RBMs, one per hidden layer.
        params: parameters of all sigmoid layers followed by those of the
            logistic layer; these are the ones updated during fine-tuning.
        n_layers: number of hidden layers.
        x, y: symbolic input matrix and integer label vector.
        logLayer: the logistic regression output layer.
        finetune_cost: symbolic fine-tuning cost taken from ``logLayer``.
        errors: symbolic error expression taken from ``logLayer``.
    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=(500, 500), n_outs=10):
        """Build the symbolic graph of the network.

        Args:
            numpy_rng: random generator handed to the hidden layers and RBMs;
                also used to seed ``theano_rng`` when none is supplied.
            theano_rng: symbolic random stream; created from ``numpy_rng``
                when falsy.
            n_ins: dimension of the network input.
            hidden_layers_sizes: sequence with the size of each hidden layer;
                must contain at least one entry.
            n_outs: dimension of the network output.

        Raises:
            AssertionError: if ``hidden_layers_sizes`` is empty.
        """
        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        self.x = T.matrix('x')
        self.y = T.ivector('y')

        for i in range(self.n_layers):
            # The first layer reads the raw input; every later layer reads
            # the output of the hidden layer directly below it.
            if i == 0:
                input_size = n_ins
                layer_input = self.x
            else:
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            self.sigmoid_layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)

            # The RBM shares W and the hidden bias with the sigmoid layer, so
            # pre-training the RBM also moves the sigmoid layer's parameters.
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # NOTE(review): the method name is spelled `negative_log_likehood`;
        # confirm it matches the LogisticRegression class actually in use
        # (other code spells it `negative_log_likelihood`).
        self.finetune_cost = self.logLayer.negative_log_likehood(self.y)
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, train_set_x, batch_size, k):
        """Compile one pre-training function per RBM layer.

        Args:
            train_set_x: training inputs; sliced symbolically per minibatch.
            batch_size: number of rows in a minibatch.
            k: forwarded as ``k`` to ``rbm.get_cost_updates``.

        Returns:
            A list of compiled functions, ordered bottom layer first.  Each
            takes a minibatch ``index`` and an optional learning rate
            (default 0.1), returns the RBM cost and applies its updates.
        """
        index = T.lscalar('index')
        learning_rate = T.scalar('lr')

        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for rbm in self.rbm_layers:
            cost, updates = rbm.get_cost_updates(learning_rate,
                                                 persistent=None, k=k)
            fn = theano.function(
                inputs=[index, theano.In(learning_rate, value=0.1)],
                outputs=cost,
                updates=updates,
                givens={self.x: train_set_x[batch_begin:batch_end]})
            pretrain_fns.append(fn)

        return pretrain_fns

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        """Compile the fine-tuning step and the test-error evaluator.

        Args:
            datasets: sequence whose item 0 is ``(train_set_x, train_set_y)``
                and item 1 is ``(test_set_x, test_set_y)``.
            batch_size: number of rows in a minibatch.
            learning_rate: step size of the gradient-descent update.

        Returns:
            A tuple ``(train_fn, valid_score, test_score)``.  ``train_fn``
            performs one gradient step on a minibatch index and returns the
            fine-tuning cost; ``valid_score`` is always ``None`` (no
            validation set is used here); ``test_score()`` returns the list
            of per-minibatch errors on the test set.
        """
        (train_set_x, train_set_y) = datasets[0]
        (test_set_x, test_set_y) = datasets[1]

        # Only whole minibatches are evaluated; a trailing partial batch is
        # dropped by the floor division.
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches //= batch_size

        index = T.lscalar('index')  # index to a [mini]batch

        gparams = T.grad(self.finetune_cost, self.params)
        updates = [(param, param - gparam * learning_rate)
                   for param, gparam in zip(self.params, gparams)]

        train_fn = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x: train_set_x[index * batch_size:
                                    (index + 1) * batch_size],
                self.y: train_set_y[index * batch_size:
                                    (index + 1) * batch_size]
            })

        test_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: test_set_x[index * batch_size:
                                   (index + 1) * batch_size],
                self.y: test_set_y[index * batch_size:
                                   (index + 1) * batch_size]
            })

        def test_score():
            return [test_score_i(i) for i in range(n_test_batches)]

        valid_score = None
        return train_fn, valid_score, test_score

    def predict(self):
        """Predict labels for the test data and write ``submission.csv``.

        The test set is obtained from ``logistic.load_my_test_data()``.  The
        predictions are printed and saved with 1-based ``ImageId`` values
        next to the predicted ``Label``.  Nothing is returned.
        """
        test_set = logistic.load_my_test_data()
        test_set_x = test_set.get_value()

        predict_model = theano.function(inputs=[self.x],
                                        outputs=self.logLayer.y_pred)
        predicted_values = predict_model(test_set_x)

        # Image ids in the submission start at 1, not 0.
        ids = numpy.arange(1, predicted_values.shape[0] + 1)
        print(ids.dtype)
        print(predicted_values)

        df = pd.DataFrame({"ImageId": ids, "Label": predicted_values})
        print(df)
        df.to_csv('submission.csv', index=False, index_label=True)