# allocate symbolic variables for the data
x = T.matrix('x')  # the data is presented as a vector of inputs with many exchangeable examples of this vector
x = clip_gradient(x, 1.0)  # clip gradients flowing back into the input
is_train = T.iscalar('is_train')  # pseudo boolean for switching between training and prediction
rng = numpy.random.RandomState(1234)

# Architecture: input --> LSTM(400) --> LSTM(200) --> linear readout.
# The regression layer gets as input the hidden units of the last LSTM.
n_hidden = 400
lstm_1 = LSTM(rng, x,
              n_in=data_set_x.get_value(borrow=True).shape[1],
              n_out=n_hidden)
lstm_2 = LSTM(rng, lstm_1.output, n_in=n_hidden, n_out=n_hidden - 200)
output = LinearRegression(input=lstm_2.output, n_in=n_hidden - 200,
                          n_out=data_set_x.get_value(borrow=True).shape[1])

################################################
# Load learned params
################################################
# fix: file() builtin replaced with open() inside a context manager so the
# handle is closed even if unpickling raises
with open(params_file, 'rb') as f:
    old_p = cPickle.load(f)

# restore the first LSTM's weights from the saved parameter list
# (indices 0-4 map to W_i, W_f, W_c, W_o, U_i — presumably the order used
# at save time; confirm against the saving code)
lstm_1.W_i.set_value(old_p[0].get_value(), borrow=True)
lstm_1.W_f.set_value(old_p[1].get_value(), borrow=True)
lstm_1.W_c.set_value(old_p[2].get_value(), borrow=True)
lstm_1.W_o.set_value(old_p[3].get_value(), borrow=True)
lstm_1.U_i.set_value(old_p[4].get_value(), borrow=True)
# Architecture: input --> LSTM --> predict one-ahead n_hidden = 1000 lstm_1 = LSTM(rng, x, n_in=data_set_x.get_value(borrow=True).shape[1], n_out=n_hidden) lstm_2 = LSTM(rng, lstm_1.output, n_in=n_hidden, n_out=n_hidden) lstm_3 = LSTM(rng, lstm_2.output, n_in=n_hidden, n_out=n_hidden) output = LinearRegression(input=lstm_3.output, n_in=n_hidden, n_out=data_set_x.get_value(borrow=True).shape[1]) ################################ # Objective function and GD ################################ print 'defining cost, parameters, and learning function...' # the cost we minimize during training is the negative log likelihood of # the model cost = T.mean(output.negative_log_likelihood(y)) #Defining params params = lstm_1.params + lstm_2.params + lstm_3.params + output.params
# Architecture: narrowing LSTM stack 300 --> 200 --> 100 --> linear readout
n_hidden = 300
lstm_1 = LSTM(rng, x, n_in=data_set_x.get_value(borrow=True).shape[1],
              n_out=n_hidden)
lstm_2 = LSTM(rng, lstm_1.output, n_in=n_hidden, n_out=n_hidden - 100)
lstm_3 = LSTM(rng, lstm_2.output, n_in=n_hidden - 100, n_out=n_hidden - 200)
# alternative recurrent layers tried previously:
#lstm_1 = RNN(rng, d_input.output, n_in=data_set_x.get_value(borrow=True).shape[1], n_out=n_hidden) #vanilla rnn
#lstm_1 = hybridRNN(rng, x, n_in=data_set_x.get_value(borrow=True).shape[1], n_lstm=nn_lstm, n_rnn=nn_rnn) #each type must have at least 2
#lstm_2 = hybridRNN(rng, lstm_1.output, n_in=n_hidden, n_lstm=nn_lstm, n_rnn=nn_rnn) #each type must have at least 2
#lstm_1 = IRNN(rng, x, n_in=data_set_x.get_value(borrow=True).shape[1], n_out = n_hidden) #each type must have at least 2
#lstm_2 = IRNN(rng, lstm_1.output, n_in=n_hidden, n_out=n_hidden) #each type must have at least 2
#d_lstm_1 = Dropout(rng, is_train, lstm_1.output)
output = LinearRegression(input=lstm_3.output, n_in=n_hidden - 200,
                          n_out=data_set_x.get_value(borrow=True).shape[1])

# destination for pickled parameters
savefilename = '/vega/stats/users/sl3368/rnn_code/saves/params/lc_1_10_lowpad_LSTM_triple_300_2nd.save'

#######################
# Objective function
#######################
print '... defining objective and compiling test and validate'

# the cost we minimize during training is the negative log likelihood of
# the model, averaged over the minibatch
cost = T.mean(output.negative_log_likelihood(y))
# compiling a Theano function that computes the mistakes that are made
# by the model on a minibatch
print 'Building model...'

# allocate symbolic variables for the data
index = T.lscalar()  # index to a [mini]batch
x = T.matrix('x')  # the data is presented as a vector of inputs with many exchangeable examples of this vector
y = T.imatrix('y')  # integer targets, same layout as x
ahead = T.matrix('ahead')  # one-step-ahead data — not used in the visible code below
sent = T.matrix('sentence')  # sentence-level input — not used in the visible code below
phonemes = T.imatrix('phonemes')  # integer phoneme labels — not used in the visible code below
rng = numpy.random.RandomState(1234)

# input projection 60 -> 30, then two stacked LSTMs
init_reg = LinearRegression(x, 60, 30, True)
lstm_1 = LSTM(rng, init_reg.E_y_given_x, 30, lstm_1_hidden)
lstm_2 = LSTM(rng, lstm_1.output, lstm_1_hidden, lstm_2_hidden)
reg_input = lstm_2.output

# need log_reg and cross covariate layers
# 41-way logistic regression head (presumably 41 phoneme classes — confirm)
log_reg = LogisticRegression(reg_input, lstm_2_hidden, 41)
#lin_reg = LinearRegression(reg_input,lstm_2_hidden,1,True)
# map the head's output back into feature space
log_reg.reconstruct(log_reg.p_y_given_x)
#lin_reg.reconstruct(lin_reg.E_y_given_x)
def SGD_training(learning_rate=1, n_epochs=1000): """ stochastic gradient descent optimization """ dataset_info = load_all_data() data_set_x = dataset_info[0] maxBatchSize = numpy.int_(dataset_info[1]) batch_size = maxBatchSize n_train_batches = 28 #n_valid_batches = 1 #n_test_batches = 1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix( 'x' ) # the data is presented as a vector of inputs with many exchangeable examples of this vector x = clip_gradient(x, 1.0) y = T.matrix( 'y' ) # the data is presented as a vector of inputs with many exchangeable examples of this vector is_train = T.iscalar( 'is_train' ) # pseudo boolean for switching between training and prediction rng = numpy.random.RandomState(1234) ################################################ # Architecture: input --> LSTM --> predict one-ahead ################################################ # The poisson regression layer gets as input the hidden units # of the hidden layer d_input = Dropout(rng, is_train, x) n_hidden = 100 lstm_1 = LSTM(rng, d_input.output, n_in=data_set_x.get_value(borrow=True).shape[1], n_out=n_hidden) #lstm_1 = RNN(rng, d_input.output, n_in=data_set_x.get_value(borrow=True).shape[1], n_out=n_hidden) #vanilla rnn d_lstm_1 = Dropout(rng, is_train, lstm_1.output) output = LinearRegression(input=d_lstm_1.output, n_in=n_hidden, n_out=data_set_x.get_value(borrow=True).shape[1]) ####################### # Objective function ####################### print '... defining objective and compiling test and validate' # the cost we minimize during training is the negative log likelihood of # the model cost = T.mean(output.negative_log_likelihood(y)) # compiling a Theano function that computes the mistakes that are made # by the model on a minibatch # use cost or errors(y,tc,md) as output? 
test_model = theano.function( inputs=[index], outputs=[cost, output.E_y_given_x], givens={ x: data_set_x[index * batch_size:((index + 1) * batch_size - 1)], y: data_set_x[(index * batch_size + 1):(index + 1) * batch_size], is_train: numpy.cast['int32'](0) }) # wanted to use below indexes and have different sized batches, but this didn't work #[int(batchBreaks[index]-1):int(batchBreaks[(index+1)]-1)] validate_model = theano.function( inputs=[index], outputs=cost, givens={ x: data_set_x[index * batch_size:((index + 1) * batch_size - 1)], y: data_set_x[(index * batch_size + 1):(index + 1) * batch_size], is_train: numpy.cast['int32'](0) }) ####################### # Parameters and gradients ####################### print '... parameters and gradients' # create a list (concatenated) of all model parameters to be fit by gradient descent #order: [self.W, self.b] params = lstm_1.params + output.params params_helper = lstm_1.params_helper + output.params_helper params_helper2 = lstm_1.params_helper2 + output.params_helper2 # compute the gradient of cost with respect to theta (sotred in params) # the resulting gradients will be stored in a list gparams gparams = [] for param in params: gparam = T.grad(cost, param) gparams.append(gparam) # specify how to update the parameters of the model as a list of # (variable, update expression) pairs updates = [] # given two list the zip A = [a1, a2, a3, a4] and B = [b1, b2, b3, b4] of # same length, zip generates a list C of same size, where each element # is a pair formed from the two lists : # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)] #for param, gparam in zip(params, gparams): # updates.append((param, param - learning_rate * gparam)) #iter_count = theano.shared(1) #L1_penalized = [] #larger_stepsize = [] #enforce_positive = [2, 3] #if recurrent #enforce_positive = [] #zero_stepsize = [] param_index = 0 #rho = 1e-6 #for param, param_helper, param_helper2, gparam in zip(params, params_helper, params_helper2, gparams): 
#updates.append((param_helper, param_helper + gparam ** 2)) #need sum of squares for learning rate #updates.append((param_helper2, param_helper2 + gparam)) #need sum of gradients for L1 thresholding #vanilla SGD #for param, gparam in zip(params, gparams): # updates.append((param, param - learning_rate * gparam)) # param_index += 1 #adadelta updates rho = .95 eps_big = 1e-6 for param, param_helper, param_helper2, gparam in zip( params, params_helper, params_helper2, gparams): updates.append( (param_helper, rho * param_helper + (1. - rho) * (gparam**2))) #update decaying sum of previous gradients dparam = -T.sqrt( (param_helper2 + eps_big) / (rho * param_helper + (1. - rho) * (gparam**2) + eps_big)) * gparam # calculate step size updates.append( (param_helper2, rho * param_helper2 + (1. - rho) * (dparam**2))) #update decaying sum of previous step sizes updates.append((param, param + dparam)) #updates.append((iter_count, iter_count + 1)) print '... compiling train' # compiling a Theano function `train_model` that returns the cost, but # in the same time updates the parameter of the model based on the rules # defined in `updates` train_model = theano.function( inputs=[index], outputs=cost, updates=updates, givens={ x: data_set_x[index * batch_size:((index + 1) * batch_size - 1)], y: data_set_x[(index * batch_size + 1):(index + 1) * batch_size], is_train: numpy.cast['int32'](0) }) ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 5000 # look as this many examples regardless #patience = train_set_x.get_value(borrow=True).shape[0] * n_epochs #no early stopping patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.99 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch #best_params = None best_validation_loss = numpy.inf best_iter = 0 #test_score = 0. start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): minibatch_avg_cost = train_model(minibatch_index) print minibatch_avg_cost # iteration number iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: # compute absolute error loss on validation set validation_losses = [validate_model(i) for i in [28]] this_validation_loss = numpy.mean( validation_losses) #mean over batches print('epoch %i, minibatch %i, validation error %f' % (epoch, minibatch_index + 1, this_validation_loss)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) best_validation_loss = this_validation_loss best_iter = iter # test it on the test set #test_losses = [test_model(i) for i # in [29]] #test_score = numpy.mean(test_losses) test_cost, test_pred = test_model(29) #test_cost, test_costs_separate, test_pred_separate, test_actual_separate = test_model(29) print((' epoch %i, minibatch %i, test error of ' 'best model %f') % (epoch, minibatch_index + 1, numpy.sum(test_cost))) if patience <= iter: done_looping = True 
break end_time = time.clock() print(('Optimization complete. Best validation score of %f' 'obtained at iteration %i, with test performance %f') % (best_validation_loss, best_iter + 1, numpy.sum(test_cost))) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) #store data f = file('results/params.save', 'wb') for obj in [params + [test_cost] + [test_pred]]: cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) f.close()
def SGD_training(learning_rate=1, n_epochs=1000):
    """Stochastic gradient descent optimization.

    NOTE(review): near-duplicate of the SGD_training defined earlier in
    this file — only the later definition survives at import time; confirm
    which copy is intended.  The updates below are ADADELTA, so
    `learning_rate` is unused.  Trains a dropout-LSTM one-step-ahead
    predictor, prints progress, and pickles the parameters plus final
    test cost/prediction to 'results/params.save'.
    """
    dataset_info = load_all_data()
    data_set_x = dataset_info[0]
    maxBatchSize = numpy.int_(dataset_info[1])
    batch_size = maxBatchSize
    n_train_batches = 28  # batches 0-27 train; batch 28 validates, 29 tests
    #n_valid_batches = 1
    #n_test_batches = 1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as a vector of inputs with many exchangeable examples of this vector
    x = clip_gradient(x, 1.0)  # clip gradients flowing back into the input
    y = T.matrix('y')  # one-step-ahead targets, same layout as x
    is_train = T.iscalar('is_train')  # pseudo boolean for switching between training and prediction
    rng = numpy.random.RandomState(1234)

    ################################################
    # Architecture: input --> LSTM --> predict one-ahead
    ################################################
    # The regression layer gets as input the hidden units of the LSTM,
    # with dropout applied on both the input and the hidden state.
    d_input = Dropout(rng, is_train, x)
    n_hidden = 100
    lstm_1 = LSTM(rng, d_input.output, n_in=data_set_x.get_value(borrow=True).shape[1], n_out=n_hidden)
    #lstm_1 = RNN(rng, d_input.output, n_in=data_set_x.get_value(borrow=True).shape[1], n_out=n_hidden) #vanilla rnn
    d_lstm_1 = Dropout(rng, is_train, lstm_1.output)
    output = LinearRegression(input=d_lstm_1.output, n_in=n_hidden, n_out=data_set_x.get_value(borrow=True).shape[1])

    #######################
    # Objective function
    #######################
    print '... defining objective and compiling test and validate'

    # the cost we minimize during training is the negative log likelihood
    # of the model, averaged over the minibatch
    cost = T.mean(output.negative_log_likelihood(y))

    # x covers rows [i*b, (i+1)*b - 1); y is the same span shifted one row
    # ahead, so the model predicts the next row from the current one
    test_model = theano.function(inputs=[index],
            outputs=[cost, output.E_y_given_x],
            givens={
                x: data_set_x[index * batch_size:((index + 1) * batch_size - 1)],
                y: data_set_x[(index * batch_size + 1):(index + 1) * batch_size],
                is_train: numpy.cast['int32'](0)})

    # wanted to use below indexes and have different sized batches, but this didn't work
    #[int(batchBreaks[index]-1):int(batchBreaks[(index+1)]-1)]
    validate_model = theano.function(inputs=[index],
            outputs=cost,
            givens={
                x: data_set_x[index * batch_size:((index + 1) * batch_size - 1)],
                y: data_set_x[(index * batch_size + 1):(index + 1) * batch_size],
                is_train: numpy.cast['int32'](0)})

    #######################
    # Parameters and gradients
    #######################
    print '... parameters and gradients'

    # concatenated lists of all model parameters to be fit by gradient
    # descent; the *_helper lists are the matching ADADELTA accumulators
    params = lstm_1.params + output.params
    params_helper = lstm_1.params_helper + output.params_helper
    params_helper2 = lstm_1.params_helper2 + output.params_helper2

    # compute the gradient of cost with respect to theta (stored in params);
    # the resulting gradients are collected in gparams
    gparams = []
    for param in params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs
    updates = []
    #for param, gparam in zip(params, gparams):
    #    updates.append((param, param - learning_rate * gparam))
    #iter_count = theano.shared(1)
    #L1_penalized = []
    #larger_stepsize = []
    #enforce_positive = [2, 3] #if recurrent
    #enforce_positive = []
    #zero_stepsize = []
    param_index = 0  # NOTE(review): never incremented on the ADADELTA path below
    #rho = 1e-6
    #for param, param_helper, param_helper2, gparam in zip(params, params_helper, params_helper2, gparams):
    #updates.append((param_helper, param_helper + gparam ** 2)) #need sum of squares for learning rate
    #updates.append((param_helper2, param_helper2 + gparam)) #need sum of gradients for L1 thresholding

    #vanilla SGD (unused alternative)
    #for param, gparam in zip(params, gparams):
    #    updates.append((param, param - learning_rate * gparam))
    #    param_index += 1

    # ADADELTA updates: per-parameter step sizes from decaying averages of
    # squared gradients (params_helper) and squared steps (params_helper2)
    rho = .95
    eps_big = 1e-6
    for param, param_helper, param_helper2, gparam in zip(params, params_helper, params_helper2, gparams):
        # update decaying sum of previous squared gradients
        updates.append((param_helper, rho * param_helper + (1. - rho) * (gparam ** 2)))
        # calculate step size: -RMS(previous steps) / RMS(gradients) * gradient
        dparam = - T.sqrt((param_helper2 + eps_big) / (rho * param_helper + (1. - rho) * (gparam ** 2) + eps_big)) * gparam
        # update decaying sum of previous squared step sizes
        updates.append((param_helper2, rho * param_helper2 + (1. - rho) * (dparam ** 2)))
        updates.append((param, param + dparam))
    #updates.append((iter_count, iter_count + 1))

    print '... compiling train'

    # `train_model` returns the cost and applies `updates` in the same call.
    # NOTE(review): is_train is fixed to 0 here as well, so the Dropout
    # layers are presumably inactive during training too — confirm intended.
    train_model = theano.function(inputs=[index], outputs=cost,
            updates=updates,
            givens={
                x: data_set_x[index * batch_size:((index + 1) * batch_size - 1)],
                y: data_set_x[(index * batch_size + 1):(index + 1) * batch_size],
                is_train: numpy.cast['int32'](0)})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 5000  # look at this many examples regardless
    #patience = train_set_x.get_value(borrow=True).shape[0] * n_epochs #no early stopping
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.99  # a relative improvement of this much is considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience / 2)

    #best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    #test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            print minibatch_avg_cost

            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                # compute absolute error loss on validation set (batch 28)
                validation_losses = [validate_model(i) for i in [28]]
                this_validation_loss = numpy.mean(validation_losses)  #mean over batches
                print('epoch %i, minibatch %i, validation error %f' %
                      (epoch, minibatch_index + 1, this_validation_loss))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set (batch 29)
                    #test_losses = [test_model(i) for i in [29]]
                    #test_score = numpy.mean(test_losses)
                    test_cost, test_pred = test_model(29)
                    #test_cost, test_costs_separate, test_pred_separate, test_actual_separate = test_model(29)
                    print((' epoch %i, minibatch %i, test error of '
                           'best model %f') %
                          (epoch, minibatch_index + 1, numpy.sum(test_cost)))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f'
           'obtained at iteration %i, with test performance %f') %
          (best_validation_loss, best_iter + 1, numpy.sum(test_cost)))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    #store data
    # NOTE(review): the wrapping list makes this loop run exactly once,
    # pickling one combined list — confirm this is the intended on-disk
    # format (vs. iterating `params + [test_cost] + [test_pred]`)
    f = file('results/params.save', 'wb')
    for obj in [params + [test_cost] + [test_pred]]:
        cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL)
    f.close()
# allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix( 'x' ) # the data is presented as a vector of inputs with many exchangeable examples of this vector y = T.imatrix( 'y' ) # the data is presented as a vector of inputs with many exchangeable examples of this vector ahead = T.matrix('ahead') sent = T.matrix('sentence') phonemes = T.imatrix('phonemes') rng = numpy.random.RandomState(1234) init_reg = LinearRegression(x, 1, 30, True) lstm_1 = LSTM(rng, init_reg.E_y_given_x, 30, lstm_1_hidden) lstm_2 = LSTM(rng, lstm_1.output, lstm_1_hidden, lstm_2_hidden) reg_input = lstm_2.output #need log_reg and cross covariate layers log_reg = LogisticRegression(reg_input, lstm_2_hidden, 41) #lin_reg = LinearRegression(reg_input,lstm_2_hidden,1,True) log_reg.reconstruct(log_reg.p_y_given_x) #lin_reg.reconstruct(lin_reg.E_y_given_x)
# two heads on the (flattened 15x15) conv features: a 41-way logistic
# regression and a 2-dim linear regression
log_reg = LogisticRegression(reg_input, 15 * 15 * layer3_filters, 41)
lin_reg = LinearRegressionRandom(reg_input, 15 * 15 * layer3_filters, 2, True)
log_input = log_reg.p_y_given_x
lin_input = lin_reg.E_y_given_x

# map each head's output back into feature space and concatenate the
# two reconstructions along the feature axis
log_reg.reconstruct(log_input)
lin_reg.reconstruct(lin_input)
reconstructed_regressions = T.concatenate(
    [log_reg.reconstructed_x, lin_reg.reconstructed_x], axis=1)

# fuse the concatenated (2x-wide) reconstructions back down to one
# feature map and reshape to the conv layer's 4-D layout
reverse_layer = LinearRegression(reconstructed_regressions,
                                 2 * 15 * 15 * layer3_filters,
                                 15 * 15 * layer3_filters, False)
reconstruct = reverse_layer.E_y_given_x.reshape(
    (minibatch_size, layer3_filters, 15, 15))

# run the convolutional stack in reverse (layer3 -> layer2 -> layer1),
# each call taking (input, input shape, filter shape)
layer3.reverseConv(reconstruct,
                   (minibatch_size, layer3_filters, 15, 15),
                   (layer2_filters, layer3_filters, 2, 2))
layer2.reverseConv(layer3.reverseOutput,
                   (minibatch_size, layer2_filters, 15, 15),
                   (layer1_filters, layer2_filters, 2, 2))
layer1.reverseConv(layer2.reverseOutput,
                   (minibatch_size, layer1_filters, 30, 30),
                   (layer0_filters, layer1_filters, 2, 2))